From a69d1d2eee6e7ba9648b58cb51305049db8786cd Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:17:52 +0000 Subject: [PATCH 01/24] Update salmon indexing module --- modules.json | 2 +- modules/nf-core/salmon/index/main.nf | 27 +++++++----- .../nf-core/salmon/index/tests/main.nf.test | 43 +++++++++++++++++-- .../salmon/index/tests/main.nf.test.snap | 31 +++++++++---- 4 files changed, 81 insertions(+), 22 deletions(-) diff --git a/modules.json b/modules.json index 1c8fbb6f6..320f2c245 100644 --- a/modules.json +++ b/modules.json @@ -181,7 +181,7 @@ }, "salmon/index": { "branch": "master", - "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358", + "git_sha": "25ddc0bb25292280923eed07e6351789a671e86a", "installed_by": ["fastq_subsample_fq_salmon"] }, "salmon/quant": { diff --git a/modules/nf-core/salmon/index/main.nf b/modules/nf-core/salmon/index/main.nf index 3d653c0d0..2e9c6224c 100644 --- a/modules/nf-core/salmon/index/main.nf +++ b/modules/nf-core/salmon/index/main.nf @@ -20,22 +20,29 @@ process SALMON_INDEX { script: def args = task.ext.args ?: '' - def get_decoy_ids = "grep '^>' $genome_fasta | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 > decoys.txt" - def gentrome = "gentrome.fa" - if (genome_fasta.endsWith('.gz')) { - get_decoy_ids = "grep '^>' <(gunzip -c $genome_fasta) | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 > decoys.txt" - gentrome = "gentrome.fa.gz" + def decoys = '' + def fasta = transcript_fasta + if (genome_fasta){ + if (genome_fasta.endsWith('.gz')) { + genome_fasta = "<(gunzip -c $genome_fasta)" + } + decoys='-d decoys.txt' + fasta='gentrome.fa' + } + if (transcript_fasta.endsWith('.gz')) { + transcript_fasta = "<(gunzip -c $transcript_fasta)" } """ - $get_decoy_ids - sed -i.bak -e 's/>//g' decoys.txt - cat $transcript_fasta $genome_fasta > $gentrome + if [ -n '$genome_fasta' ]; then + grep '^>' $genome_fasta | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 | sed 's/>//g' > decoys.txt + cat $transcript_fasta $genome_fasta > $fasta + fi 
salmon \\ index \\ --threads $task.cpus \\ - -t $gentrome \\ - -d decoys.txt \\ + -t $fasta \\ + $decoys \\ $args \\ -i salmon diff --git a/modules/nf-core/salmon/index/tests/main.nf.test b/modules/nf-core/salmon/index/tests/main.nf.test index 0caf30d3e..30b7359b9 100644 --- a/modules/nf-core/salmon/index/tests/main.nf.test +++ b/modules/nf-core/salmon/index/tests/main.nf.test @@ -3,6 +3,10 @@ nextflow_process { name "Test Process SALMON_INDEX" script "../main.nf" process "SALMON_INDEX" + tag "modules" + tag "modules_nfcore" + tag "salmon" + tag "salmon/index" test("sarscov2") { @@ -22,13 +26,43 @@ nextflow_process { assertAll( { assert process.success }, { assert path(process.out.index.get(0)).exists() }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot( + file(process.out.index[0]).listFiles().collect { it.getName() }.sort().toString(), + process.out.versions + ).match()} ) } } - test("sarscov2 stub") { + test("sarscov2 transcriptome only") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([]) + input[1] = Channel.of([file(params.modules_testdata_base_path + "genomics/sarscov2/genome/transcriptome.fasta", checkIfExists: true)]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.index.get(0)).exists() }, + { assert snapshot( + file(process.out.index[0]).listFiles().collect { it.getName() }.sort().toString(), + process.out.versions + ).match()} + ) + } + + } + + test("sarscov2 stub") { options "-stub" when { params { @@ -46,7 +80,10 @@ nextflow_process { assertAll( { assert process.success }, { assert path(process.out.index.get(0)).exists() }, - { assert snapshot(process.out.versions).match("versions stub") } + { assert snapshot( + file(process.out.index[0]).listFiles().collect { it.getName() }.sort().toString(), + process.out.versions + ).match()} ) } diff --git a/modules/nf-core/salmon/index/tests/main.nf.test.snap 
b/modules/nf-core/salmon/index/tests/main.nf.test.snap index e5899b511..f8ed44d7d 100644 --- a/modules/nf-core/salmon/index/tests/main.nf.test.snap +++ b/modules/nf-core/salmon/index/tests/main.nf.test.snap @@ -1,26 +1,41 @@ { - "versions": { + "sarscov2 stub": { "content": [ + "[complete_ref_lens.bin, ctable.bin, ctg_offsets.bin, duplicate_clusters.tsv, info.json, mphf.bin, pos.bin, pre_indexing.log, rank.bin, refAccumLengths.bin, ref_indexing.log, reflengths.bin, refseq.bin, seq.bin, versionInfo.json]", [ "versions.yml:md5,85337fa0a286ea35073ee5260974e307" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-10-18T10:00:47.087293189" + "timestamp": "2025-01-20T12:57:51.498323" }, - "versions stub": { + "sarscov2": { "content": [ + "[complete_ref_lens.bin, ctable.bin, ctg_offsets.bin, duplicate_clusters.tsv, info.json, mphf.bin, pos.bin, pre_indexing.log, rank.bin, refAccumLengths.bin, ref_indexing.log, reflengths.bin, refseq.bin, seq.bin, versionInfo.json]", [ "versions.yml:md5,85337fa0a286ea35073ee5260974e307" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-10-18T10:01:03.89824494" + "timestamp": "2025-01-20T12:57:33.474302" + }, + "sarscov2 transcriptome only": { + "content": [ + "[complete_ref_lens.bin, ctable.bin, ctg_offsets.bin, duplicate_clusters.tsv, info.json, mphf.bin, pos.bin, pre_indexing.log, rank.bin, refAccumLengths.bin, ref_indexing.log, reflengths.bin, refseq.bin, seq.bin, versionInfo.json]", + [ + "versions.yml:md5,85337fa0a286ea35073ee5260974e307" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2025-01-20T12:57:42.420247" } } \ No newline at end of file From c4a416d9bf39b6cebcd5b19bdabcc4be6972868f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:18:30 +0000 Subject: [PATCH 02/24] Make fasta optional for gtf filtering --- 
bin/filter_gtf.py | 15 ++++++++------- modules/local/gtf_filter/main.nf | 8 ++++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index b2215fde6..4393fdc9e 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -6,7 +6,7 @@ import argparse import re import statistics -from typing import Set +from typing import Optional, Set # Create a logger logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") @@ -27,14 +27,15 @@ def tab_delimited(file: str) -> float: return statistics.median(line.count("\t") for line in data.split("\n")) -def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: +def filter_gtf(fasta: Optional[str], gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.") - seq_names_in_genome = extract_fasta_seq_names(fasta) - logger.info(f"Extracted chromosome sequence names from {fasta}") - logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome))) + if (fasta is not None): + seq_names_in_genome = extract_fasta_seq_names(fasta) + logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome))) seq_names_in_gtf = set() try: @@ -44,7 +45,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i seq_name = line.split("\t")[0] seq_names_in_gtf.add(seq_name) # Add sequence name to the set - if seq_name in seq_names_in_genome: + if fasta is None or seq_name in seq_names_in_genome: if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line): out.write(line) line_count += 1 @@ -63,7 +64,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i if __name__ == "__main__": parser = 
argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") parser.add_argument("--gtf", type=str, required=True, help="GTF file") - parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file") + parser.add_argument("--fasta", type=str, required=False, help="Genome fasta file") parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files") parser.add_argument( "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file" diff --git a/modules/local/gtf_filter/main.nf b/modules/local/gtf_filter/main.nf index 60eb9a9bd..bba995475 100644 --- a/modules/local/gtf_filter/main.nf +++ b/modules/local/gtf_filter/main.nf @@ -18,11 +18,15 @@ process GTF_FILTER { task.ext.when == null || task.ext.when script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/ + fasta_text='' + if (fasta){ + fasta_text="--fasta $fasta" + } """ filter_gtf.py \\ --gtf $gtf \\ - --fasta $fasta \\ - --prefix ${fasta.baseName} + $fasta_text \\ + --prefix ${gtf.baseName} cat <<-END_VERSIONS > versions.yml "${task.process}": From 072323809ccecccf9aa6a6fb4eb3bd35583dbc46 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:19:27 +0000 Subject: [PATCH 03/24] Allow no fasta during param checks --- .../local/utils_nfcore_rnaseq_pipeline/main.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf index 5e8820ddc..03ced5d4a 100644 --- a/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf @@ -180,8 +180,14 @@ def validateInputParameters() { genomeExistsError() - if (!params.fasta) { - error("Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file.") + if ( + !params.fasta && + ( + ! 
params.skip_alignment || // Alignment needs fasta + ! params.transcript_fasta // Dynamically making a transcript fasta needs the fasta + ) + ) { + error("Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file. You must supply a genome FASTA file or use --skip_alignment and provide your own transcript fasta using --transcript_fasta for use in quantification.") } if (!params.gtf && !params.gff) { From 7c73f77b81f264a25285080597811aaae61267f7 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:19:50 +0000 Subject: [PATCH 04/24] Rework prepare_genome for optional fasta --- subworkflows/local/prepare_genome/main.nf | 453 ++++++++++++---------- 1 file changed, 247 insertions(+), 206 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index c1b8b4c66..38f7c2f9b 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -1,5 +1,5 @@ // -// Uncompress and prepare reference genome files +// Example: Grouping FASTA-dependent steps while preserving "index-provided" logic // include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip' @@ -36,16 +36,17 @@ include { GTF_FILTER } from '../../../modules/local/gt include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes' workflow PREPARE_GENOME { + take: - fasta // file: /path/to/genome.fasta - gtf // file: /path/to/genome.gtf - gff // file: /path/to/genome.gff - additional_fasta // file: /path/to/additional.fasta - transcript_fasta // file: /path/to/transcript.fasta - gene_bed // file: /path/to/gene.bed - splicesites // file: /path/to/splicesites.txt - bbsplit_fasta_list // file: /path/to/bbsplit_fasta_list.txt - sortmerna_fasta_list // file: /path/to/sortmerna_fasta_list.txt + fasta // file: /path/to/genome.fasta (optional!) 
+ gtf // file: /path/to/genome.gtf + gff // file: /path/to/genome.gff + additional_fasta // file: /path/to/additional.fasta + transcript_fasta // file: /path/to/transcript.fasta + gene_bed // file: /path/to/gene.bed + splicesites // file: /path/to/splicesites.txt + bbsplit_fasta_list // file: /path/to/bbsplit_fasta_list.txt + sortmerna_fasta_list // file: /path/to/sortmerna_fasta_list.txt star_index // directory: /path/to/star/index/ rsem_index // directory: /path/to/rsem/index/ salmon_index // directory: /path/to/salmon/index/ @@ -53,333 +54,373 @@ workflow PREPARE_GENOME { hisat2_index // directory: /path/to/hisat2/index/ bbsplit_index // directory: /path/to/bbsplit/index/ sortmerna_index // directory: /path/to/sortmerna/index/ - gencode // boolean: whether the genome is from GENCODE - featurecounts_group_type // string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts - aligner // string: Specifies the alignment algorithm to use - available options are 'star_salmon', 'star_rsem' and 'hisat2' - pseudo_aligner // string: Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner' - skip_gtf_filter // boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs - skip_bbsplit // boolean: Skip BBSplit for removal of non-reference genome reads - skip_sortmerna // boolean: Skip sortmerna for removal of reads mapping to sequences in sortmerna_fasta_list - skip_alignment // boolean: Skip all of the alignment-based processes within the pipeline - skip_pseudo_alignment // boolean: Skip all of the pseudoalignment-based processes within the pipeline + gencode // boolean + featurecounts_group_type // string + aligner // string: 'star_salmon', 'star_rsem', 'hisat2' + pseudo_aligner // string (e.g. 
'salmon') + skip_gtf_filter // boolean + skip_bbsplit // boolean + skip_sortmerna // boolean + skip_alignment // boolean + skip_pseudo_alignment // boolean main: + // Versions collector ch_versions = Channel.empty() - // - // Uncompress genome fasta file if required - // - if (fasta.endsWith('.gz')) { - ch_fasta = GUNZIP_FASTA ( [ [:], file(fasta, checkIfExists: true) ] ).gunzip.map { it[1] } - ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) - } else { - ch_fasta = Channel.value(file(fasta, checkIfExists: true)) + //--------------------------- + // 1) Uncompress GTF or GFF -> GTF + //--------------------------- + ch_gtf = Channel.empty() + if (gtf) { + if (gtf.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ([ [:], file(gtf) ]).gunzip.map { it[1] }.first() + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + } else { + ch_gtf = Channel.value(file(gtf)) + } + } else if (gff) { + def ch_gff + if (gff.endsWith('.gz')) { + ch_gff = GUNZIP_GFF ([ [:], file(gff) ]).gunzip.first() + ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) + } else { + ch_gff = Channel.value(file(gff)).map { [ [:], it ] } + } + ch_gtf = GFFREAD(ch_gff, []).gtf.map { it[1] }.first() + ch_versions = ch_versions.mix(GFFREAD.out.versions) } - // - // Uncompress GTF annotation file or create from GFF3 if required - // - if (gtf || gff) { - if (gtf) { - if (gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ( [ [:], file(gtf, checkIfExists: true) ] ).gunzip.map { it[1] } - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - } else { - ch_gtf = Channel.value(file(gtf, checkIfExists: true)) - } - } else if (gff) { - if (gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF ( [ [:], file(gff, checkIfExists: true) ] ).gunzip - ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) - } else { - ch_gff = Channel.value(file(gff, checkIfExists: true)).map { [ [:], it ] } - } - ch_gtf = GFFREAD ( ch_gff, [] ).gtf.map { it[1] } - ch_versions = ch_versions.mix(GFFREAD.out.versions) - } + 
//------------------------------------- + // 2) Check if we actually have a FASTA + //------------------------------------- + def fasta_provided = (fasta ? true : false) - // Determine whether to filter the GTF or not - def filter_gtf = - (( - // Condition 1: Alignment is required and aligner is set - !skip_alignment && aligner - ) || - ( - // Condition 2: Pseudoalignment is required and pseudoaligner is set - !skip_pseudo_alignment && pseudo_aligner - ) || - ( - // Condition 3: Transcript FASTA file is not provided - !transcript_fasta - )) && - ( - // Condition 4: --skip_gtf_filter is not provided - !skip_gtf_filter - ) - if (filter_gtf) { - GTF_FILTER ( ch_fasta, ch_gtf ) - ch_gtf = GTF_FILTER.out.genome_gtf - ch_versions = ch_versions.mix(GTF_FILTER.out.versions) + ch_fasta = Channel.of([]) + if (fasta_provided) { + // Uncompress FASTA if needed + if (fasta.endsWith('.gz')) { + ch_fasta = GUNZIP_FASTA ([ [:], file(fasta) ]).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) + } else { + ch_fasta = Channel.value(file(fasta)) } } - // - // Uncompress additional fasta file and concatenate with reference fasta and gtf files - // - def biotype = gencode ? 
"gene_type" : featurecounts_group_type - if (additional_fasta) { + //---------------------------------------- + // 3) Filter GTF if needed & FASTA present + //---------------------------------------- + def filter_gtf_needed = ( + (!skip_alignment && aligner) || + (!skip_pseudo_alignment && pseudo_aligner) || + (!transcript_fasta) + ) && !skip_gtf_filter + + if (filter_gtf_needed) { + GTF_FILTER(ch_fasta, ch_gtf) + ch_gtf = GTF_FILTER.out.genome_gtf.first() + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) + } + + //--------------------------------------------------- + // 4) Concatenate additional FASTA (if both are given) + //--------------------------------------------------- + ch_add_fasta = Channel.empty() + if (fasta_provided && additional_fasta) { if (additional_fasta.endsWith('.gz')) { - ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], file(additional_fasta, checkIfExists: true) ] ).gunzip.map { it[1] } + ch_add_fasta = GUNZIP_ADDITIONAL_FASTA([ [:], file(additional_fasta) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions) } else { - ch_add_fasta = Channel.value(file(additional_fasta, checkIfExists: true)) + ch_add_fasta = Channel.value(file(additional_fasta)) } - CUSTOM_CATADDITIONALFASTA ( - ch_fasta.combine(ch_gtf).map { fasta, gtf -> [ [:], fasta, gtf ] }, + CUSTOM_CATADDITIONALFASTA( + ch_fasta.combine(ch_gtf).map { f, g -> [ [:], f, g ] }, ch_add_fasta.map { [ [:], it ] }, - biotype + gencode ? 
"gene_type" : featurecounts_group_type ) ch_fasta = CUSTOM_CATADDITIONALFASTA.out.fasta.map { it[1] }.first() ch_gtf = CUSTOM_CATADDITIONALFASTA.out.gtf.map { it[1] }.first() ch_versions = ch_versions.mix(CUSTOM_CATADDITIONALFASTA.out.versions) } - // - // Uncompress gene BED annotation file or create from GTF if required - // + //------------------------------------------------------ + // 5) Uncompress gene BED or create from GTF if not given + //------------------------------------------------------ + ch_gene_bed = Channel.empty() if (gene_bed) { if (gene_bed.endsWith('.gz')) { - ch_gene_bed = GUNZIP_GENE_BED ( [ [:], file(gene_bed, checkIfExists: true) ] ).gunzip.map { it[1] } + ch_gene_bed = GUNZIP_GENE_BED ([ [:], file(gene_bed) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) } else { - ch_gene_bed = Channel.value(file(gene_bed, checkIfExists: true)) + ch_gene_bed = Channel.value(file(gene_bed)) } } else { - ch_gene_bed = GTF2BED ( ch_gtf ).bed + ch_gene_bed = GTF2BED(ch_gtf).bed ch_versions = ch_versions.mix(GTF2BED.out.versions) } - // - // Uncompress transcript fasta file / create if required - // + //---------------------------------------------------------------------- + // 6) Transcript FASTA: + // - If provided, decompress (optionally preprocess if GENCODE) + // - If not provided but have genome+GTF, create from them + //---------------------------------------------------------------------- + ch_transcript_fasta = Channel.empty() if (transcript_fasta) { + // Use user-provided transcript FASTA if (transcript_fasta.endsWith('.gz')) { - ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], file(transcript_fasta, checkIfExists: true) ] ).gunzip.map { it[1] } + ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ([ [:], file(transcript_fasta) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions) } else { - ch_transcript_fasta = Channel.value(file(transcript_fasta, checkIfExists: true)) + 
ch_transcript_fasta = Channel.value(file(transcript_fasta)) } if (gencode) { - PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta ) + PREPROCESS_TRANSCRIPTS_FASTA_GENCODE(ch_transcript_fasta) ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } - } else { - ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta + } else if (fasta_provided) { + // Build transcripts from genome if we have it + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA(ch_fasta, ch_gtf).transcript_fasta ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } - // - // Create chromosome sizes file - // - CUSTOM_GETCHROMSIZES ( ch_fasta.map { [ [:], it ] } ) - ch_fai = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] } - ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } - ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions) + //------------------------------------------------------- + // 7) FAI / chrom.sizes only if we actually have a genome + //------------------------------------------------------- + ch_fai = Channel.empty() + ch_chrom_sizes = Channel.empty() + if (fasta_provided) { + CUSTOM_GETCHROMSIZES(ch_fasta.map { [ [:], it ] }) + ch_fai = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] } + ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } + ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions) + } - // - // Get list of indices that need to be created - // + //------------------------------------------------ + // 8) Determine which indices we actually want built + //------------------------------------------------ def prepare_tool_indices = [] - if (!skip_bbsplit) { prepare_tool_indices << 'bbsplit' } - if (!skip_sortmerna) { prepare_tool_indices << 'sortmerna' } - if (!skip_alignment) { prepare_tool_indices << aligner } - if (!skip_pseudo_alignment && pseudo_aligner) { prepare_tool_indices << pseudo_aligner } + if 
(!skip_bbsplit) { prepare_tool_indices << 'bbsplit' } + if (!skip_sortmerna) { prepare_tool_indices << 'sortmerna' } + if (!skip_alignment && aligner) { prepare_tool_indices << aligner } + if (!skip_pseudo_alignment && pseudo_aligner) { prepare_tool_indices << pseudo_aligner } - // - // Uncompress BBSplit index or generate from scratch if required - // + //--------------------------------------------------------- + // 9) BBSplit index: uses FASTA only if we generate from scratch + //--------------------------------------------------------- ch_bbsplit_index = Channel.empty() if ('bbsplit' in prepare_tool_indices) { if (bbsplit_index) { + // Use user-provided bbsplit index if (bbsplit_index.endsWith('.tar.gz')) { - ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ( [ [:], bbsplit_index ] ).untar.map { it[1] } + ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ([ [:], file(bbsplit_index) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_BBSPLIT_INDEX.out.versions) } else { ch_bbsplit_index = Channel.value(file(bbsplit_index)) } - } else { + } + else if (fasta_provided) { + // Build it from scratch if we have FASTA Channel .from(file(bbsplit_fasta_list)) - .splitCsv() // Read in 2 column csv file: short_name,path_to_fasta - .flatMap { id, fasta -> [ [ 'id', id ], [ 'fasta', file(fasta, checkIfExists: true) ] ] } // Flatten entries to be able to groupTuple by a common key + .splitCsv() + .flatMap { id, fafile -> [ [ 'id', id ], [ 'fasta', file(fafile) ] ] } .groupTuple() - .map { it -> it[1] } // Get rid of keys and keep grouped values - .collect { [ it ] } // Collect entries as a list to pass as "tuple val(short_names), path(path_to_fasta)" to module + .map { it -> it[1] } + .collect { [ it ] } .set { ch_bbsplit_fasta_list } - ch_bbsplit_index = BBMAP_BBSPLIT ( [ [:], [] ], [], ch_fasta, ch_bbsplit_fasta_list, true ).index - ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions) + ch_bbsplit_index = BBMAP_BBSPLIT( + [ [:], [] ], + [], + ch_fasta, + ch_bbsplit_fasta_list, + 
true + ).index + ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions) } + // else: no FASTA and no user-provided index -> remains empty } - // - // Uncompress sortmerna index or generate from scratch if required - // + //------------------------------------------------------------- + // 10) SortMeRNA index does not require the genome FASTA at all + //------------------------------------------------------------- ch_sortmerna_index = Channel.empty() - ch_rrna_fastas = Channel.empty() - + ch_rrna_fastas = Channel.empty() if ('sortmerna' in prepare_tool_indices) { - ribo_db = file(sortmerna_fasta_list) - - // SortMeRNA needs the rRNAs even if we're providing the index + // We always need the rRNA FASTAs + def ribo_db = file(sortmerna_fasta_list) ch_rrna_fastas = Channel.from(ribo_db.readLines()) - .map { row -> file(row, checkIfExists: true) } + .map { row -> file(row) } if (sortmerna_index) { if (sortmerna_index.endsWith('.tar.gz')) { - ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ( [ [:], sortmerna_index ] ).untar.map { it[1] } - ch_versions = ch_versions.mix(UNTAR_SORTMERNA_INDEX.out.versions) + ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ([ [:], file(sortmerna_index) ]).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_SORTMERNA_INDEX.out.versions) } else { - ch_sortmerna_index = Channel.value([[:], file(sortmerna_index)]) + ch_sortmerna_index = Channel.value([ [:], file(sortmerna_index) ]) } } else { - - SORTMERNA_INDEX ( - Channel.of([ [],[] ]), + // Build new SortMeRNA index from the rRNA references + SORTMERNA_INDEX( + Channel.of([ [], [] ]), ch_rrna_fastas.collect().map { [ 'rrna_refs', it ] }, - Channel.of([ [],[] ]) + Channel.of([ [], [] ]) ) ch_sortmerna_index = SORTMERNA_INDEX.out.index.first() - ch_versions = ch_versions.mix(SORTMERNA_INDEX.out.versions) + ch_versions = ch_versions.mix(SORTMERNA_INDEX.out.versions) } } - // - // Uncompress STAR index or generate from scratch if required - // + 
//---------------------------------------------------- + // 11) STAR index (e.g. for 'star_salmon') -> needs FASTA if built + //---------------------------------------------------- ch_star_index = Channel.empty() if ('star_salmon' in prepare_tool_indices) { if (star_index) { if (star_index.endsWith('.tar.gz')) { - ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] } + ch_star_index = UNTAR_STAR_INDEX ([ [:], file(star_index) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) } else { ch_star_index = Channel.value(file(star_index)) } - } else { - // Check if an AWS iGenome has been provided to use the appropriate version of STAR + } + else if (fasta_provided) { + // Build new STAR index + // Possibly check AWS iGenome conditions def is_aws_igenome = false - if (fasta && gtf) { - if ((file(fasta).getName() - '.gz' == 'genome.fa') && (file(gtf).getName() - '.gz' == 'genes.gtf')) { - is_aws_igenome = true - } + if (file(fasta).getName() - '.gz' == 'genome.fa' && file(gtf).getName() - '.gz' == 'genes.gtf') { + is_aws_igenome = true } if (is_aws_igenome) { - ch_star_index = STAR_GENOMEGENERATE_IGENOMES ( ch_fasta, ch_gtf ).index + ch_star_index = STAR_GENOMEGENERATE_IGENOMES(ch_fasta, ch_gtf).index ch_versions = ch_versions.mix(STAR_GENOMEGENERATE_IGENOMES.out.versions) } else { - ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] } ).index.map { it[1] } + ch_star_index = STAR_GENOMEGENERATE( + ch_fasta.map { [ [:], it ] }, + ch_gtf.map { [ [:], it ] } + ).index.map { it[1] } ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) } } } - // - // Uncompress RSEM index or generate from scratch if required - // + //------------------------------------------------ + // 12) RSEM index -> needs FASTA & GTF if built + //------------------------------------------------ ch_rsem_index = Channel.empty() if ('star_rsem' in prepare_tool_indices) { if (rsem_index) { if 
(rsem_index.endsWith('.tar.gz')) { - ch_rsem_index = UNTAR_RSEM_INDEX ( [ [:], rsem_index ] ).untar.map { it[1] } + ch_rsem_index = UNTAR_RSEM_INDEX ([ [:], file(rsem_index) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_RSEM_INDEX.out.versions) } else { ch_rsem_index = Channel.value(file(rsem_index)) } - } else { - ch_rsem_index = RSEM_PREPAREREFERENCE_GENOME ( ch_fasta, ch_gtf ).index + } + else if (fasta_provided) { + ch_rsem_index = RSEM_PREPAREREFERENCE_GENOME(ch_fasta, ch_gtf).index ch_versions = ch_versions.mix(RSEM_PREPAREREFERENCE_GENOME.out.versions) } } - // - // Uncompress HISAT2 index or generate from scratch if required - // + //--------------------------------------------------------- + // 13) HISAT2 index -> needs FASTA & GTF if built + //--------------------------------------------------------- ch_splicesites = Channel.empty() ch_hisat2_index = Channel.empty() if ('hisat2' in prepare_tool_indices) { - if (!splicesites) { - ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf.map { [ [:], it ] } ).txt.map { it[1] } - ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) - } else { + // splicesites + if (splicesites) { ch_splicesites = Channel.value(file(splicesites)) } + else if (fasta_provided) { + ch_splicesites = HISAT2_EXTRACTSPLICESITES(ch_gtf.map { [ [:], it ] }).txt.map { it[1] } + ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) + } + // the index if (hisat2_index) { if (hisat2_index.endsWith('.tar.gz')) { - ch_hisat2_index = UNTAR_HISAT2_INDEX ( [ [:], hisat2_index ] ).untar.map { it[1] } + ch_hisat2_index = UNTAR_HISAT2_INDEX ([ [:], file(hisat2_index) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_HISAT2_INDEX.out.versions) } else { ch_hisat2_index = Channel.value(file(hisat2_index)) } - } else { - ch_hisat2_index = HISAT2_BUILD ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] }, ch_splicesites.map { [ [:], it ] } ).index.map { it[1] } - ch_versions = 
ch_versions.mix(HISAT2_BUILD.out.versions) + } + else if (fasta_provided) { + ch_hisat2_index = HISAT2_BUILD( + ch_fasta.map { [ [:], it ] }, + ch_gtf.map { [ [:], it ] }, + ch_splicesites.map { [ [:], it ] } + ).index.map { it[1] } + ch_versions = ch_versions.mix(HISAT2_BUILD.out.versions) } } - // - // Uncompress Salmon index or generate from scratch if required - // + //------------------------------------------------------ + // 14) Salmon index -> can skip genome if transcript_fasta is enough + //------------------------------------------------------ ch_salmon_index = Channel.empty() - if (salmon_index) { - if (salmon_index.endsWith('.tar.gz')) { - ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], salmon_index ] ).untar.map { it[1] } - ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) - } else { - ch_salmon_index = Channel.value(file(salmon_index)) + if ('salmon' in prepare_tool_indices) { + if (salmon_index) { + // use user-provided salmon index + if (salmon_index.endsWith('.tar.gz')) { + ch_salmon_index = UNTAR_SALMON_INDEX ([ [:], file(salmon_index) ]).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) + } else { + ch_salmon_index = Channel.value(file(salmon_index)) + } } - } else { - if ('salmon' in prepare_tool_indices) { - ch_salmon_index = SALMON_INDEX ( ch_fasta, ch_transcript_fasta ).index + else if (ch_transcript_fasta && fasta_provided) { + // build from transcript FASTA + genome FASTA + ch_salmon_index = SALMON_INDEX(ch_fasta, ch_transcript_fasta).index + ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) + } + else if (ch_transcript_fasta) { + // some Salmon module can run with just a transcript FASTA + ch_salmon_index = SALMON_INDEX([], ch_transcript_fasta).index ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) } } - // - // Uncompress Kallisto index or generate from scratch if required - // + //-------------------------------------------------- + // 15) Kallisto index -> only needs 
transcript FASTA + //-------------------------------------------------- ch_kallisto_index = Channel.empty() - if (kallisto_index) { - if (kallisto_index.endsWith('.tar.gz')) { - ch_kallisto_index = UNTAR_KALLISTO_INDEX ( [ [:], kallisto_index ] ).untar - ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) - } else { - ch_kallisto_index = Channel.value([[:], file(kallisto_index)]) + if ('kallisto' in prepare_tool_indices) { + if (kallisto_index) { + if (kallisto_index.endsWith('.tar.gz')) { + ch_kallisto_index = UNTAR_KALLISTO_INDEX ([ [:], file(kallisto_index) ]).untar + ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) + } else { + ch_kallisto_index = Channel.value([ [:], file(kallisto_index) ]) + } } - } else { - if ('kallisto' in prepare_tool_indices) { - ch_kallisto_index = KALLISTO_INDEX ( ch_transcript_fasta.map { [ [:], it] } ).index - ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) + else if (ch_transcript_fasta) { + ch_kallisto_index = KALLISTO_INDEX(ch_transcript_fasta.map { [ [:], it ] }).index + ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) } } + //------------------ + // 16) Emit channels + //------------------ emit: - fasta = ch_fasta // channel: path(genome.fasta) - gtf = ch_gtf // channel: path(genome.gtf) - fai = ch_fai // channel: path(genome.fai) - gene_bed = ch_gene_bed // channel: path(gene.bed) - transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) - chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) - splicesites = ch_splicesites // channel: path(genome.splicesites.txt) - bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) - rrna_fastas = ch_rrna_fastas // channel: path(sortmerna_fasta_list) - sortmerna_index = ch_sortmerna_index // channel: path(sortmerna/index/) - star_index = ch_star_index // channel: path(star/index/) - rsem_index = ch_rsem_index // channel: path(rsem/index/) - hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) - 
salmon_index = ch_salmon_index // channel: path(salmon/index/) - kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + fasta = ch_fasta + gtf = ch_gtf + fai = ch_fai + gene_bed = ch_gene_bed + transcript_fasta = ch_transcript_fasta + chrom_sizes = ch_chrom_sizes + splicesites = ch_splicesites + bbsplit_index = ch_bbsplit_index + rrna_fastas = ch_rrna_fastas + sortmerna_index = ch_sortmerna_index + star_index = ch_star_index + rsem_index = ch_rsem_index + hisat2_index = ch_hisat2_index + salmon_index = ch_salmon_index + kallisto_index = ch_kallisto_index + versions = ch_versions.ifEmpty(null) } From 53a1638837061667c152d2c52fa8bcc1e9ca69d7 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:20:19 +0000 Subject: [PATCH 05/24] Fix bbsplit param usage for optional fasta --- workflows/rnaseq/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/main.nf b/workflows/rnaseq/main.nf index 27c7e5639..6e9ed6abe 100755 --- a/workflows/rnaseq/main.nf +++ b/workflows/rnaseq/main.nf @@ -141,7 +141,7 @@ workflow RNASEQ { ch_sortmerna_index, ch_bbsplit_index, ch_ribo_db, - params.skip_bbsplit, + params.skip_bbsplit || ! 
params.fasta, params.skip_fastqc || params.skip_qc, params.skip_trimming, params.skip_umi_extract, From 4bb0af18372f2009df260f53371d4c007de41aae Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:20:35 +0000 Subject: [PATCH 06/24] Add test for no fasta --- tests/nofasta.nf.test | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tests/nofasta.nf.test diff --git a/tests/nofasta.nf.test b/tests/nofasta.nf.test new file mode 100644 index 000000000..4413d4c41 --- /dev/null +++ b/tests/nofasta.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test pipeline by skipping trimming options" + script "../main.nf" + + test("Params: no fasta") { + + when { + params { + outdir = "$outputDir" + skip_alignment = true + fasta = null + additional_fasta = null + salmon_index = null + transcript_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/d1f59361a013a8820c824d606f5853db0d6c7999/reference/transcriptome_match_gtf.fa" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we tests pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_rnaseq_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} From 64a45474f140b7a6862864bc72597afbcf37dd85 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: 
Tue, 21 Jan 2025 18:28:13 +0000 Subject: [PATCH 07/24] lint fix --- subworkflows/local/prepare_genome/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 38f7c2f9b..ce2ce7097 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -74,7 +74,7 @@ workflow PREPARE_GENOME { ch_gtf = Channel.empty() if (gtf) { if (gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ([ [:], file(gtf) ]).gunzip.map { it[1] }.first() + ch_gtf = GUNZIP_GTF ([ [:], file(gtf) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } else { ch_gtf = Channel.value(file(gtf)) @@ -82,12 +82,12 @@ workflow PREPARE_GENOME { } else if (gff) { def ch_gff if (gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF ([ [:], file(gff) ]).gunzip.first() + ch_gff = GUNZIP_GFF ([ [:], file(gff) ]).gunzip ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) } else { ch_gff = Channel.value(file(gff)).map { [ [:], it ] } } - ch_gtf = GFFREAD(ch_gff, []).gtf.map { it[1] }.first() + ch_gtf = GFFREAD(ch_gff, []).gtf.map { it[1] } ch_versions = ch_versions.mix(GFFREAD.out.versions) } @@ -252,7 +252,7 @@ workflow PREPARE_GENOME { // We always need the rRNA FASTAs def ribo_db = file(sortmerna_fasta_list) ch_rrna_fastas = Channel.from(ribo_db.readLines()) - .map { row -> file(row) } + .map { row -> file(row) } if (sortmerna_index) { if (sortmerna_index.endsWith('.tar.gz')) { From d6ef689d481c9784410bb415619449ef00adb2c5 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:28:23 +0000 Subject: [PATCH 08/24] Add snap for new test --- tests/nofasta.nf.test | 2 +- tests/nofasta.nf.test.snap | 429 +++++++++++++++++++++++++++++++++++++ 2 files changed, 430 insertions(+), 1 deletion(-) create mode 100644 tests/nofasta.nf.test.snap diff --git a/tests/nofasta.nf.test b/tests/nofasta.nf.test index 4413d4c41..41c5d9e94 100644 --- 
a/tests/nofasta.nf.test +++ b/tests/nofasta.nf.test @@ -1,6 +1,6 @@ nextflow_pipeline { - name "Test pipeline by skipping trimming options" + name "Test pipeline by omitting fasta input" script "../main.nf" test("Params: no fasta") { diff --git a/tests/nofasta.nf.test.snap b/tests/nofasta.nf.test.snap new file mode 100644 index 000000000..f3593cdb6 --- /dev/null +++ b/tests/nofasta.nf.test.snap @@ -0,0 +1,429 @@ +{ + "Params: no fasta": { + "content": [ + 41, + { + "CAT_FASTQ": { + "cat": 9.5 + }, + "CUSTOM_TX2GENE": { + "python": "3.10.4" + }, + "DESEQ2_QC_PSEUDO": { + "r-base": "4.0.3", + "bioconductor-deseq2": "1.28.0" + }, + "FASTQC": { + "fastqc": "0.12.1" + }, + "FQ_LINT": { + "fq": "0.12.0 (2024-07-08)" + }, + "FQ_SUBSAMPLE": { + "fq": "0.12.0 (2024-07-08)" + }, + "GTF2BED": { + "perl": "5.26.2" + }, + "GTF_FILTER": { + "python": "3.9.5" + }, + "GUNZIP_GTF": { + "gunzip": 1.1 + }, + "SALMON_INDEX": { + "salmon": "1.10.3" + }, + "SALMON_QUANT": { + "salmon": "1.10.3" + }, + "SE_GENE": { + "bioconductor-summarizedexperiment": "1.32.0" + }, + "TRIMGALORE": { + "trimgalore": "0.6.10", + "cutadapt": 4.9, + "pigz": 2.8 + }, + "TXIMETA_TXIMPORT": { + "bioconductor-tximeta": "1.20.1" + }, + "Workflow": { + "nf-core/rnaseq": "v3.19.0dev" + } + }, + [ + "fastqc", + "fastqc/raw", + "fastqc/raw/RAP1_IAA_30M_REP1_raw_1_fastqc.html", + "fastqc/raw/RAP1_IAA_30M_REP1_raw_1_fastqc.zip", + "fastqc/raw/RAP1_IAA_30M_REP1_raw_2_fastqc.html", + "fastqc/raw/RAP1_IAA_30M_REP1_raw_2_fastqc.zip", + "fastqc/raw/RAP1_UNINDUCED_REP1_raw_fastqc.html", + "fastqc/raw/RAP1_UNINDUCED_REP1_raw_fastqc.zip", + "fastqc/raw/RAP1_UNINDUCED_REP2_raw_fastqc.html", + "fastqc/raw/RAP1_UNINDUCED_REP2_raw_fastqc.zip", + "fastqc/raw/WT_REP1_raw_1_fastqc.html", + "fastqc/raw/WT_REP1_raw_1_fastqc.zip", + "fastqc/raw/WT_REP1_raw_2_fastqc.html", + "fastqc/raw/WT_REP1_raw_2_fastqc.zip", + "fastqc/raw/WT_REP2_raw_1_fastqc.html", + "fastqc/raw/WT_REP2_raw_1_fastqc.zip", + "fastqc/raw/WT_REP2_raw_2_fastqc.html", 
+ "fastqc/raw/WT_REP2_raw_2_fastqc.zip", + "fastqc/trim", + "fastqc/trim/RAP1_IAA_30M_REP1_trimmed_1_val_1_fastqc.html", + "fastqc/trim/RAP1_IAA_30M_REP1_trimmed_1_val_1_fastqc.zip", + "fastqc/trim/RAP1_IAA_30M_REP1_trimmed_2_val_2_fastqc.html", + "fastqc/trim/RAP1_IAA_30M_REP1_trimmed_2_val_2_fastqc.zip", + "fastqc/trim/RAP1_UNINDUCED_REP1_trimmed_trimmed_fastqc.html", + "fastqc/trim/RAP1_UNINDUCED_REP1_trimmed_trimmed_fastqc.zip", + "fastqc/trim/RAP1_UNINDUCED_REP2_trimmed_trimmed_fastqc.html", + "fastqc/trim/RAP1_UNINDUCED_REP2_trimmed_trimmed_fastqc.zip", + "fastqc/trim/WT_REP1_trimmed_1_val_1_fastqc.html", + "fastqc/trim/WT_REP1_trimmed_1_val_1_fastqc.zip", + "fastqc/trim/WT_REP1_trimmed_2_val_2_fastqc.html", + "fastqc/trim/WT_REP1_trimmed_2_val_2_fastqc.zip", + "fastqc/trim/WT_REP2_trimmed_1_val_1_fastqc.html", + "fastqc/trim/WT_REP2_trimmed_1_val_1_fastqc.zip", + "fastqc/trim/WT_REP2_trimmed_2_val_2_fastqc.html", + "fastqc/trim/WT_REP2_trimmed_2_val_2_fastqc.zip", + "fq_lint", + "fq_lint/raw", + "fq_lint/raw/RAP1_IAA_30M_REP1.fq_lint.txt", + "fq_lint/raw/RAP1_UNINDUCED_REP1.fq_lint.txt", + "fq_lint/raw/RAP1_UNINDUCED_REP2.fq_lint.txt", + "fq_lint/raw/WT_REP1.fq_lint.txt", + "fq_lint/raw/WT_REP2.fq_lint.txt", + "fq_lint/trimmed", + "fq_lint/trimmed/RAP1_IAA_30M_REP1.fq_lint.txt", + "fq_lint/trimmed/RAP1_UNINDUCED_REP1.fq_lint.txt", + "fq_lint/trimmed/RAP1_UNINDUCED_REP2.fq_lint.txt", + "fq_lint/trimmed/WT_REP1.fq_lint.txt", + "fq_lint/trimmed/WT_REP2.fq_lint.txt", + "multiqc", + "multiqc/multiqc_report.html", + "multiqc/multiqc_report_data", + "multiqc/multiqc_report_data/cutadapt_filtered_reads_plot.txt", + "multiqc/multiqc_report_data/cutadapt_trimmed_sequences_plot_3_Counts.txt", + "multiqc/multiqc_report_data/cutadapt_trimmed_sequences_plot_3_Obs_Exp.txt", + "multiqc/multiqc_report_data/fastqc_raw-status-check-heatmap.txt", + "multiqc/multiqc_report_data/fastqc_raw_adapter_content_plot.txt", + 
"multiqc/multiqc_report_data/fastqc_raw_overrepresented_sequences_plot.txt", + "multiqc/multiqc_report_data/fastqc_raw_per_base_n_content_plot.txt", + "multiqc/multiqc_report_data/fastqc_raw_per_base_sequence_quality_plot.txt", + "multiqc/multiqc_report_data/fastqc_raw_per_sequence_gc_content_plot_Counts.txt", + "multiqc/multiqc_report_data/fastqc_raw_per_sequence_gc_content_plot_Percentages.txt", + "multiqc/multiqc_report_data/fastqc_raw_per_sequence_quality_scores_plot.txt", + "multiqc/multiqc_report_data/fastqc_raw_sequence_counts_plot.txt", + "multiqc/multiqc_report_data/fastqc_raw_sequence_duplication_levels_plot.txt", + "multiqc/multiqc_report_data/fastqc_raw_top_overrepresented_sequences_table.txt", + "multiqc/multiqc_report_data/fastqc_sequence_length_distribution_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed-status-check-heatmap.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_overrepresented_sequences_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_per_base_n_content_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_per_base_sequence_quality_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_per_sequence_gc_content_plot_Counts.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_per_sequence_gc_content_plot_Percentages.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_per_sequence_quality_scores_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_sequence_counts_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_sequence_duplication_levels_plot.txt", + "multiqc/multiqc_report_data/fastqc_trimmed_top_overrepresented_sequences_table.txt", + "multiqc/multiqc_report_data/multiqc.log", + "multiqc/multiqc_report_data/multiqc_citations.txt", + "multiqc/multiqc_report_data/multiqc_cutadapt.txt", + "multiqc/multiqc_report_data/multiqc_data.json", + "multiqc/multiqc_report_data/multiqc_fastqc_fastqc_raw.txt", + "multiqc/multiqc_report_data/multiqc_fastqc_fastqc_trimmed.txt", + 
"multiqc/multiqc_report_data/multiqc_general_stats.txt", + "multiqc/multiqc_report_data/multiqc_salmon.txt", + "multiqc/multiqc_report_data/multiqc_sample-relationships.txt", + "multiqc/multiqc_report_data/multiqc_sample-relationships_1.txt", + "multiqc/multiqc_report_data/multiqc_software_versions.txt", + "multiqc/multiqc_report_data/multiqc_sources.txt", + "multiqc/multiqc_report_data/salmon_plot.txt", + "multiqc/multiqc_report_plots", + "multiqc/multiqc_report_plots/pdf", + "multiqc/multiqc_report_plots/pdf/cutadapt_filtered_reads_plot-cnt.pdf", + "multiqc/multiqc_report_plots/pdf/cutadapt_filtered_reads_plot-pct.pdf", + "multiqc/multiqc_report_plots/pdf/cutadapt_trimmed_sequences_plot_3_Counts.pdf", + "multiqc/multiqc_report_plots/pdf/cutadapt_trimmed_sequences_plot_3_Obs_Exp.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw-status-check-heatmap.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_adapter_content_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_overrepresented_sequences_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_per_base_n_content_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_per_base_sequence_quality_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_per_sequence_gc_content_plot_Counts.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_per_sequence_gc_content_plot_Percentages.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_per_sequence_quality_scores_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_sequence_counts_plot-cnt.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_sequence_counts_plot-pct.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_sequence_duplication_levels_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_raw_top_overrepresented_sequences_table.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_sequence_length_distribution_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed-status-check-heatmap.pdf", + 
"multiqc/multiqc_report_plots/pdf/fastqc_trimmed_overrepresented_sequences_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_per_base_n_content_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_per_base_sequence_quality_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_per_sequence_gc_content_plot_Counts.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_per_sequence_gc_content_plot_Percentages.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_per_sequence_quality_scores_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_sequence_counts_plot-cnt.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_sequence_counts_plot-pct.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_sequence_duplication_levels_plot.pdf", + "multiqc/multiqc_report_plots/pdf/fastqc_trimmed_top_overrepresented_sequences_table.pdf", + "multiqc/multiqc_report_plots/pdf/general_stats_table.pdf", + "multiqc/multiqc_report_plots/pdf/salmon_plot.pdf", + "multiqc/multiqc_report_plots/pdf/sample-relationships.pdf", + "multiqc/multiqc_report_plots/png", + "multiqc/multiqc_report_plots/png/cutadapt_filtered_reads_plot-cnt.png", + "multiqc/multiqc_report_plots/png/cutadapt_filtered_reads_plot-pct.png", + "multiqc/multiqc_report_plots/png/cutadapt_trimmed_sequences_plot_3_Counts.png", + "multiqc/multiqc_report_plots/png/cutadapt_trimmed_sequences_plot_3_Obs_Exp.png", + "multiqc/multiqc_report_plots/png/fastqc_raw-status-check-heatmap.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_adapter_content_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_overrepresented_sequences_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_per_base_n_content_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_per_base_sequence_quality_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_per_sequence_gc_content_plot_Counts.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_per_sequence_gc_content_plot_Percentages.png", + 
"multiqc/multiqc_report_plots/png/fastqc_raw_per_sequence_quality_scores_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_sequence_counts_plot-cnt.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_sequence_counts_plot-pct.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_sequence_duplication_levels_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_raw_top_overrepresented_sequences_table.png", + "multiqc/multiqc_report_plots/png/fastqc_sequence_length_distribution_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed-status-check-heatmap.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_overrepresented_sequences_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_per_base_n_content_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_per_base_sequence_quality_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_per_sequence_gc_content_plot_Counts.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_per_sequence_gc_content_plot_Percentages.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_per_sequence_quality_scores_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_sequence_counts_plot-cnt.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_sequence_counts_plot-pct.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_sequence_duplication_levels_plot.png", + "multiqc/multiqc_report_plots/png/fastqc_trimmed_top_overrepresented_sequences_table.png", + "multiqc/multiqc_report_plots/png/general_stats_table.png", + "multiqc/multiqc_report_plots/png/salmon_plot.png", + "multiqc/multiqc_report_plots/png/sample-relationships.png", + "multiqc/multiqc_report_plots/svg", + "multiqc/multiqc_report_plots/svg/cutadapt_filtered_reads_plot-cnt.svg", + "multiqc/multiqc_report_plots/svg/cutadapt_filtered_reads_plot-pct.svg", + "multiqc/multiqc_report_plots/svg/cutadapt_trimmed_sequences_plot_3_Counts.svg", + "multiqc/multiqc_report_plots/svg/cutadapt_trimmed_sequences_plot_3_Obs_Exp.svg", + 
"multiqc/multiqc_report_plots/svg/fastqc_raw-status-check-heatmap.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_adapter_content_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_overrepresented_sequences_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_per_base_n_content_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_per_base_sequence_quality_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_per_sequence_gc_content_plot_Counts.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_per_sequence_gc_content_plot_Percentages.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_per_sequence_quality_scores_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_sequence_counts_plot-cnt.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_sequence_counts_plot-pct.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_sequence_duplication_levels_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_raw_top_overrepresented_sequences_table.svg", + "multiqc/multiqc_report_plots/svg/fastqc_sequence_length_distribution_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed-status-check-heatmap.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_overrepresented_sequences_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_per_base_n_content_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_per_base_sequence_quality_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_per_sequence_gc_content_plot_Counts.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_per_sequence_gc_content_plot_Percentages.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_per_sequence_quality_scores_plot.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_sequence_counts_plot-cnt.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_sequence_counts_plot-pct.svg", + "multiqc/multiqc_report_plots/svg/fastqc_trimmed_sequence_duplication_levels_plot.svg", + 
"multiqc/multiqc_report_plots/svg/fastqc_trimmed_top_overrepresented_sequences_table.svg", + "multiqc/multiqc_report_plots/svg/general_stats_table.svg", + "multiqc/multiqc_report_plots/svg/salmon_plot.svg", + "multiqc/multiqc_report_plots/svg/sample-relationships.svg", + "pipeline_info", + "pipeline_info/nf_core_rnaseq_software_mqc_versions.yml", + "salmon", + "salmon/RAP1_IAA_30M_REP1", + "salmon/RAP1_IAA_30M_REP1/aux_info", + "salmon/RAP1_IAA_30M_REP1/aux_info/ambig_info.tsv", + "salmon/RAP1_IAA_30M_REP1/aux_info/expected_bias.gz", + "salmon/RAP1_IAA_30M_REP1/aux_info/fld.gz", + "salmon/RAP1_IAA_30M_REP1/aux_info/meta_info.json", + "salmon/RAP1_IAA_30M_REP1/aux_info/observed_bias.gz", + "salmon/RAP1_IAA_30M_REP1/aux_info/observed_bias_3p.gz", + "salmon/RAP1_IAA_30M_REP1/cmd_info.json", + "salmon/RAP1_IAA_30M_REP1/libParams", + "salmon/RAP1_IAA_30M_REP1/libParams/flenDist.txt", + "salmon/RAP1_IAA_30M_REP1/lib_format_counts.json", + "salmon/RAP1_IAA_30M_REP1/logs", + "salmon/RAP1_IAA_30M_REP1/logs/salmon_quant.log", + "salmon/RAP1_IAA_30M_REP1/quant.genes.sf", + "salmon/RAP1_IAA_30M_REP1/quant.sf", + "salmon/RAP1_UNINDUCED_REP1", + "salmon/RAP1_UNINDUCED_REP1/aux_info", + "salmon/RAP1_UNINDUCED_REP1/aux_info/ambig_info.tsv", + "salmon/RAP1_UNINDUCED_REP1/aux_info/expected_bias.gz", + "salmon/RAP1_UNINDUCED_REP1/aux_info/fld.gz", + "salmon/RAP1_UNINDUCED_REP1/aux_info/meta_info.json", + "salmon/RAP1_UNINDUCED_REP1/aux_info/observed_bias.gz", + "salmon/RAP1_UNINDUCED_REP1/aux_info/observed_bias_3p.gz", + "salmon/RAP1_UNINDUCED_REP1/cmd_info.json", + "salmon/RAP1_UNINDUCED_REP1/libParams", + "salmon/RAP1_UNINDUCED_REP1/libParams/flenDist.txt", + "salmon/RAP1_UNINDUCED_REP1/lib_format_counts.json", + "salmon/RAP1_UNINDUCED_REP1/logs", + "salmon/RAP1_UNINDUCED_REP1/logs/salmon_quant.log", + "salmon/RAP1_UNINDUCED_REP1/quant.genes.sf", + "salmon/RAP1_UNINDUCED_REP1/quant.sf", + "salmon/RAP1_UNINDUCED_REP2", + "salmon/RAP1_UNINDUCED_REP2/aux_info", + 
"salmon/RAP1_UNINDUCED_REP2/aux_info/ambig_info.tsv", + "salmon/RAP1_UNINDUCED_REP2/aux_info/expected_bias.gz", + "salmon/RAP1_UNINDUCED_REP2/aux_info/fld.gz", + "salmon/RAP1_UNINDUCED_REP2/aux_info/meta_info.json", + "salmon/RAP1_UNINDUCED_REP2/aux_info/observed_bias.gz", + "salmon/RAP1_UNINDUCED_REP2/aux_info/observed_bias_3p.gz", + "salmon/RAP1_UNINDUCED_REP2/cmd_info.json", + "salmon/RAP1_UNINDUCED_REP2/libParams", + "salmon/RAP1_UNINDUCED_REP2/libParams/flenDist.txt", + "salmon/RAP1_UNINDUCED_REP2/lib_format_counts.json", + "salmon/RAP1_UNINDUCED_REP2/logs", + "salmon/RAP1_UNINDUCED_REP2/logs/salmon_quant.log", + "salmon/RAP1_UNINDUCED_REP2/quant.genes.sf", + "salmon/RAP1_UNINDUCED_REP2/quant.sf", + "salmon/WT_REP1", + "salmon/WT_REP1/aux_info", + "salmon/WT_REP1/aux_info/ambig_info.tsv", + "salmon/WT_REP1/aux_info/expected_bias.gz", + "salmon/WT_REP1/aux_info/fld.gz", + "salmon/WT_REP1/aux_info/meta_info.json", + "salmon/WT_REP1/aux_info/observed_bias.gz", + "salmon/WT_REP1/aux_info/observed_bias_3p.gz", + "salmon/WT_REP1/cmd_info.json", + "salmon/WT_REP1/libParams", + "salmon/WT_REP1/libParams/flenDist.txt", + "salmon/WT_REP1/lib_format_counts.json", + "salmon/WT_REP1/logs", + "salmon/WT_REP1/logs/salmon_quant.log", + "salmon/WT_REP1/quant.genes.sf", + "salmon/WT_REP1/quant.sf", + "salmon/WT_REP2", + "salmon/WT_REP2/aux_info", + "salmon/WT_REP2/aux_info/ambig_info.tsv", + "salmon/WT_REP2/aux_info/expected_bias.gz", + "salmon/WT_REP2/aux_info/fld.gz", + "salmon/WT_REP2/aux_info/meta_info.json", + "salmon/WT_REP2/aux_info/observed_bias.gz", + "salmon/WT_REP2/aux_info/observed_bias_3p.gz", + "salmon/WT_REP2/cmd_info.json", + "salmon/WT_REP2/libParams", + "salmon/WT_REP2/libParams/flenDist.txt", + "salmon/WT_REP2/lib_format_counts.json", + "salmon/WT_REP2/logs", + "salmon/WT_REP2/logs/salmon_quant.log", + "salmon/WT_REP2/quant.genes.sf", + "salmon/WT_REP2/quant.sf", + "salmon/deseq2_qc", + "salmon/deseq2_qc/R_sessionInfo.log", + 
"salmon/deseq2_qc/deseq2.dds.RData", + "salmon/deseq2_qc/deseq2.pca.vals.txt", + "salmon/deseq2_qc/deseq2.plots.pdf", + "salmon/deseq2_qc/deseq2.sample.dists.txt", + "salmon/deseq2_qc/size_factors", + "salmon/deseq2_qc/size_factors/RAP1_IAA_30M_REP1.txt", + "salmon/deseq2_qc/size_factors/RAP1_UNINDUCED_REP1.txt", + "salmon/deseq2_qc/size_factors/RAP1_UNINDUCED_REP2.txt", + "salmon/deseq2_qc/size_factors/WT_REP1.txt", + "salmon/deseq2_qc/size_factors/WT_REP2.txt", + "salmon/deseq2_qc/size_factors/deseq2.size_factors.RData", + "salmon/salmon.merged.gene_counts.SummarizedExperiment.rds", + "salmon/salmon.merged.gene_counts.tsv", + "salmon/salmon.merged.gene_counts_length_scaled.SummarizedExperiment.rds", + "salmon/salmon.merged.gene_counts_length_scaled.tsv", + "salmon/salmon.merged.gene_counts_scaled.SummarizedExperiment.rds", + "salmon/salmon.merged.gene_counts_scaled.tsv", + "salmon/salmon.merged.gene_lengths.tsv", + "salmon/salmon.merged.gene_tpm.tsv", + "salmon/salmon.merged.transcript_counts.SummarizedExperiment.rds", + "salmon/salmon.merged.transcript_counts.tsv", + "salmon/salmon.merged.transcript_lengths.tsv", + "salmon/salmon.merged.transcript_tpm.tsv", + "salmon/tx2gene.tsv", + "trimgalore", + "trimgalore/RAP1_IAA_30M_REP1_trimmed_1.fastq.gz_trimming_report.txt", + "trimgalore/RAP1_IAA_30M_REP1_trimmed_2.fastq.gz_trimming_report.txt", + "trimgalore/RAP1_UNINDUCED_REP1_trimmed.fastq.gz_trimming_report.txt", + "trimgalore/RAP1_UNINDUCED_REP2_trimmed.fastq.gz_trimming_report.txt", + "trimgalore/WT_REP1_trimmed_1.fastq.gz_trimming_report.txt", + "trimgalore/WT_REP1_trimmed_2.fastq.gz_trimming_report.txt", + "trimgalore/WT_REP2_trimmed_1.fastq.gz_trimming_report.txt", + "trimgalore/WT_REP2_trimmed_2.fastq.gz_trimming_report.txt" + ], + [ + "cutadapt_filtered_reads_plot.txt:md5,6fa381627f7c1f664f3d4b2cb79cce90", + "cutadapt_trimmed_sequences_plot_3_Counts.txt:md5,13dfa866fd91dbb072689efe9aa83b1f", + 
"cutadapt_trimmed_sequences_plot_3_Obs_Exp.txt:md5,07145dd8dd3db654859b18eb0389046c", + "fastqc_raw-status-check-heatmap.txt:md5,5a89b0d8d162f6b1dbdaf39457bbc03b", + "fastqc_raw_adapter_content_plot.txt:md5,da0389be84cfdd189b1d045212eb2974", + "fastqc_raw_overrepresented_sequences_plot.txt:md5,25d88ea8a72f55e8a374ae802bc7f0b1", + "fastqc_raw_per_base_n_content_plot.txt:md5,d368d7e36ca2f73dcde61f2b486d8213", + "fastqc_raw_per_base_sequence_quality_plot.txt:md5,5c3065b549129702b185ea1b817da420", + "fastqc_raw_per_sequence_gc_content_plot_Counts.txt:md5,9ddaa50167117d3c9188ccf015427704", + "fastqc_raw_per_sequence_gc_content_plot_Percentages.txt:md5,f10ee2881b61308af35f304aa3d810a3", + "fastqc_raw_per_sequence_quality_scores_plot.txt:md5,b5f9a02933e3065952237afd2ec9ce82", + "fastqc_raw_sequence_counts_plot.txt:md5,cbae4979d5db66d3b894abcf8d1c453c", + "fastqc_raw_sequence_duplication_levels_plot.txt:md5,8812cee16f6ca65e2c33635754de1772", + "fastqc_sequence_length_distribution_plot.txt:md5,6fe2c985606abad947bcca99b015ae33", + "fastqc_trimmed-status-check-heatmap.txt:md5,22a03548736b88b23be6bc0c9ef1b4a6", + "fastqc_trimmed_overrepresented_sequences_plot.txt:md5,c755e9d044ea1a82b2c8edde867b4878", + "fastqc_trimmed_per_base_n_content_plot.txt:md5,418610c1ce119cb786ad434db75d366e", + "fastqc_trimmed_per_base_sequence_quality_plot.txt:md5,bd22e06e41c096ad4f745d40fe96a1e5", + "fastqc_trimmed_per_sequence_gc_content_plot_Counts.txt:md5,004c60768ceb6197765154e3eaa37b7a", + "fastqc_trimmed_per_sequence_gc_content_plot_Percentages.txt:md5,95d29060b687f745288ad1ec47750037", + "fastqc_trimmed_per_sequence_quality_scores_plot.txt:md5,0f9834cc19f76dd5c87cf8cba7435a7c", + "fastqc_trimmed_sequence_counts_plot.txt:md5,9fd642bdd1da354f296bb8092205608f", + "fastqc_trimmed_sequence_duplication_levels_plot.txt:md5,0758257b497283b1ef28171e694db6db", + "multiqc_citations.txt:md5,f789abe663d4b4214f0ddeb413a7f150", + "multiqc_cutadapt.txt:md5,583b7b9ba76b26162bb9610ed746454b", + 
"multiqc_fastqc_fastqc_raw.txt:md5,81c3c1a2575a1891a7f2a9637a0f2cc0", + "multiqc_fastqc_fastqc_trimmed.txt:md5,54743154d0e8858980acffeb5b6f6a97", + "ambig_info.tsv:md5,5e9128e825dd0173d1eda78709cebb47", + "expected_bias.gz:md5,3407f87245d0003e0ffbfdf6d8c04f20", + "observed_bias.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "observed_bias_3p.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "cmd_info.json:md5,e89ce91f3cc03732bee42c381f3b1d1b", + "lib_format_counts.json:md5,288063651e63fda4ed95834d252cefd3", + "ambig_info.tsv:md5,12e70e29f44c7786f081e4b59e4ce7ce", + "expected_bias.gz:md5,3407f87245d0003e0ffbfdf6d8c04f20", + "observed_bias.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "observed_bias_3p.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "cmd_info.json:md5,d4db8fa9cb231648076bda1ec4e34114", + "lib_format_counts.json:md5,8a2ab54a8ec1d78be040c9bec57b5101", + "ambig_info.tsv:md5,fe67ac3fc0f0fc813216c09aec21c4e8", + "expected_bias.gz:md5,3407f87245d0003e0ffbfdf6d8c04f20", + "observed_bias.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "observed_bias_3p.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "cmd_info.json:md5,394264c8964da0cdd1c40d1c16995ceb", + "lib_format_counts.json:md5,c1ed7330956d6411d833fed78f1182bd", + "ambig_info.tsv:md5,3bc4e9d9fb9a95086a94c8c01349e67f", + "expected_bias.gz:md5,3407f87245d0003e0ffbfdf6d8c04f20", + "observed_bias.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "observed_bias_3p.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "cmd_info.json:md5,8009941fa85ff1e32128bceafc974ea8", + "lib_format_counts.json:md5,bcb3207290115f621a93a198ec5b6dfd", + "ambig_info.tsv:md5,f7304f8876d8c47aab65205236d7b721", + "expected_bias.gz:md5,3407f87245d0003e0ffbfdf6d8c04f20", + "observed_bias.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "observed_bias_3p.gz:md5,92bcd0592d22a6a58d0360fc76103e56", + "cmd_info.json:md5,3e6d06d032abd3f5cd7e79df1b5bcde0", + "lib_format_counts.json:md5,fce2c4840048c294c016f45f0df15384", + "R_sessionInfo.log:md5,fb0da0d7ad6994ed66a8e68348b19676", + 
"tx2gene.tsv:md5,1be389a28cc26d94b19ea918959ac72e" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-01-21T18:27:30.970398934" + } +} \ No newline at end of file From cbd5201e8b6fff266f9bf9b2dab18cbcc5e93cb6 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:30:25 +0000 Subject: [PATCH 09/24] Restore output comments --- subworkflows/local/prepare_genome/main.nf | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index ce2ce7097..1ebdfc076 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -407,20 +407,20 @@ workflow PREPARE_GENOME { // 16) Emit channels //------------------ emit: - fasta = ch_fasta - gtf = ch_gtf - fai = ch_fai - gene_bed = ch_gene_bed - transcript_fasta = ch_transcript_fasta - chrom_sizes = ch_chrom_sizes - splicesites = ch_splicesites - bbsplit_index = ch_bbsplit_index - rrna_fastas = ch_rrna_fastas - sortmerna_index = ch_sortmerna_index - star_index = ch_star_index - rsem_index = ch_rsem_index - hisat2_index = ch_hisat2_index - salmon_index = ch_salmon_index - kallisto_index = ch_kallisto_index - versions = ch_versions.ifEmpty(null) + fasta = ch_fasta // channel: path(genome.fasta) + gtf = ch_gtf // channel: path(genome.gtf) + fai = ch_fai // channel: path(genome.fai) + gene_bed = ch_gene_bed // channel: path(gene.bed) + transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) + chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) + splicesites = ch_splicesites // channel: path(genome.splicesites.txt) + bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) + rrna_fastas = ch_rrna_fastas // channel: path(sortmerna_fasta_list) + sortmerna_index = ch_sortmerna_index // channel: path(sortmerna/index/) + star_index = ch_star_index // channel: path(star/index/) + rsem_index 
= ch_rsem_index // channel: path(rsem/index/) + hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) + salmon_index = ch_salmon_index // channel: path(salmon/index/) + kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } From 47b292c0ebd9e51bb1cfdf94f24201e57d4f975b Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:32:46 +0000 Subject: [PATCH 10/24] Restore input comments --- subworkflows/local/prepare_genome/main.nf | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 1ebdfc076..da78362cc 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -54,15 +54,15 @@ workflow PREPARE_GENOME { hisat2_index // directory: /path/to/hisat2/index/ bbsplit_index // directory: /path/to/bbsplit/index/ sortmerna_index // directory: /path/to/sortmerna/index/ - gencode // boolean - featurecounts_group_type // string - aligner // string: 'star_salmon', 'star_rsem', 'hisat2' - pseudo_aligner // string (e.g. 'salmon') - skip_gtf_filter // boolean - skip_bbsplit // boolean - skip_sortmerna // boolean - skip_alignment // boolean - skip_pseudo_alignment // boolean + gencode // boolean: whether the genome is from GENCODE + featurecounts_group_type // string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts + aligner // string: Specifies the alignment algorithm to use - available options are 'star_salmon', 'star_rsem' and 'hisat2' + pseudo_aligner // string: Specifies the pseudo aligner to use - available options are 'salmon'. 
Runs in addition to '--aligner' + skip_gtf_filter // boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs + skip_bbsplit // boolean: Skip BBSplit for removal of non-reference genome reads + skip_sortmerna // boolean: Skip sortmerna for removal of reads mapping to sequences in sortmerna_fasta_list + skip_alignment // boolean: Skip all of the alignment-based processes within the pipeline + skip_pseudo_alignment // boolean: Skip all of the pseudoalignment-based processes within the pipeline main: // Versions collector From b5e676b9a7bb5f54315eb1b09cef88ffba9bd0f9 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:34:24 +0000 Subject: [PATCH 11/24] Restore file comment --- subworkflows/local/prepare_genome/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index da78362cc..6ec2f1050 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -1,5 +1,5 @@ // -// Example: Grouping FASTA-dependent steps while preserving "index-provided" logic +// Uncompress and prepare reference genome files // include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip' From b622f53fe2df80b9a9b81cd5f32259eccfd97faf Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:37:59 +0000 Subject: [PATCH 12/24] Restore existence checks --- subworkflows/local/prepare_genome/main.nf | 60 +++++++++++------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 6ec2f1050..a20a16141 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -74,18 +74,18 @@ workflow PREPARE_GENOME { ch_gtf = Channel.empty() if (gtf) { if (gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ([ [:], file(gtf) ]).gunzip.map { it[1] } + ch_gtf = 
GUNZIP_GTF ([ [:], file(gtf, checkIfExists: true) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } else { - ch_gtf = Channel.value(file(gtf)) + ch_gtf = Channel.value(file(gtf, checkIfExists: true)) } } else if (gff) { def ch_gff if (gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF ([ [:], file(gff) ]).gunzip + ch_gff = GUNZIP_GFF ([ [:], file(gff, checkIfExists: true) ]).gunzip ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) } else { - ch_gff = Channel.value(file(gff)).map { [ [:], it ] } + ch_gff = Channel.value(file(gff, checkIfExists: true)).map { [ [:], it ] } } ch_gtf = GFFREAD(ch_gff, []).gtf.map { it[1] } ch_versions = ch_versions.mix(GFFREAD.out.versions) @@ -100,10 +100,10 @@ workflow PREPARE_GENOME { if (fasta_provided) { // Uncompress FASTA if needed if (fasta.endsWith('.gz')) { - ch_fasta = GUNZIP_FASTA ([ [:], file(fasta) ]).gunzip.map { it[1] } + ch_fasta = GUNZIP_FASTA ([ [:], file(fasta, checkIfExists: true) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) } else { - ch_fasta = Channel.value(file(fasta)) + ch_fasta = Channel.value(file(fasta, checkIfExists: true)) } } @@ -128,10 +128,10 @@ workflow PREPARE_GENOME { ch_add_fasta = Channel.empty() if (fasta_provided && additional_fasta) { if (additional_fasta.endsWith('.gz')) { - ch_add_fasta = GUNZIP_ADDITIONAL_FASTA([ [:], file(additional_fasta) ]).gunzip.map { it[1] } + ch_add_fasta = GUNZIP_ADDITIONAL_FASTA([ [:], file(additional_fasta, checkIfExists: true) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions) } else { - ch_add_fasta = Channel.value(file(additional_fasta)) + ch_add_fasta = Channel.value(file(additional_fasta, checkIfExists: true)) } CUSTOM_CATADDITIONALFASTA( @@ -150,10 +150,10 @@ workflow PREPARE_GENOME { ch_gene_bed = Channel.empty() if (gene_bed) { if (gene_bed.endsWith('.gz')) { - ch_gene_bed = GUNZIP_GENE_BED ([ [:], file(gene_bed) ]).gunzip.map { it[1] } + ch_gene_bed = 
GUNZIP_GENE_BED ([ [:], file(gene_bed, checkIfExists: true) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) } else { - ch_gene_bed = Channel.value(file(gene_bed)) + ch_gene_bed = Channel.value(file(gene_bed, checkIfExists: true)) } } else { ch_gene_bed = GTF2BED(ch_gtf).bed @@ -169,10 +169,10 @@ workflow PREPARE_GENOME { if (transcript_fasta) { // Use user-provided transcript FASTA if (transcript_fasta.endsWith('.gz')) { - ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ([ [:], file(transcript_fasta) ]).gunzip.map { it[1] } + ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ([ [:], file(transcript_fasta, checkIfExists: true) ]).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions) } else { - ch_transcript_fasta = Channel.value(file(transcript_fasta)) + ch_transcript_fasta = Channel.value(file(transcript_fasta, checkIfExists: true)) } if (gencode) { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE(ch_transcript_fasta) @@ -214,18 +214,18 @@ workflow PREPARE_GENOME { if (bbsplit_index) { // Use user-provided bbsplit index if (bbsplit_index.endsWith('.tar.gz')) { - ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ([ [:], file(bbsplit_index) ]).untar.map { it[1] } + ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ([ [:], file(bbsplit_index, checkIfExists: true) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_BBSPLIT_INDEX.out.versions) } else { - ch_bbsplit_index = Channel.value(file(bbsplit_index)) + ch_bbsplit_index = Channel.value(file(bbsplit_index, checkIfExists: true)) } } else if (fasta_provided) { // Build it from scratch if we have FASTA Channel - .from(file(bbsplit_fasta_list)) + .from(file(bbsplit_fasta_list, checkIfExists: true)) .splitCsv() - .flatMap { id, fafile -> [ [ 'id', id ], [ 'fasta', file(fafile) ] ] } + .flatMap { id, fafile -> [ [ 'id', id ], [ 'fasta', file(fafile, checkIfExists: true) ] ] } .groupTuple() .map { it -> it[1] } .collect { [ it ] } @@ -256,10 +256,10 @@ workflow PREPARE_GENOME { if 
(sortmerna_index) { if (sortmerna_index.endsWith('.tar.gz')) { - ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ([ [:], file(sortmerna_index) ]).untar.map { it[1] } + ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ([ [:], file(sortmerna_index, checkIfExists: true) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_SORTMERNA_INDEX.out.versions) } else { - ch_sortmerna_index = Channel.value([ [:], file(sortmerna_index) ]) + ch_sortmerna_index = Channel.value([ [:], file(sortmerna_index, checkIfExists: true) ]) } } else { // Build new SortMeRNA index from the rRNA references @@ -280,17 +280,17 @@ workflow PREPARE_GENOME { if ('star_salmon' in prepare_tool_indices) { if (star_index) { if (star_index.endsWith('.tar.gz')) { - ch_star_index = UNTAR_STAR_INDEX ([ [:], file(star_index) ]).untar.map { it[1] } + ch_star_index = UNTAR_STAR_INDEX ([ [:], file(star_index, checkIfExists: true) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) } else { - ch_star_index = Channel.value(file(star_index)) + ch_star_index = Channel.value(file(star_index, checkIfExists: true)) } } else if (fasta_provided) { // Build new STAR index // Possibly check AWS iGenome conditions def is_aws_igenome = false - if (file(fasta).getName() - '.gz' == 'genome.fa' && file(gtf).getName() - '.gz' == 'genes.gtf') { + if (file(fasta, checkIfExists: true).getName() - '.gz' == 'genome.fa' && file(gtf, checkIfExists: true).getName() - '.gz' == 'genes.gtf') { is_aws_igenome = true } if (is_aws_igenome) { @@ -313,10 +313,10 @@ workflow PREPARE_GENOME { if ('star_rsem' in prepare_tool_indices) { if (rsem_index) { if (rsem_index.endsWith('.tar.gz')) { - ch_rsem_index = UNTAR_RSEM_INDEX ([ [:], file(rsem_index) ]).untar.map { it[1] } + ch_rsem_index = UNTAR_RSEM_INDEX ([ [:], file(rsem_index, checkIfExists: true) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_RSEM_INDEX.out.versions) } else { - ch_rsem_index = Channel.value(file(rsem_index)) + ch_rsem_index = 
Channel.value(file(rsem_index, checkIfExists: true)) } } else if (fasta_provided) { @@ -333,7 +333,7 @@ workflow PREPARE_GENOME { if ('hisat2' in prepare_tool_indices) { // splicesites if (splicesites) { - ch_splicesites = Channel.value(file(splicesites)) + ch_splicesites = Channel.value(file(splicesites, checkIfExists: true)) } else if (fasta_provided) { ch_splicesites = HISAT2_EXTRACTSPLICESITES(ch_gtf.map { [ [:], it ] }).txt.map { it[1] } @@ -342,10 +342,10 @@ workflow PREPARE_GENOME { // the index if (hisat2_index) { if (hisat2_index.endsWith('.tar.gz')) { - ch_hisat2_index = UNTAR_HISAT2_INDEX ([ [:], file(hisat2_index) ]).untar.map { it[1] } + ch_hisat2_index = UNTAR_HISAT2_INDEX ([ [:], file(hisat2_index, checkIfExists: true) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_HISAT2_INDEX.out.versions) } else { - ch_hisat2_index = Channel.value(file(hisat2_index)) + ch_hisat2_index = Channel.value(file(hisat2_index, checkIfExists: true)) } } else if (fasta_provided) { @@ -366,10 +366,10 @@ workflow PREPARE_GENOME { if (salmon_index) { // use user-provided salmon index if (salmon_index.endsWith('.tar.gz')) { - ch_salmon_index = UNTAR_SALMON_INDEX ([ [:], file(salmon_index) ]).untar.map { it[1] } + ch_salmon_index = UNTAR_SALMON_INDEX ([ [:], file(salmon_index, checkIfExists: true) ]).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) } else { - ch_salmon_index = Channel.value(file(salmon_index)) + ch_salmon_index = Channel.value(file(salmon_index, checkIfExists: true)) } } else if (ch_transcript_fasta && fasta_provided) { @@ -391,10 +391,10 @@ workflow PREPARE_GENOME { if ('kallisto' in prepare_tool_indices) { if (kallisto_index) { if (kallisto_index.endsWith('.tar.gz')) { - ch_kallisto_index = UNTAR_KALLISTO_INDEX ([ [:], file(kallisto_index) ]).untar + ch_kallisto_index = UNTAR_KALLISTO_INDEX ([ [:], file(kallisto_index, checkIfExists: true) ]).untar ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) } 
else { - ch_kallisto_index = Channel.value([ [:], file(kallisto_index) ]) + ch_kallisto_index = Channel.value([ [:], file(kallisto_index, checkIfExists: true) ]) } } else if (ch_transcript_fasta) { From 0fdf742eddcc9e71b1b4424d08cab50a2f476558 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:50:17 +0000 Subject: [PATCH 13/24] Remove some unecessary changes --- subworkflows/local/prepare_genome/main.nf | 55 ++++++++++++++++------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index a20a16141..6051d7eff 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -135,7 +135,7 @@ workflow PREPARE_GENOME { } CUSTOM_CATADDITIONALFASTA( - ch_fasta.combine(ch_gtf).map { f, g -> [ [:], f, g ] }, + ch_fasta.combine(ch_gtf).map { fasta, gtf -> [ [:], fasta, gtf ] }, ch_add_fasta.map { [ [:], it ] }, gencode ? "gene_type" : featurecounts_group_type ) @@ -224,11 +224,11 @@ workflow PREPARE_GENOME { // Build it from scratch if we have FASTA Channel .from(file(bbsplit_fasta_list, checkIfExists: true)) - .splitCsv() - .flatMap { id, fafile -> [ [ 'id', id ], [ 'fasta', file(fafile, checkIfExists: true) ] ] } + .splitCsv() // Read in 2 column csv file: short_name,path_to_fasta + .flatMap { id, fafile -> [ [ 'id', id ], [ 'fasta', file(fafile, checkIfExists: true) ] ] } // Flatten entries to be able to groupTuple by a common key .groupTuple() - .map { it -> it[1] } - .collect { [ it ] } + .map { it -> it[1] } // Get rid of keys and keep grouped values + .collect { [ it ] } // Collect entries as a list to pass as "tuple val(short_names), path(path_to_fasta)" to module .set { ch_bbsplit_fasta_list } ch_bbsplit_index = BBMAP_BBSPLIT( @@ -362,6 +362,30 @@ workflow PREPARE_GENOME { // 14) Salmon index -> can skip genome if transcript_fasta is enough //------------------------------------------------------ 
ch_salmon_index = Channel.empty() + // + // Uncompress Salmon index or generate from scratch if required + // + ch_salmon_index = Channel.empty() + if (salmon_index) { + if (salmon_index.endsWith('.tar.gz')) { + ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], salmon_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) + } else { + ch_salmon_index = Channel.value(file(salmon_index)) + } + } else if ('salmon' in prepare_tool_indices) { + if (ch_transcript_fasta && fasta_provided) { + // build from transcript FASTA + genome FASTA + ch_salmon_index = SALMON_INDEX(ch_fasta, ch_transcript_fasta).index + ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) + } + else if (ch_transcript_fasta) { + // some Salmon module can run with just a transcript FASTA + ch_salmon_index = SALMON_INDEX([], ch_transcript_fasta).index + ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) + } + } + if ('salmon' in prepare_tool_indices) { if (salmon_index) { // use user-provided salmon index @@ -388,18 +412,17 @@ workflow PREPARE_GENOME { // 15) Kallisto index -> only needs transcript FASTA //-------------------------------------------------- ch_kallisto_index = Channel.empty() - if ('kallisto' in prepare_tool_indices) { - if (kallisto_index) { - if (kallisto_index.endsWith('.tar.gz')) { - ch_kallisto_index = UNTAR_KALLISTO_INDEX ([ [:], file(kallisto_index, checkIfExists: true) ]).untar - ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) - } else { - ch_kallisto_index = Channel.value([ [:], file(kallisto_index, checkIfExists: true) ]) - } + if (kallisto_index) { + if (kallisto_index.endsWith('.tar.gz')) { + ch_kallisto_index = UNTAR_KALLISTO_INDEX ( [ [:], kallisto_index ] ).untar + ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) + } else { + ch_kallisto_index = Channel.value([[:], file(kallisto_index)]) } - else if (ch_transcript_fasta) { - ch_kallisto_index = KALLISTO_INDEX(ch_transcript_fasta.map { [ [:], it 
] }).index - ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) + } else { + if ('kallisto' in prepare_tool_indices) { + ch_kallisto_index = KALLISTO_INDEX ( ch_transcript_fasta.map { [ [:], it] } ).index + ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) } } From f139bbede4994ad79150d5102e2d4d388178470a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 18:55:08 +0000 Subject: [PATCH 14/24] Update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83e2bc3b4..a9c12a685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [PR #1480](https://github.com/nf-core/rnaseq/pull/1480) - Bump version after release 3.18.0 - [PR #1482](https://github.com/nf-core/rnaseq/pull/1482) - Update trimgalore module for save_unpaired fix -- [pR #1486](https://github.com/nf-core/rnaseq/pull/1486) - Bump STAR build for multiprocessing fix +- [PR #1486](https://github.com/nf-core/rnaseq/pull/1486) - Bump STAR build for multiprocessing fix +- [PR #1490](https://github.com/nf-core/rnaseq/pull/1490) - Make genomic FASTA input optional # 3.18.0 - 2024-12-19 From a9684ea3c1d6566d65b81bef7248eddde2115803 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Jan 2025 19:02:10 +0000 Subject: [PATCH 15/24] Remove duplicate section --- subworkflows/local/prepare_genome/main.nf | 27 +---------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 6051d7eff..f4c82778e 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -361,10 +361,7 @@ workflow PREPARE_GENOME { //------------------------------------------------------ // 14) Salmon index -> can skip genome if transcript_fasta is enough 
//------------------------------------------------------ - ch_salmon_index = Channel.empty() - // - // Uncompress Salmon index or generate from scratch if required - // + ch_salmon_index = Channel.empty() if (salmon_index) { if (salmon_index.endsWith('.tar.gz')) { @@ -386,28 +383,6 @@ workflow PREPARE_GENOME { } } - if ('salmon' in prepare_tool_indices) { - if (salmon_index) { - // use user-provided salmon index - if (salmon_index.endsWith('.tar.gz')) { - ch_salmon_index = UNTAR_SALMON_INDEX ([ [:], file(salmon_index, checkIfExists: true) ]).untar.map { it[1] } - ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) - } else { - ch_salmon_index = Channel.value(file(salmon_index, checkIfExists: true)) - } - } - else if (ch_transcript_fasta && fasta_provided) { - // build from transcript FASTA + genome FASTA - ch_salmon_index = SALMON_INDEX(ch_fasta, ch_transcript_fasta).index - ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) - } - else if (ch_transcript_fasta) { - // some Salmon module can run with just a transcript FASTA - ch_salmon_index = SALMON_INDEX([], ch_transcript_fasta).index - ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) - } - } - //-------------------------------------------------- // 15) Kallisto index -> only needs transcript FASTA //-------------------------------------------------- From 35ec56c8c5a87f7effb6ed21420ec5ffcccf8b2f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 13:15:18 +0000 Subject: [PATCH 16/24] Fix for tweaked filtered GTF name --- subworkflows/local/prepare_genome/tests/main.nf.test.snap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/prepare_genome/tests/main.nf.test.snap b/subworkflows/local/prepare_genome/tests/main.nf.test.snap index f5c0a10f7..220ea99f8 100644 --- a/subworkflows/local/prepare_genome/tests/main.nf.test.snap +++ b/subworkflows/local/prepare_genome/tests/main.nf.test.snap @@ -451,7 +451,7 @@ 
"/ngi-igenomes/testdata/nf-core/pipelines/rnaseq/3.15/reference/genome.fasta" ], [ - "genome.filtered.gtf:md5,ef6fccd153a21c329670462d602ed2d0" + "genes_with_empty_tid.filtered.gtf:md5,ef6fccd153a21c329670462d602ed2d0" ], [ "genome.fasta.fai:md5,2cd76d936cbfa386b14154506c2041b2" @@ -672,7 +672,7 @@ "genome.filtered.bed:md5,d41d8cd98f00b204e9800998ecf8427e" ], [ - "genome.filtered.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + "genes_with_empty_tid.filtered.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" ], [ From 0d4ef8f36f1fed2dd8a5110631e6fee08f1beee9 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 13:15:50 +0000 Subject: [PATCH 17/24] Fix for tweaked filtered GTF name --- subworkflows/local/prepare_genome/tests/main.nf.test.snap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/prepare_genome/tests/main.nf.test.snap b/subworkflows/local/prepare_genome/tests/main.nf.test.snap index 220ea99f8..51cd16449 100644 --- a/subworkflows/local/prepare_genome/tests/main.nf.test.snap +++ b/subworkflows/local/prepare_genome/tests/main.nf.test.snap @@ -457,7 +457,7 @@ "genome.fasta.fai:md5,2cd76d936cbfa386b14154506c2041b2" ], [ - "genome.filtered.bed:md5,e507dc33673e76c32abe344f4dc07952" + "genes_with_empty_tid.filtered.bed:md5,e507dc33673e76c32abe344f4dc07952" ], [ "genome.fasta.sizes:md5,29218009212157c49dbc6596621ec780" @@ -669,7 +669,7 @@ "/ngi-igenomes/testdata/nf-core/pipelines/rnaseq/3.15/reference/genome.fasta" ], [ - "genome.filtered.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + "genes_with_empty_tid.filtered.bed:md5,d41d8cd98f00b204e9800998ecf8427e" ], [ "genes_with_empty_tid.filtered.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" From ae062b939a1c24c38abd791968d012d865fbbeca Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 14:02:50 +0000 Subject: [PATCH 18/24] Update docs --- docs/usage.md | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/usage.md 
b/docs/usage.md index 48cafce85..74e4f9839 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -128,7 +128,7 @@ The `--aligner hisat2` option is not currently supported using ARM architecture By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e. `--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations. -You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. +You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. 
The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. By default, even with `--skip_alignment` set, Salmon will still use the genomic FASTA file, providing the sequences as 'decoys' (see [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)), and this is the recommended mode of operation in this situation. However, if you do not supply a genomic FASTA file, Salmon will run without those decoys, using only transcript sequences in the index. The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). Similarly, strandedness is taken from the sample sheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args` like `--extra_kallisto_quant_args '--fr-stranded'` see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual).
@@ -209,7 +209,7 @@ When supplying reference files as discussed below, it is important to be consist ### Explicit reference file specification (recommended) -The minimum reference genome requirements for this pipeline are a FASTA and GTF file, all other files required to run the pipeline can be generated from these files. For example, the latest reference files for human can be derived from Ensembl like: +The minimum reference genome requirements for this pipeline are a FASTA file (genome and/ or trnascriptome) and GTF file, all other files required to run the pipeline can be generated from these files. For example, the latest reference files for human can be derived from Ensembl like: ``` latest_release=$(curl -s 'http://rest.ensembl.org/info/software?content-type=application/json' | grep -o '"release":[0-9]*' | cut -d: -f2) @@ -227,6 +227,7 @@ Notes: - If `--gene_bed` is not provided then it will be generated from the GTF file. - If `--additional_fasta` is provided then the features in this file (e.g. ERCC spike-ins) will be automatically concatenated onto both the reference FASTA file as well as the GTF annotation before building the appropriate indices. - When using `--aligner star_rsem`, both the STAR and RSEM indices should be present in the path specified by `--rsem_index` (see [#568](https://github.com/nf-core/rnaseq/issues/568)). +- If the `--skip_alignment` option is used along with `--transcript_fasta`, the pipeline can technically run without providing the genomic FASTA (`--fasta`). However, this approach is **not recommended**, as any dynamically generated Salmon index will lack decoys. To ensure optimal indexing with decoys, it is **highly recommended** to include the genomic FASTA (`--fasta`) whenever possible—unless a pre-existing decoy-aware Salmon index is supplied. 
For more details on the benefits of decoy-aware indexing, refer to the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode). #### Reference genome @@ -304,7 +305,7 @@ Notes: ### GTF filtering -By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter. +By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file (where supplied), and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter. ## Contamination screening options @@ -332,6 +333,21 @@ nextflow run \ -profile docker ``` +You can also run without a genomic FASTA file, provided you skip the alignment step and provide a transcriptome FASTA directly: + +```bash +nextflow run \ + nf-core/rnaseq \ + --input \ + --outdir \ + --gtf \ + --transcript_fasta \ + --skip_alignment \ + -profile docker +``` + +This is not usually recommended unless you also supply a previously generated decoy-aware Salmon transcriptome. + > **NB:** Loading iGenomes configuration remains the default for reasons of consistency with other workflows, but should be disabled when not using iGenomes, applying the recommended usage above. 
This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. From efb8e077ca793278aa6c6a530737663485c1933d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 14:11:28 +0000 Subject: [PATCH 19/24] Temporarily disable 'latest-everything' testing due to incompatibilities with nf-core --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7a620df22..0d3cb8255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,7 +51,6 @@ jobs: matrix: NXF_VER: - "24.04.2" - - "latest-everything" nf_test_files: ["${{ fromJson(needs.nf-test-changes.outputs.nf_test_files) }}"] profile: - "docker" From 445ca7d3b5d7a99298db421775a255dede4f704d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 14:43:57 +0000 Subject: [PATCH 20/24] Apply suggestions from code review Co-authored-by: Maxime U Garcia --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 74e4f9839..ede440311 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -209,7 +209,7 @@ When supplying reference files as discussed below, it is important to be consist ### Explicit reference file specification (recommended) -The minimum reference genome requirements for this pipeline are a FASTA file (genome and/ or trnascriptome) and GTF file, all other files required to run the pipeline can be generated from these files. For example, the latest reference files for human can be derived from Ensembl like: +The minimum reference genome requirements for this pipeline are a FASTA file (genome and/or transcriptome) and a GTF file; all other files required to run the pipeline can be generated from these files.
For example, the latest reference files for human can be derived from Ensembl like: ``` latest_release=$(curl -s 'http://rest.ensembl.org/info/software?content-type=application/json' | grep -o '"release":[0-9]*' | cut -d: -f2) @@ -227,7 +227,7 @@ Notes: - If `--gene_bed` is not provided then it will be generated from the GTF file. - If `--additional_fasta` is provided then the features in this file (e.g. ERCC spike-ins) will be automatically concatenated onto both the reference FASTA file as well as the GTF annotation before building the appropriate indices. - When using `--aligner star_rsem`, both the STAR and RSEM indices should be present in the path specified by `--rsem_index` (see [#568](https://github.com/nf-core/rnaseq/issues/568)). -- If the `--skip_alignment` option is used along with `--transcript_fasta`, the pipeline can technically run without providing the genomic FASTA (`--fasta`). However, this approach is **not recommended**, as any dynamically generated Salmon index will lack decoys. To ensure optimal indexing with decoys, it is **highly recommended** to include the genomic FASTA (`--fasta`) whenever possible—unless a pre-existing decoy-aware Salmon index is supplied. For more details on the benefits of decoy-aware indexing, refer to the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode). +- If the `--skip_alignment` option is used along with `--transcript_fasta`, the pipeline can technically run without providing the genomic FASTA (`--fasta`). However, this approach is **not recommended**, as any dynamically generated Salmon index will lack decoys. To ensure optimal indexing with decoys, it is **highly recommended** to include the genomic FASTA (`--fasta`) whenever possible, unless a pre-existing decoy-aware Salmon index is supplied. 
For more details on the benefits of decoy-aware indexing, refer to the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode). #### Reference genome From c45fbe5b7cf7e49839d9e035ebb91f20d4fcfb98 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 14:51:42 +0000 Subject: [PATCH 21/24] Apply suggestions from code review --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index ede440311..3ae0a366f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -128,7 +128,7 @@ The `--aligner hisat2` option is not currently supported using ARM architecture By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e. `--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations. -You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. 
By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. By default, even `--skip_alignment set` Salmon will still use the genomic FASTA file, providing the sequences as 'decoys' (see [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)), and this is the recommended mode of operation in this situation. However, if you do not supply a FASTA file, Salmon will run without those decoys, using only transcript sequences in the index. +You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. By default, when specifying `--pseudo_aligner salmon` without an index, even with `--skip_alignment set` Salmon will still use the genomic FASTA file when building an index, providing the sequences as 'decoys' (see [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)), and this is the recommended mode of operation in this situation. 
However, if you do not supply a FASTA file, Salmon will run without those decoys, using only transcript sequences in the index. The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). Similarly, strandedness is taken from the sample sheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args` like `--extra_kallisto_quant_args '--fr-stranded'` see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual). @@ -227,7 +227,7 @@ Notes: - If `--gene_bed` is not provided then it will be generated from the GTF file. - If `--additional_fasta` is provided then the features in this file (e.g. ERCC spike-ins) will be automatically concatenated onto both the reference FASTA file as well as the GTF annotation before building the appropriate indices. - When using `--aligner star_rsem`, both the STAR and RSEM indices should be present in the path specified by `--rsem_index` (see [#568](https://github.com/nf-core/rnaseq/issues/568)). -- If the `--skip_alignment` option is used along with `--transcript_fasta`, the pipeline can technically run without providing the genomic FASTA (`--fasta`). However, this approach is **not recommended**, as any dynamically generated Salmon index will lack decoys. To ensure optimal indexing with decoys, it is **highly recommended** to include the genomic FASTA (`--fasta`) whenever possible, unless a pre-existing decoy-aware Salmon index is supplied. 
For more details on the benefits of decoy-aware indexing, refer to the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode). +- If the `--skip_alignment` option is used along with `--transcript_fasta`, the pipeline can technically run without providing the genomic FASTA (`--fasta`). However, this approach is **not recommended** with `--pseudo_aligner salmon`, as any dynamically generated Salmon index will lack decoys. To ensure optimal indexing with decoys, it is **highly recommended** to include the genomic FASTA (`--fasta`) with Salmon, unless a pre-existing decoy-aware Salmon index is supplied. For more details on the benefits of decoy-aware indexing, refer to the [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode). #### Reference genome @@ -346,7 +346,7 @@ nextflow run \ -profile docker ``` -This is not usually recommended unless you also supply a previously generated decoy-aware Salmon transcriptome. +This is not usually recommended with Salmon unless you also supply a previously generated decoy-aware Salmon transcriptome. > **NB:** Loading iGenomes configuration remains the default for reasons of consistency with other workflows, but should be disabled when not using iGenomes, applying the recommended usage above. 
From bd585b071285e79c0b43910a328f8b56e3366fd0 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 16:33:09 +0000 Subject: [PATCH 22/24] Fix file names in snap --- .../prepare_genome/tests/main.nf.test.snap | 244 +++++++++--------- 1 file changed, 122 insertions(+), 122 deletions(-) diff --git a/subworkflows/local/prepare_genome/tests/main.nf.test.snap b/subworkflows/local/prepare_genome/tests/main.nf.test.snap index 51cd16449..a53a9ba4f 100644 --- a/subworkflows/local/prepare_genome/tests/main.nf.test.snap +++ b/subworkflows/local/prepare_genome/tests/main.nf.test.snap @@ -48,10 +48,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:42:50.037244182" + "timestamp": "2025-01-22T16:31:30.831489059" }, "skip_pseudo_alignment - stub": { "content": [ @@ -103,10 +103,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:36:51.108422744" + "timestamp": "2025-01-22T16:26:39.589338794" }, "skip_gtf_filter": { "content": [ @@ -157,10 +157,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:21:35.428370538" + "timestamp": "2025-01-22T16:17:41.356308487" }, "gencode = false - stub": { "content": [ @@ -212,10 +212,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:34:45.22576799" + "timestamp": "2025-01-22T16:25:11.750062191" }, "gff = false - stub": { "content": [ @@ -267,10 +267,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:37:50.119432994" + "timestamp": "2025-01-22T16:27:27.479980116" }, "skip_pseudoalignment = true - stub": { "content": [ @@ -322,10 +322,10 @@ ] ], "meta": { - "nf-test": 
"0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:44:30.027925166" + "timestamp": "2025-01-22T16:32:43.04638782" }, "featurecounts_group_type = 'gene_type' - stub": { "content": [ @@ -377,10 +377,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:42:09.256212794" + "timestamp": "2025-01-22T16:31:08.944159607" }, "gtf = false": { "content": [ @@ -434,10 +434,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:24:03.080886311" + "timestamp": "2025-01-22T16:19:16.226455835" }, "gfp = false": { "content": [ @@ -488,10 +488,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:25:11.277955385" + "timestamp": "2025-01-22T16:20:01.366185938" }, "skip_bbsplit = true": { "content": [ @@ -542,10 +542,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:32:08.044528172" + "timestamp": "2025-01-22T16:23:43.324382545" }, "salmon_index = false - stub": { "content": [ @@ -597,10 +597,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:40:20.088184403" + "timestamp": "2025-01-22T16:29:42.989824541" }, "skip_alignment": { "content": [ @@ -652,10 +652,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:22:31.848885428" + "timestamp": "2025-01-22T16:18:25.928617257" }, "gfp = false - stub": { "content": [ @@ -669,10 +669,10 @@ "/ngi-igenomes/testdata/nf-core/pipelines/rnaseq/3.15/reference/genome.fasta" ], [ - 
"genes_with_empty_tid.filtered.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + "genome.filtered.bed:md5,d41d8cd98f00b204e9800998ecf8427e" ], [ - "genes_with_empty_tid.filtered.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + "genome.filtered.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" ], [ @@ -706,10 +706,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:38:24.011200927" + "timestamp": "2025-01-22T16:27:50.906825801" }, "gencode = false": { "content": [ @@ -761,10 +761,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:21:09.927667083" + "timestamp": "2025-01-22T16:17:19.152939508" }, "default options": { "content": [ @@ -816,10 +816,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:20:44.91884847" + "timestamp": "2025-01-22T16:16:39.558002005" }, "gencode = true - stub": { "content": [ @@ -872,10 +872,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:41:38.130901233" + "timestamp": "2025-01-22T16:30:42.966501563" }, "skip_alignment - stub": { "content": [ @@ -927,10 +927,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:36:19.887113789" + "timestamp": "2025-01-22T16:26:16.37556928" }, "skip_bbsplit = true - stub": { "content": [ @@ -1040,10 +1040,10 @@ } ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:43:15.317916936" + "timestamp": "2025-01-22T16:31:58.139538299" }, "transcriptome = false": { "content": [ @@ -1096,10 +1096,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + 
"nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:26:03.082922797" + "timestamp": "2025-01-22T16:20:43.257059452" }, "skip_pseudoalignment = true": { "content": [ @@ -1151,10 +1151,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:33:32.945973388" + "timestamp": "2025-01-22T16:24:28.57936905" }, "skip_gtf_filter - stub": { "content": [ @@ -1205,10 +1205,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:35:08.735012633" + "timestamp": "2025-01-22T16:25:32.773764178" }, "gencode = true": { "content": [ @@ -1261,10 +1261,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:29:54.92546344" + "timestamp": "2025-01-22T16:22:36.190529048" }, "hisat2_index = false": { "content": [ @@ -1316,10 +1316,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:29:08.52615029" + "timestamp": "2025-01-22T16:22:13.125436103" }, "rsem_index = false - stub": { "content": [ @@ -1371,10 +1371,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:39:53.29274644" + "timestamp": "2025-01-22T16:29:13.942291871" }, "featurecounts_group_type = 'gene_type'": { "content": [ @@ -1426,10 +1426,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:30:38.236090892" + "timestamp": "2025-01-22T16:22:58.703990578" }, "with bed - stub": { "content": [ @@ -1480,10 +1480,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:39:24.95243753" + "timestamp": 
"2025-01-22T16:28:44.518815375" }, "skip_pseudo_alignment": { "content": [ @@ -1535,10 +1535,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:23:12.222452353" + "timestamp": "2025-01-22T16:18:50.035264983" }, "skip_bbsplit": { "content": [ @@ -1589,10 +1589,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:22:04.417171396" + "timestamp": "2025-01-22T16:18:03.05536554" }, "with bed": { "content": [ @@ -1643,10 +1643,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:26:52.427274463" + "timestamp": "2025-01-22T16:21:05.592448003" }, "gtf = false - stub": { "content": [ @@ -1700,10 +1700,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:37:24.089579664" + "timestamp": "2025-01-22T16:27:04.167079535" }, "gff = false": { "content": [ @@ -1755,10 +1755,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:24:29.339763401" + "timestamp": "2025-01-22T16:19:39.929655668" }, "default options - stub": { "content": [ @@ -1810,10 +1810,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:34:12.631669075" + "timestamp": "2025-01-22T16:24:49.758178777" }, "salmon_index = false": { "content": [ @@ -1865,10 +1865,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:28:32.12627675" + "timestamp": "2025-01-22T16:21:50.632563783" }, "rsem_index = false": { "content": [ @@ -1920,10 +1920,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - 
"nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:27:44.114765753" + "timestamp": "2025-01-22T16:21:28.064555135" }, "skip_alignment = true - stub": { "content": [ @@ -1975,10 +1975,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:43:56.303595701" + "timestamp": "2025-01-22T16:32:20.147025523" }, "transcriptome = false - stub": { "content": [ @@ -2031,10 +2031,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:38:52.064137724" + "timestamp": "2025-01-22T16:28:14.178838335" }, "skip_alignment = true": { "content": [ @@ -2086,10 +2086,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:32:57.856332379" + "timestamp": "2025-01-22T16:24:06.255647938" }, "skip_gtf_filter = true": { "content": [ @@ -2140,10 +2140,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:31:25.686724069" + "timestamp": "2025-01-22T16:23:20.680526383" }, "hisat2_index = false - stub": { "content": [ @@ -2195,10 +2195,10 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:40:52.285127199" + "timestamp": "2025-01-22T16:30:12.605234376" }, "skip_bbsplit - stub": { "content": [ @@ -2308,9 +2308,9 @@ } ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "24.10.4" }, - "timestamp": "2024-10-21T14:35:46.545484011" + "timestamp": "2025-01-22T16:25:54.053589157" } } \ No newline at end of file From f07b1b1ef8fad69bdf189dc7eb10f1e5bace622c Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 16:44:20 +0000 Subject: [PATCH 23/24] 
Update usage.md --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 3ae0a366f..c30ab0ae1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -128,11 +128,11 @@ The `--aligner hisat2` option is not currently supported using ARM architecture By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e. `--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations. -You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. 
By default, when specifying `--pseudo_aligner salmon` without an index, even with `--skip_alignment set` Salmon will still use the genomic FASTA file when building an index, providing the sequences as 'decoys' (see [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)), and this is the recommended mode of operation in this situation. However, if you do not supply a FASTA file, Salmon will run without those decoys, using only transcript sequences in the index. +You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). 
Similarly, strandedness is taken from the sample sheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args` like `--extra_kallisto_quant_args '--fr-stranded'` see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual). -When running Salmon in mapping-based mode via `--pseudo_aligner salmon` the entire genome of the organism is used by default for the decoy-aware transcriptome when creating the indices (see second bulleted option in [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)). +When running Salmon in mapping-based mode via `--pseudo_aligner salmon`, supplying a genome fasta via `--fasta` and not supplying a Salmon index, the entire genome of the organism is used by default for the decoy-aware transcriptome when creating the indices, as is recommended (see second bulleted option in [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)). If you do not supply a FASTA file or an index, Salmon will index without those decoys, using only transcript sequences in the index. This second option is not usually recommended, but may be useful in limited circumstances. Note that Kallisto does not index with genomic sequences. Two additional parameters `--extra_star_align_args` and `--extra_salmon_quant_args` were added in v3.10 of the pipeline that allow you to append any custom parameters to the STAR align and Salmon quant commands, respectively. Note, the `--seqBias` and `--gcBias` are not provided to Salmon quant by default so you can provide these via `--extra_salmon_quant_args '--seqBias --gcBias'` if required. You can now also supply additional arguments to Kallisto via `--extra_kallisto_quant_args`. 
@@ -346,7 +346,7 @@ nextflow run \ -profile docker ``` -This is not usually recommended with Salmon unless you also supply a previously generated decoy-aware Salmon transcriptome. +This is not usually recommended with Salmon unless you also supply a previously generated decoy-aware Salmon transcriptome index. > **NB:** Loading iGenomes configuration remains the default for reasons of consistency with other workflows, but should be disabled when not using iGenomes, applying the recommended usage above. From 21eb5ad1edf3a691bdbf751b3a6b130fb5157a53 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 22 Jan 2025 16:45:43 +0000 Subject: [PATCH 24/24] prettier --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index c30ab0ae1..74fd92290 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -128,7 +128,7 @@ The `--aligner hisat2` option is not currently supported using ARM architecture By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e. `--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations. -You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. 
The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. +You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). 
Similarly, strandedness is taken from the sample sheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args` like `--extra_kallisto_quant_args '--fr-stranded'` (see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual)).