From 85a778f6534f802e17645c5fe9f41ebdc4742686 Mon Sep 17 00:00:00 2001 From: atpoint Date: Mon, 8 Jan 2024 15:36:10 +0100 Subject: [PATCH] remove indexing step --- .github/workflows/CI.yml | 4 - CHANGELOG.md | 4 + README.md | 31 ++---- main.nf | 198 +++++++++++++++------------------------ nextflow.config | 29 +----- schema.nf | 19 +--- 6 files changed, 94 insertions(+), 191 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 48da6f0..02c4d21 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,10 +29,6 @@ jobs: # Test via docker - name: TEST-DOCKER_ALL run: | - # building new index and exit - NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_new_idx,test_resources --only_idx - - # using existing index NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_existing_idx,test_resources --only_fastqc NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_existing_idx,test_resources --skip_fastqc NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_existing_idx,test_resources --skip_tximport diff --git a/CHANGELOG.md b/CHANGELOG.md index e112aaa..68aaa24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## v2.6.0 +- remove indexing from pipeline, expect pre-made index. +- assume now that partial decoy index or txtome index is used, hence lower memory for quant to 8GB + ## v2.5.2 - check in `tximport` process whether there is a mismatch between tx2gene file and quant.sf identifiers that can be solved by using either of the `ignoreTxVersion` or `ignoreAfterBar` arguments of `tximport()`, diff --git a/README.md b/README.md index e2de6f9..57574c3 100644 --- a/README.md +++ b/README.md @@ -21,28 +21,13 @@ See the [misc](misc/) folder which contains the software versions used in the pi **Indexing** -The indexing step must be run first and separately using the `--only_idx` flag. For this we need a reference transcriptome (gzipped), a reference genome as decoy (gzipped) and a GTF annotation file (gzipped). - -`--only_idx`: trigger the indexing process -`--idx_name`: name of the produced index, default `idx` -`--idx_dir`: name of the directory inside `rnaseq_preprocess_results/` storing the index, default `salmon_idx` -`--idx_additional`: additional arguments to `salmon index` beyond the defaults which are `--no-version-check -t -d -i -p --gencode` -`--txtome`: path to the gzipped transcriptome fasta -`--genome`: path to the gzipped genome fasta -`--gtf`: path to the gzipped GTF file -`--transcript_id`: name of GTF column storing transcript ID, default `transcript_id` -`--transcript_name`: name of GTF column storing transcript name, default `transcript_name` -`--gene_id`: name of GTF column storing gene ID, default `gene_id` -`--gene_name`: name of GTF column storing gene name, default `gene_name` -`--gene_type`: name of GTF column storing gene biotype, default `gene_type` - -For the indexing process, 30GB of RAM and 6 CPUs are required/hardcoded. On our HPC we use: +The pipeline does not cover the indexing step as there are different sorts of salmon index methods available, +for example indexing only the transcriptome without any genome decoys, partial genome decoys and full genome decoys. -```bash -NXF_VER=21.10.6 nextflow run atpoint/rnaseq_preprocess -r main -profile singularity,slurm --only_idx \ - --genome path/to/genome.fa.gz --txtome path/to/txtome.fa.gz --gtf path/to/foo.gtf.gz \ - -with-report indexing_report.html -with-trace indexing_report.trace -bg > indexing_report.log -``` +Please produce an index up front and then provide the output folder to the `--idx` option. + +The pipeline has a hardcoded 8GB memory limit for the quantification step which should be sufficient for transcriptome-only and partial genome decoy indices. +For full genome decoy please modify the `withLabel:process_quant` memory definition in `nextflow.config` to something like 20GB depending on organism. **Quantification/tximport** @@ -58,13 +43,13 @@ Transcript abundance estimates from `salmon` are then summarized to the gene lev Other options: `--idx`: path to the salmon index folder -`--tx2gene`: path to the tx2gene map matching transcripts to genes +`--tx2gene`: path to the tx2gene map matching transcripts to genes `--samplesheet`: path to the input samplesheet `--trim_reads`: logical, whether to trim reads to a fixed length `--trim_length`: numeric, length for trimming `--quant_additional`: additional options to `salmon quant` beyond `--gcBias --seqBias --posBias` -We hardcoded 30GB RAM and 6 CPUs for the quantification. On our HPC we use: +We hardcoded 8GB RAM and 6 CPUs for the quantification. On our HPC we use: ```bash NXF_VER=21.10.6 nextflow run atpoint/rnaseq_preprocess -r main -profile singularity,slurm \ diff --git a/main.nf b/main.nf index c9c5368..354d7ac 100644 --- a/main.nf +++ b/main.nf @@ -50,14 +50,10 @@ evaluate(new File("${baseDir}/functions/validate_schema_params.nf")) // Load the modules and pass params //------------------------------------------------------------------------ -include{ Idx } from './modules/index' addParams(outdir: params.idx_dir, additional: params.idx_additional) - include{ ValidateSamplesheet } from './modules/validatesamplesheet' include{ CatFastq } from './modules/cat_fastq' addParams(outdir: params.merge_dir, keep: params.merge_keep) - -include { Tx2Gene } from './modules/tx2gene' addParams(outdir: params.idx_dir) - + include{ FastQC } from './modules/fastqc' addParams(outdir: params.fastqc_dir) include{ Trim } from './modules/trim' addParams(outdir: params.trim_dir, keep: params.trim_keep) @@ -83,23 +79,6 @@ def ConvertBool2String(indata='') { } } -workflow IDX { - - main: - Idx(params.txtome, params.genome, params.idx_name) - - Tx2Gene(params.gtf) - - this_idx = Idx.out.idx - this_tx2gene = Tx2Gene.out.tx2gene - - emit: - idx = this_idx - tx2gene = this_tx2gene - versions = Idx.out.versions.concat(Tx2Gene.out.versions) - -} - workflow VALIDATESSAMPLESHEET { take: @@ -251,145 +230,124 @@ workflow MULTIQC { workflow RNASEQ_PREPROCESS { - if(params.only_idx==true) { - - IDX() - use_idx = IDX.out.idx - use_tx2gene = IDX.out.tx2gene - idx_versions = IDX.out.versions + idx_versions = Channel.empty() + use_idx = params.idx + use_tx2gene = params.tx2gene - cat_versions = Channel.empty() - fastqc_versions = Channel.empty() - trim_versions = Channel.empty() - quant_versions = Channel.empty() - tximport_versions = Channel.empty() - - } else { - - idx_versions = Channel.empty() - use_idx = params.idx - use_tx2gene = params.tx2gene - - // ---------------------------------------------------------------------------------------- - // Validate the provided samplesheet and merge fastq if necessary - // ---------------------------------------------------------------------------------------- + // ---------------------------------------------------------------------------------------- + // Validate the provided samplesheet and merge fastq if necessary + // ---------------------------------------------------------------------------------------- - sx = file(params.samplesheet, checkIfExists: true) + sx = file(params.samplesheet, checkIfExists: true) - VALIDATESSAMPLESHEET(params.samplesheet) + VALIDATESSAMPLESHEET(params.samplesheet) - // Samples with > 1 fastq per read - VALIDATESSAMPLESHEET.out.samplesheet - .map {meta, reads, counter -> - - if(counter>1) [meta, reads] + // Samples with > 1 fastq per read + VALIDATESSAMPLESHEET.out.samplesheet + .map {meta, reads, counter -> + + if(counter>1) [meta, reads] - }.set { ch_needMerge } + }.set { ch_needMerge } - CatFastq(ch_needMerge) - ch_merged = CatFastq.out.fastq_tuple - cat_versions = CatFastq.out.versions + CatFastq(ch_needMerge) + ch_merged = CatFastq.out.fastq_tuple + cat_versions = CatFastq.out.versions - // Samples with 1 fastq per read - VALIDATESSAMPLESHEET.out.samplesheet - .map {meta, reads, counter -> - - if(counter==1) [meta, reads] + // Samples with 1 fastq per read + VALIDATESSAMPLESHEET.out.samplesheet + .map {meta, reads, counter -> + + if(counter==1) [meta, reads] - }.set { ch_noMerge } + }.set { ch_noMerge } - // This channel is now [meta, reads] and can go in all downstream processes that require fastq - ch_fastq = ch_noMerge.concat(ch_merged) + // This channel is now [meta, reads] and can go in all downstream processes that require fastq + ch_fastq = ch_noMerge.concat(ch_merged) - // ---------------------------------------------------------------------------------------- - // Fastqc - // ---------------------------------------------------------------------------------------- + // ---------------------------------------------------------------------------------------- + // Fastqc + // ---------------------------------------------------------------------------------------- - if(!params.skip_fastqc){ + if(!params.skip_fastqc){ - FASTQC(ch_fastq) - fastqc_for_multiqc = FASTQC.out.fastqc - fastqc_versions = FASTQC.out.versions + FASTQC(ch_fastq) + fastqc_for_multiqc = FASTQC.out.fastqc + fastqc_versions = FASTQC.out.versions - } else { + } else { - fastqc_for_multiqc = Channel.empty() - fastqc_versions = Channel.empty() + fastqc_for_multiqc = Channel.empty() + fastqc_versions = Channel.empty() - } + } - // ---------------------------------------------------------------------------------------- - // Trim - // ---------------------------------------------------------------------------------------- - if(params.trim_reads & !params.only_fastqc){ + // ---------------------------------------------------------------------------------------- + // Trim + // ---------------------------------------------------------------------------------------- + if(params.trim_reads & !params.only_fastqc){ - TRIM(ch_fastq) - reads_for_quant = TRIM.out.fastq_tuple - trim_versions = TRIM.out.versions + TRIM(ch_fastq) + reads_for_quant = TRIM.out.fastq_tuple + trim_versions = TRIM.out.versions - } else { + } else { - reads_for_quant = ch_fastq - trim_versions = Channel.empty() + reads_for_quant = ch_fastq + trim_versions = Channel.empty() - } + } - // ---------------------------------------------------------------------------------------- - // Quantification & Tximport - // ---------------------------------------------------------------------------------------- - if(!params.only_fastqc) { + // ---------------------------------------------------------------------------------------- + // Quantification & Tximport + // ---------------------------------------------------------------------------------------- + if(!params.only_fastqc) { - QUANT(reads_for_quant, use_idx) - quant_for_multiqc = QUANT.out.quant - quant_versions = QUANT.out.versions - - if(!params.skip_tximport){ - - TXIMPORT(QUANT.out.quant.collect(), use_tx2gene) - tximport_versions = TXIMPORT.out.versions + QUANT(reads_for_quant, use_idx) + quant_for_multiqc = QUANT.out.quant + quant_versions = QUANT.out.versions - } else { + if(!params.skip_tximport){ - tximport_versions = Channel.empty() - - } + TXIMPORT(QUANT.out.quant.collect(), use_tx2gene) + tximport_versions = TXIMPORT.out.versions } else { - quant_for_multiqc = Channel.empty() - quant_versions = Channel.empty() tximport_versions = Channel.empty() } - // ---------------------------------------------------------------------------------------- - // MultiQC - // ---------------------------------------------------------------------------------------- + } else { + + quant_for_multiqc = Channel.empty() + quant_versions = Channel.empty() + tximport_versions = Channel.empty() - if(!params.skip_multiqc) { MULTIQC(fastqc_for_multiqc.concat(quant_for_multiqc).collect()) } - } + // ---------------------------------------------------------------------------------------- + // MultiQC + // ---------------------------------------------------------------------------------------- + + if(!params.skip_multiqc) { MULTIQC(fastqc_for_multiqc.concat(quant_for_multiqc).collect()) } + // ---------------------------------------------------------------------------------------- // Command lines and software versions // ---------------------------------------------------------------------------------------- - x_commands = idx_versions.concat(cat_versions, trim_versions, fastqc_versions, quant_versions, tximport_versions) - .map {it [1]}.flatten().collect() - - x_versions = idx_versions - .concat(cat_versions.first(), - fastqc_versions.first(), - trim_versions.first(), - quant_versions.first(), - tximport_versions) - .map {it [0]} - .flatten() - .collect() + x_commands = cat_versions.concat(trim_versions, fastqc_versions, quant_versions, tximport_versions) + .map {it [1]}.flatten().collect() - CommandLines(x_commands, x_versions) + x_versions = tximport_versions.concat(cat_versions.first(), fastqc_versions.first(), trim_versions.first(), quant_versions.first()) + .map {it [0]} + .flatten() + .collect() + CommandLines(x_commands, x_versions) + } +// RUN EVERYTHING workflow { RNASEQ_PREPROCESS() } def od = params.outdir diff --git a/nextflow.config b/nextflow.config index 79e4713..9572df8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,19 +4,9 @@ process { shell = ['/bin/bash', '-euo', 'pipefail'] - withLabel:process_idx { - cpus = { 6 } - memory = { 30.GB } - } - - withLabel:process_tx2gene { - cpus = { 1 } - memory = { 8.GB } - } - withLabel:process_quant { cpus = { 6 } - memory = { 30.GB } + memory = { 8.GB } } withLabel:process_tximport { @@ -66,11 +56,6 @@ profiles { process { - withLabel:process_idx { - cpus = { 1 } - memory = { 1.GB } - } - withLabel:process_tx2gene { cpus = { 1 } memory = { 1.GB } @@ -93,17 +78,7 @@ profiles { } } - test_with_new_idx { - - params.samplesheet = "$baseDir/test/samplesheet.csv" - params.txtome = "$baseDir/test/txtome.fa.gz" - params.genome = "$baseDir/test/genome.fa.gz" - params.gtf = "$baseDir/test/annot.gtf.gz" - params.quant_additional = '' - - } - - test_with_existing_idx { + test_with_existing_idx { params.samplesheet = "$baseDir/test/samplesheet.csv" params.idx = "$baseDir/test/index/idx" diff --git a/schema.nf b/schema.nf index 361a2e3..236ce6c 100644 --- a/schema.nf +++ b/schema.nf @@ -17,23 +17,6 @@ schema.publishmode = [value: 'copy', type: 'string', mandatory: true, allowed schema.outdir = [value: "$launchDir/rnaseq_preprocess_results/", type: 'string', mandatory: true] schema.pipe_dir = [value: "${schema.outdir['value']}/pipeline_info/", type: 'string'] -// indexing: -schema.title2 = [title: 'INDEXING OPTIONS'] -schema.idx = [value: '', type: 'string'] -schema.tx2gene = [value: '', type: 'string'] -schema.txtome = [value: '', type: 'string', pattern: /.*\.gz$/] -schema.genome = [value: '', type: 'string', pattern: /.*\.gz$/] -schema.gtf = [value: '', type: 'string', pattern: /.*\.gz$/] -schema.transcript_id = [value: 'transcript_id', type: 'string'] -schema.transcript_name = [value: 'transcript_name', type: 'string'] -schema.gene_id = [value: 'gene_id', type: 'string'] -schema.gene_name = [value: 'gene_name', type: 'string'] -schema.gene_type = [value: 'gene_type', type: 'string'] -schema.idx_name = [value: "idx", type: 'string'] -schema.idx_dir = [value: "${schema.outdir['value']}/salmon_idx/", type: 'string'] -schema.idx_additional = [value: '--gencode', type: 'string'] -schema.only_idx = [value: false, type: 'logical'] - // combining technical replicates into a single fastq and trimming schema.title3 = [title: 'MERGE/FASTQ OPTIONS'] schema.merge_dir = [value: "${schema.outdir['value']}/fastq_merged/", type: 'string'] @@ -55,8 +38,10 @@ schema.only_fastqc = [value: false, type: 'logical'] // samplesheet and quantification schema.title5 = [title: 'QUANTIFICATION/TXIMPORT OPTIONS'] schema.samplesheet = [value: '', type: 'string', pattern: /.*\.csv$/] +schema.idx = [value: '', type: 'string', mandatory: true] schema.quant_dir = [value: "${schema.outdir['value']}/salmon_quant/", type: 'string'] schema.quant_additional = [value: '--gcBias --seqBias --posBias', type: 'string'] +schema.tx2gene = [value: '', type: 'string'] schema.tximport_dir = [value: "${schema.outdir['value']}/tximport/", type: 'string'] schema.skip_tximport = [value: false, type: 'logical']