From 85a778f6534f802e17645c5fe9f41ebdc4742686 Mon Sep 17 00:00:00 2001
From: atpoint <atpoint90@gmail.com>
Date: Mon, 8 Jan 2024 15:36:10 +0100
Subject: [PATCH] remove indexing step

---
 .github/workflows/CI.yml |   4 -
 CHANGELOG.md             |   4 +
 README.md                |  31 ++----
 main.nf                  | 198 +++++++++++++++------------------------
 nextflow.config          |  29 +-----
 schema.nf                |  19 +---
 6 files changed, 94 insertions(+), 191 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 48da6f0..02c4d21 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -29,10 +29,6 @@ jobs:
       # Test via docker
       - name: TEST-DOCKER_ALL
         run: |
-          # building new index and exit
-          NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_new_idx,test_resources --only_idx
-
-          # using existing index
           NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_existing_idx,test_resources --only_fastqc
           NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_existing_idx,test_resources --skip_fastqc
           NXF_VER=21.10.6 nextflow run main.nf -profile docker,test_with_existing_idx,test_resources --skip_tximport
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e112aaa..68aaa24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## v2.6.0
+- remove the indexing step from the pipeline; a pre-made salmon index is now expected.
+- assume that a transcriptome-only or partial-decoy index is used, hence lower the memory for quant to 8GB
+
 ## v2.5.2
 - check in `tximport` process whether there is a mismatch between tx2gene file and quant.sf identifiers
 that can be solved by using either of the `ignoreTxVersion` or `ignoreAfterBar` arguments of `tximport()`,
diff --git a/README.md b/README.md
index e2de6f9..57574c3 100644
--- a/README.md
+++ b/README.md
@@ -21,28 +21,13 @@ See the [misc](misc/) folder which contains the software versions used in the pi
 
 **Indexing**
 
-The indexing step must be run first and separately using the `--only_idx` flag. For this we need a reference transcriptome (gzipped), a reference genome as decoy (gzipped) and a GTF annotation file (gzipped).
-
-`--only_idx`: trigger the indexing process  
-`--idx_name`: name of the produced index, default `idx`  
-`--idx_dir`: name of the directory inside `rnaseq_preprocess_results/` storing the index, default `salmon_idx`  
-`--idx_additional`: additional arguments to `salmon index` beyond the defaults which are `--no-version-check -t -d -i -p --gencode`  
-`--txtome`: path to the gzipped transcriptome fasta  
-`--genome`: path to the gzipped genome fasta  
-`--gtf`: path to the gzipped GTF file  
-`--transcript_id`: name of GTF column storing transcript ID, default `transcript_id`  
-`--transcript_name`: name of GTF column storing transcript name, default `transcript_name`  
-`--gene_id`: name of GTF column storing gene ID, default `gene_id`  
-`--gene_name`: name of GTF column storing gene name, default `gene_name`  
-`--gene_type`: name of GTF column storing gene biotype, default `gene_type`  
-
-For the indexing process, 30GB of RAM and 6 CPUs are required/hardcoded. On our HPC we use:  
+The pipeline does not cover the indexing step, as there are several ways to build a salmon index,
+for example indexing only the transcriptome without any genome decoys, or with partial or full genome decoys.
 
-```bash
-NXF_VER=21.10.6 nextflow run atpoint/rnaseq_preprocess -r main  -profile singularity,slurm --only_idx \
-    --genome path/to/genome.fa.gz --txtome path/to/txtome.fa.gz --gtf path/to/foo.gtf.gz \
-    -with-report indexing_report.html -with-trace indexing_report.trace -bg > indexing_report.log
-```    
+Please produce an index up front and then provide the output folder to the `--idx` option.
+
+The pipeline has a hardcoded 8GB memory limit for the quantification step, which should be sufficient for transcriptome-only and partial genome decoy indices.
+For a full genome decoy index, please raise the `withLabel:process_quant` memory definition in `nextflow.config` to something like 20GB, depending on the organism.
 
 **Quantification/tximport**
 
@@ -58,13 +43,13 @@ Transcript abundance estimates from `salmon` are then summarized to the gene lev
 Other options:
 
 `--idx`: path to the salmon index folder  
-`--tx2gene`: path to the tx2gene map matching transcripts to genes  
+`--tx2gene`: path to the tx2gene map matching transcripts to genes  
 `--samplesheet`: path to the input samplesheet  
 `--trim_reads`: logical, whether to trim reads to a fixed length  
 `--trim_length`: numeric, length for trimming  
 `--quant_additional`: additional options to `salmon quant` beyond `--gcBias --seqBias --posBias`  
 
-We hardcoded 30GB RAM and 6 CPUs for the quantification. On our HPC we use:
+We hardcoded 8GB RAM and 6 CPUs for the quantification. On our HPC we use:
 
 ```bash
 NXF_VER=21.10.6 nextflow run atpoint/rnaseq_preprocess -r main -profile singularity,slurm \
diff --git a/main.nf b/main.nf
index c9c5368..354d7ac 100644
--- a/main.nf
+++ b/main.nf
@@ -50,14 +50,10 @@ evaluate(new File("${baseDir}/functions/validate_schema_params.nf"))
 // Load the modules and pass params
 //------------------------------------------------------------------------
 
-include{ Idx } from './modules/index' addParams(outdir: params.idx_dir, additional: params.idx_additional)
-
 include{ ValidateSamplesheet } from './modules/validatesamplesheet'
 
 include{ CatFastq } from './modules/cat_fastq' addParams(outdir: params.merge_dir, keep: params.merge_keep)
-
-include { Tx2Gene } from './modules/tx2gene' addParams(outdir: params.idx_dir)
-                                                             
+                                                            
 include{ FastQC } from './modules/fastqc' addParams(outdir: params.fastqc_dir)
 
 include{ Trim } from './modules/trim' addParams(outdir: params.trim_dir, keep: params.trim_keep)
@@ -83,23 +79,6 @@ def ConvertBool2String(indata='') {
     }
 }
 
-workflow IDX {
-
-    main:
-        Idx(params.txtome, params.genome, params.idx_name)
-
-        Tx2Gene(params.gtf)
-
-        this_idx     = Idx.out.idx 
-        this_tx2gene = Tx2Gene.out.tx2gene
-            
-    emit:
-        idx      = this_idx
-        tx2gene  = this_tx2gene    
-        versions = Idx.out.versions.concat(Tx2Gene.out.versions)
-
-}
-
 workflow VALIDATESSAMPLESHEET {
 
     take: 
@@ -251,145 +230,124 @@ workflow MULTIQC {
 
 workflow RNASEQ_PREPROCESS {
 
-    if(params.only_idx==true) {
-        
-        IDX()
-        use_idx     = IDX.out.idx
-        use_tx2gene = IDX.out.tx2gene
-        idx_versions = IDX.out.versions
+    idx_versions = Channel.empty()
+    use_idx = params.idx
+    use_tx2gene = params.tx2gene
 
-        cat_versions = Channel.empty()
-        fastqc_versions = Channel.empty()
-        trim_versions = Channel.empty()
-        quant_versions = Channel.empty()
-        tximport_versions = Channel.empty()
-
-    } else {
-
-        idx_versions = Channel.empty()
-        use_idx = params.idx
-        use_tx2gene = params.tx2gene
-
-        // ----------------------------------------------------------------------------------------
-        // Validate the provided samplesheet and merge fastq if necessary
-        // ----------------------------------------------------------------------------------------
+    // ----------------------------------------------------------------------------------------
+    // Validate the provided samplesheet and merge fastq if necessary
+    // ----------------------------------------------------------------------------------------
 
-        sx = file(params.samplesheet, checkIfExists: true)
+    sx = file(params.samplesheet, checkIfExists: true)
         
-        VALIDATESSAMPLESHEET(params.samplesheet)
+    VALIDATESSAMPLESHEET(params.samplesheet)
 
-        // Samples with > 1 fastq per read
-        VALIDATESSAMPLESHEET.out.samplesheet
-        .map {meta, reads, counter -> 
-        
-            if(counter>1) [meta, reads]
+    // Samples with > 1 fastq per read
+    VALIDATESSAMPLESHEET.out.samplesheet
+    .map {meta, reads, counter -> 
+    
+        if(counter>1) [meta, reads]
 
-        }.set { ch_needMerge }
+    }.set { ch_needMerge }
 
-        CatFastq(ch_needMerge)
-        ch_merged = CatFastq.out.fastq_tuple
-        cat_versions = CatFastq.out.versions  
+    CatFastq(ch_needMerge)
+    ch_merged = CatFastq.out.fastq_tuple
+    cat_versions = CatFastq.out.versions  
 
-        // Samples with 1 fastq per read
-        VALIDATESSAMPLESHEET.out.samplesheet
-        .map {meta, reads, counter -> 
-            
-            if(counter==1) [meta, reads]
+    // Samples with 1 fastq per read
+    VALIDATESSAMPLESHEET.out.samplesheet
+    .map {meta, reads, counter -> 
+        
+        if(counter==1) [meta, reads]
 
-        }.set { ch_noMerge }
+    }.set { ch_noMerge }
 
-        // This channel is now [meta, reads] and can go in all downstream processes that require fastq
-        ch_fastq = ch_noMerge.concat(ch_merged)
+    // This channel is now [meta, reads] and can go in all downstream processes that require fastq
+    ch_fastq = ch_noMerge.concat(ch_merged)
 
-        // ----------------------------------------------------------------------------------------
-        // Fastqc
-        // ----------------------------------------------------------------------------------------
+    // ----------------------------------------------------------------------------------------
+    // Fastqc
+    // ----------------------------------------------------------------------------------------
 
-        if(!params.skip_fastqc){
+    if(!params.skip_fastqc){
 
-            FASTQC(ch_fastq)
-            fastqc_for_multiqc = FASTQC.out.fastqc
-            fastqc_versions = FASTQC.out.versions
+        FASTQC(ch_fastq)
+        fastqc_for_multiqc = FASTQC.out.fastqc
+        fastqc_versions = FASTQC.out.versions
 
-        } else {
+    } else {
 
-            fastqc_for_multiqc = Channel.empty()
-            fastqc_versions = Channel.empty()
+        fastqc_for_multiqc = Channel.empty()
+        fastqc_versions = Channel.empty()
 
-        }
+    }
 
-        // ----------------------------------------------------------------------------------------
-        // Trim
-        // ----------------------------------------------------------------------------------------
-        if(params.trim_reads & !params.only_fastqc){
+    // ----------------------------------------------------------------------------------------
+    // Trim
+    // ----------------------------------------------------------------------------------------
+    if(params.trim_reads & !params.only_fastqc){
 
-            TRIM(ch_fastq)
-            reads_for_quant = TRIM.out.fastq_tuple
-            trim_versions = TRIM.out.versions
+        TRIM(ch_fastq)
+        reads_for_quant = TRIM.out.fastq_tuple
+        trim_versions = TRIM.out.versions
 
-        } else {
+    } else {
 
-            reads_for_quant = ch_fastq
-            trim_versions = Channel.empty()
+        reads_for_quant = ch_fastq
+        trim_versions = Channel.empty()
 
-        }
+    }
 
-        // ----------------------------------------------------------------------------------------
-        // Quantification & Tximport
-        // ----------------------------------------------------------------------------------------
-        if(!params.only_fastqc) {
+    // ----------------------------------------------------------------------------------------
+    // Quantification & Tximport
+    // ----------------------------------------------------------------------------------------
+    if(!params.only_fastqc) {
             
-            QUANT(reads_for_quant, use_idx)
-            quant_for_multiqc = QUANT.out.quant
-            quant_versions = QUANT.out.versions
-
-            if(!params.skip_tximport){
-
-                TXIMPORT(QUANT.out.quant.collect(), use_tx2gene)
-                tximport_versions = TXIMPORT.out.versions
+        QUANT(reads_for_quant, use_idx)
+        quant_for_multiqc = QUANT.out.quant
+        quant_versions = QUANT.out.versions
 
-            } else {
+        if(!params.skip_tximport){
 
-                tximport_versions = Channel.empty()
-
-            }
+            TXIMPORT(QUANT.out.quant.collect(), use_tx2gene)
+            tximport_versions = TXIMPORT.out.versions
 
         } else {
 
-            quant_for_multiqc = Channel.empty()
-            quant_versions = Channel.empty()
             tximport_versions = Channel.empty()
 
         }
 
-        // ----------------------------------------------------------------------------------------
-        // MultiQC
-        // ----------------------------------------------------------------------------------------
+    } else {
+
+        quant_for_multiqc = Channel.empty()
+        quant_versions = Channel.empty()
+        tximport_versions = Channel.empty()
 
-        if(!params.skip_multiqc) { MULTIQC(fastqc_for_multiqc.concat(quant_for_multiqc).collect()) }
-       
     }
 
+    // ----------------------------------------------------------------------------------------
+    // MultiQC
+    // ----------------------------------------------------------------------------------------
+
+    if(!params.skip_multiqc) { MULTIQC(fastqc_for_multiqc.concat(quant_for_multiqc).collect()) }
+
     // ----------------------------------------------------------------------------------------
     // Command lines and software versions
     // ----------------------------------------------------------------------------------------
-    x_commands = idx_versions.concat(cat_versions, trim_versions, fastqc_versions, quant_versions, tximport_versions)
-                 .map {it [1]}.flatten().collect()
-
-    x_versions = idx_versions
-                 .concat(cat_versions.first(), 
-                         fastqc_versions.first(), 
-                         trim_versions.first(),
-                         quant_versions.first(),
-                         tximport_versions)
-                .map {it [0]}
-                .flatten()
-                .collect()
+    x_commands = cat_versions.concat(trim_versions, fastqc_versions, quant_versions, tximport_versions)
+                .map {it [1]}.flatten().collect()
 
-    CommandLines(x_commands, x_versions)
+    x_versions = tximport_versions.concat(cat_versions.first(), fastqc_versions.first(), trim_versions.first(), quant_versions.first())
+                 .map {it [0]}
+                 .flatten()
+                 .collect()
 
+    CommandLines(x_commands, x_versions)
+       
 }
 
+// RUN EVERYTHING
 workflow { RNASEQ_PREPROCESS() }
 
 def od = params.outdir
diff --git a/nextflow.config b/nextflow.config
index 79e4713..9572df8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -4,19 +4,9 @@ process {
 
    shell = ['/bin/bash', '-euo', 'pipefail']
   
-    withLabel:process_idx {
-        cpus   = { 6 }
-        memory = { 30.GB }
-    }
-
-    withLabel:process_tx2gene {
-        cpus   = { 1 }
-        memory = { 8.GB }
-    }
-
     withLabel:process_quant {
         cpus   = { 6 }
-        memory = { 30.GB }
+        memory = { 8.GB }
     }
 
     withLabel:process_tximport {
@@ -66,11 +56,6 @@ profiles {
 
         process {
   
-            withLabel:process_idx {
-                cpus   = { 1 }
-                memory = { 1.GB }
-            }
-
             withLabel:process_tx2gene {
                 cpus   = { 1 }
                 memory = { 1.GB }
@@ -93,17 +78,7 @@ profiles {
         }
     }
 
-    test_with_new_idx {
-
-        params.samplesheet = "$baseDir/test/samplesheet.csv"
-        params.txtome      = "$baseDir/test/txtome.fa.gz"
-        params.genome      = "$baseDir/test/genome.fa.gz"
-        params.gtf         = "$baseDir/test/annot.gtf.gz"
-        params.quant_additional = ''
-                
-    }
-
-    test_with_existing_idx {
+   test_with_existing_idx {
 
         params.samplesheet = "$baseDir/test/samplesheet.csv"
         params.idx         = "$baseDir/test/index/idx"
diff --git a/schema.nf b/schema.nf
index 361a2e3..236ce6c 100644
--- a/schema.nf
+++ b/schema.nf
@@ -17,23 +17,6 @@ schema.publishmode    = [value: 'copy', type: 'string', mandatory: true, allowed
 schema.outdir         = [value: "$launchDir/rnaseq_preprocess_results/", type: 'string', mandatory: true]
 schema.pipe_dir        = [value: "${schema.outdir['value']}/pipeline_info/", type: 'string']
 
-// indexing:
-schema.title2          = [title: 'INDEXING OPTIONS']
-schema.idx             = [value: '', type: 'string']
-schema.tx2gene         = [value: '', type: 'string']
-schema.txtome          = [value: '', type: 'string', pattern: /.*\.gz$/]
-schema.genome          = [value: '', type: 'string', pattern: /.*\.gz$/]
-schema.gtf             = [value: '', type: 'string', pattern: /.*\.gz$/]
-schema.transcript_id   = [value: 'transcript_id', type: 'string']
-schema.transcript_name = [value: 'transcript_name', type: 'string']
-schema.gene_id         = [value: 'gene_id', type: 'string']
-schema.gene_name       = [value: 'gene_name', type: 'string']
-schema.gene_type       = [value: 'gene_type', type: 'string']
-schema.idx_name        = [value: "idx", type: 'string']
-schema.idx_dir         = [value: "${schema.outdir['value']}/salmon_idx/", type: 'string']
-schema.idx_additional  = [value: '--gencode', type: 'string']
-schema.only_idx        = [value: false, type: 'logical']
-
 // combining technical replicates into a single fastq and trimming
 schema.title3            = [title: 'MERGE/FASTQ OPTIONS']
 schema.merge_dir         = [value: "${schema.outdir['value']}/fastq_merged/", type: 'string']
@@ -55,8 +38,10 @@ schema.only_fastqc        = [value: false, type: 'logical']
 // samplesheet and quantification
 schema.title5             = [title: 'QUANTIFICATION/TXIMPORT OPTIONS']
 schema.samplesheet        = [value: '', type: 'string', pattern: /.*\.csv$/]
+schema.idx                = [value: '', type: 'string', mandatory: true]
 schema.quant_dir          = [value: "${schema.outdir['value']}/salmon_quant/", type: 'string']
 schema.quant_additional   = [value: '--gcBias --seqBias --posBias', type: 'string']
+schema.tx2gene            = [value: '', type: 'string']
 schema.tximport_dir       = [value: "${schema.outdir['value']}/tximport/", type: 'string']
 schema.skip_tximport      = [value: false, type: 'logical']