Param renames, adapted workflow to do the correct checks; fixed a bug…

… in the gprofiler2 module. It is now the same as that in my nf-core/modules PR but I still need to 'officially' install the updated module once the modules PR is merged
nf-core · Dec 8, 2023 · a1fcc1c · a1fcc1c
1 parent 8519c11
commit a1fcc1c
Show file tree

Hide file tree

Showing 11 changed files with 50 additions and 49 deletions.
diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd
@@ -25,7 +25,7 @@ params:
   report_author: NULL,
   report_description: NULL,
   report_scree: NULL
-  gene_set_files: NULL
+  gene_sets_files: NULL
   report_round_digits: NULL
   observations_type: NULL
   observations: NULL                                          # GSE156533.samplesheet.csv
@@ -145,7 +145,7 @@ params:
   gprofiler2_sources: NULL
   gprofiler2_evcodes: NULL
   gprofiler2_max_qval: NULL
-  gprofiler2_gost_token: NULL
+  gprofiler2_token: NULL
   gprofiler2_background_file: NULL
   gprofiler2_background_column: NULL
   gprofiler2_domain_scope: NULL
@@ -894,7 +894,7 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
     if (unlist(params[paste0(gene_set_method, '_run')])){
       cat("\n#### ", toupper(gene_set_method) ," {.tabset}\n")
       if (gene_set_method == 'gsea') {
-        for (gmt_file in simpleSplit(params$gene_set_files)) {
+        for (gmt_file in simpleSplit(params$gene_sets_files)) {
           gmt_name <- basename(tools::file_path_sans_ext(gmt_file))
           cat("\n##### ", gmt_name ," {.tabset}\n")
 
@@ -911,8 +911,10 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
 
       } else if (gene_set_method == 'gprofiler2') {
         enrichment_files <- grep("gprofiler2", list.files(params$input_dir), value=T, fixed=T)
-        tsv_files <- grep(".tsv", enrichment_files, fixed=T)
-        if (length(tsv_files)) {
+        tsv_files <- grep("all_enriched_pathways.tsv", enrichment_files, value=TRUE, fixed=TRUE)
+
+        # Render only if at least one results table is non-empty; list.files() returned bare names, so resolve them against params$input_dir (unreadable files give NA and are ignored)
+        if (length(tsv_files) > 0 && any(file.size(file.path(params$input_dir, tsv_files)) > 0, na.rm=TRUE)) {
 
           cat(paste0("\nThis section contains the results tables of the pathway analysis which was done with the R package gprofiler2. The differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
           ifelse(params$gprofiler2_significant, paste0(" Enrichment was only considered if significant, i.e. adjusted p-value <= ", params$gprofiler2_max_qval, "."), "Enrichment was also considered if not significant."), "\n"))

diff --git a/conf/modules.config b/conf/modules.config
@@ -362,7 +362,8 @@ process {
             "--palette_name \"${params.gprofiler2_palette_name}\"",
             ((meta.blocking == null) ? '' : "--blocking_variables $meta.blocking"),
             ((params.differential_feature_id_column == null) ? '' : "--de_id_column \"${params.differential_feature_id_column}\""),
-            ((params.gprofiler2_gost_token == null) ? '' : "--gost_token \"${params.gprofiler2_gost_token}\""),
+            ((params.gprofiler2_token == null) ? '' : "--token \"${params.gprofiler2_token}\""),
+            ((params.gprofiler2_organism == null) ? '' : "--organism \"${params.gprofiler2_organism}\""),
             ((params.gprofiler2_background_column == null) ? '' : "--background_column \"${params.gprofiler2_background_column}\""),
             ((params.gprofiler2_sources == null) ? '' : "--sources \"${params.gprofiler2_sources}\"")
         ].join(' ').trim() }

diff --git a/conf/test.config b/conf/test.config
@@ -47,6 +47,4 @@ params {
 
     // Activate gprofiler2
     gprofiler2_run = true
-    gprofiler2_organism = 'mmusculus'
-    gene_set_files = '/home-link/iivow01/git/differentialabundance/testdata/combo_gprofiler_hallmark_mmusculus.gmt'
 }
diff --git a/conf/test_affy.config b/conf/test_affy.config
@@ -42,5 +42,5 @@ params {
 
     // Activate GSEA
     gsea_run = true
-    gene_set_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt'
+    gene_sets_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt'
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -33,11 +33,6 @@ params {
     report_description = "This is a full-sized test dataset contributed by Oskar Wacker"
 
     // Activate GSEA
-    gsea_run = false
-//    gene_set_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt'
-
-    // Activate gprofiler2
-    gprofiler2_run = true
-    gprofiler2_organism = 'mmusculus'
-    gprofiler2_sources = 'KEGG,REAC'
+    gsea_run = true
+    gene_sets_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt'
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -290,7 +290,7 @@ The organism (mmusculus for Mus musculus, hsapiens for Homo sapiens etc.) is req
 
 ```bash
 --gsea_run true \
---gene_set_files gene_sets.gmt
+--gene_sets_files gene_sets.gmt
 ```
 
 ## Running the pipeline

diff --git a/modules/nf-core/gprofiler2/gost/main.nf b/modules/nf-core/gprofiler2/gost/main.nf
diff --git a/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R b/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R
diff --git a/nextflow.config b/nextflow.config
@@ -164,7 +164,7 @@ params {
     gprofiler2_sources                      = null
     gprofiler2_evcodes                      = false
     gprofiler2_max_qval                     = 0.05
-    gprofiler2_gost_token                   = null
+    gprofiler2_token                        = null
     gprofiler2_background_file              = null
     gprofiler2_background_column            = null
     gprofiler2_domain_scope                 = 'annotated'
@@ -183,7 +183,7 @@ params {
     shinyngs_shinyapps_app_name     = null
 
     // Gene set options
-    gene_set_files                  = null
+    gene_sets_files                 = null
 
     // References
     genome                     = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -868,7 +868,7 @@
                 "gprofiler2_organism": {
                     "type": "string",
                     "description": "Short name of the organism that is analyzed, e.g. hsapiens for homo sapiens.",
-                    "help_text": "Set this to the short organism name consisting of the first letter of the genus and the full species name, e.g. hsapiens for Homo sapiens, mmusculus for Mus musculus."
+                    "help_text": "Set this to the short organism name consisting of the first letter of the genus and the full species name, e.g. hsapiens for Homo sapiens, mmusculus for Mus musculus. This has lowest priority and will be overridden by --gprofiler2_token and --gene_sets_files."
                 },
                 "gprofiler2_significant": {
                     "type": "boolean",
@@ -890,8 +890,8 @@
                 },
                 "gprofiler2_sources": {
                     "type": "string",
-                    "description": "On which source databases to run the gprofiler query.",
-                    "help_text": "GO, GO:MF, GO:BP, GO:CC, KEGG, REAC, WP, TF, MIRNA, HPA, CORUM, HP, or any comma-reparated combination thereof, e.g. 'KEGG,REAC'."
+                    "description": "On which source databases to run the gprofiler query.",
+                    "help_text": "GO, GO:MF, GO:BP, GO:CC, KEGG, REAC, WP, TF, MIRNA, HPA, CORUM, HP, or any comma-separated combination thereof, e.g. 'KEGG,REAC'. This works when --gprofiler2_organism is used. It also works when a GMT file is provided with --gene_sets_files; the module will then remove any lines not starting with one of the source names. It does not work with --gprofiler2_token, as g:Profiler will not filter such a run."
                 },
                 "gprofiler2_evcodes": {
                     "type": "boolean",
@@ -904,10 +904,10 @@
                     "default": 0.05,
                     "description": "Maximum q value used for significance testing."
                 },
-                "gprofiler2_gost_token": {
+                "gprofiler2_token": {
                     "type": "string",
                     "description": "Token that should be used as a query.",
-                    "help_text": "For reproducibility, instead of querying the online databases, you can provide a token, e.g. from a previous pipeline run or from a manual query on https://biit.cs.ut.ee/gprofiler/gost."
+                    "help_text": "For reproducibility, instead of querying the online databases, you can provide a token, e.g. from a previous pipeline run or from a manual query on https://biit.cs.ut.ee/gprofiler/gost. This has highest priority and will override --gprofiler2_organism and --gene_sets_files."
                 },
                 "gprofiler2_background_file": {
                     "type": "string",
@@ -986,10 +986,10 @@
             "fa_icon": "fas fa-cogs",
             "description": "Files and options used by gene set analysis modules.",
             "properties": {
-                "gene_set_files": {
+                "gene_sets_files": {
                     "type": "string",
                     "default": "None",
-                    "description": "Gene sets in GMT or GMX-format; for GSEA: multiple comma-separated input files are possible. For gprofiler2: One GMT file is possible.",
+                    "description": "Gene sets in GMT or GMX-format; for GSEA: multiple comma-separated input files in either format are possible. For gprofiler2: A single file in GMT format is possible; this has second highest priority and will override --gprofiler2_organism.",
                     "fa_icon": "fas fa-bars"
                 }
             }

diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf
@@ -34,6 +34,9 @@ if (params.study_type == 'affy_array'){
         if (params.gsea_run) {
             error("Cannot run GSEA for maxquant data; please set --gsea_run to false.")
         }
+        if (params.gprofiler2_run){
+            error("gprofiler2 pathway analysis is not yet possible with maxquant input data; please set --gprofiler2_run false and rerun pipeline!")
+        }
         if (!params.matrix) {
             error("Input matrix not specified!")
         }
@@ -67,19 +70,17 @@ if (params.study_type == 'affy_array'){
 if (params.transcript_length_matrix) { ch_transcript_lengths = Channel.of([ exp_meta, file(params.transcript_length_matrix, checkIfExists: true)]).first() } else { ch_transcript_lengths = [[],[]] }
 if (params.control_features) { ch_control_features = Channel.of([ exp_meta, file(params.control_features, checkIfExists: true)]).first() } else { ch_control_features = [[],[]] }
 if (params.gsea_run) {
-    if (params.gene_set_files){
-        gene_set_files = params.gene_set_files.split(",")
-        ch_gene_sets = Channel.of(gene_set_files).map { file(it, checkIfExists: true) }
+    if (params.gene_sets_files){
+        gene_sets_files = params.gene_sets_files.split(",")
+        ch_gene_sets = Channel.of(gene_sets_files).map { file(it, checkIfExists: true) }
     } else {
         error("GSEA activated but gene set file not specified!")
     }
 }
 if (params.gprofiler2_run) {
-    if (params.study_type == 'maxquant'){
-        error("gprofiler2 pathway analysis is not yet possible with maxquant input data; please set --gprofiler2_run false and rerun pipeline!")
-    }
-    if (!params.gprofiler2_organism){
-        error("gprofiler2 pathway analysis activated but organism not specified!")
+    // gprofiler2 needs at least one gene set source: a run token, a GMT file or an organism
+    if (!params.gprofiler2_token && !params.gene_sets_files && !params.gprofiler2_organism){
+        error("To run gprofiler2, please provide a run token, GMT file or organism!")
     }
 }
 
@@ -478,7 +479,7 @@ workflow DIFFERENTIALABUNDANCE {
 
         // For gprofiler2, use only features that are considered differential
         ch_filtered_diff = FILTER_DIFFTABLE.out.filtered
-        ch_organism = Channel.value(params.gprofiler2_organism)
+
         if (!params.gprofiler2_background_file) {
             // If param not set, use empty list as "background"
             ch_background = []
@@ -492,16 +493,15 @@ workflow DIFFERENTIALABUNDANCE {
         } else {
             ch_background = Channel.from(file(params.gprofiler2_background_file, checkIfExists: true))
         }
-        if (!params.gene_set_files) {
+        if (!params.gene_sets_files) {
             ch_gene_sets = []
         } else {
-            ch_gene_sets = Channel.value(params.gene_set_files)
+            ch_gene_sets = Channel.value(params.gene_sets_files)
         }
 
         GPROFILER2_GOST(
             ch_contrasts,
             ch_filtered_diff,
-            ch_organism,
             ch_gene_sets,
             ch_background
         )