added difftable filtering module, added short docu section for gsea, …

…renamed params
nf-core · Dec 8, 2023 · 8519c11 · 8519c11
1 parent 5250c22
commit 8519c11
Show file tree

Hide file tree

Showing 12 changed files with 313 additions and 231 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [[#203](https://github.com/nf-core/differentialabundance/pull/203)] - Transcript lengths for DESeq2 ([@pinin4fjords](https://github.com/pinin4fjords), review by [@maxulysse](https://github.com/maxulysse))
-- [[#199](https://github.com/nf-core/differentialabundance/pull/199)] - Add gprofiler2 module([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#199](https://github.com/nf-core/differentialabundance/pull/199)] - Add gprofiler2 module and local differential table filtering module ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#193](https://github.com/nf-core/differentialabundance/pull/193)] - Add DESeq2 text to report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#192](https://github.com/nf-core/differentialabundance/pull/192)] - Add scree plot in report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#189](https://github.com/nf-core/differentialabundance/pull/189)] - Add DE models to report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))

diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd
@@ -25,8 +25,8 @@ params:
   report_author: NULL,
   report_description: NULL,
   report_scree: NULL
-  mods_gene_sets: NULL
-  mods_round_digits: NULL
+  gene_set_files: NULL
+  report_round_digits: NULL
   observations_type: NULL
   observations: NULL                                          # GSE156533.samplesheet.csv
   observations_id_col: NULL
@@ -894,7 +894,7 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
     if (unlist(params[paste0(gene_set_method, '_run')])){
       cat("\n#### ", toupper(gene_set_method) ," {.tabset}\n")
       if (gene_set_method == 'gsea') {
-        for (gmt_file in simpleSplit(params$mods_gene_sets)) {
+        for (gmt_file in simpleSplit(params$gene_set_files)) {
           gmt_name <- basename(tools::file_path_sans_ext(gmt_file))
           cat("\n##### ", gmt_name ," {.tabset}\n")
 
@@ -911,32 +911,33 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
 
       } else if (gene_set_method == 'gprofiler2') {
         enrichment_files <- grep("gprofiler2", list.files(params$input_dir), value=T, fixed=T)
-        if (length(grep(".html", enrichment_files, fixed=T))) {
-          cat(paste0("\nThis section contains the results of the pathway analysis which was done with the R package gprofiler2. The plots show the -log10 adjusted p values of each pathway which was found to be enriched for differential genes (if necessary determined by converting feature IDs to gene IDs); if possible, pathways are grouped by their source database. The tables below give additional info for each pathway; the differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
+        tsv_files <- grep(".tsv", enrichment_files, fixed=T)
+        if (length(tsv_files)) {
+
+          cat(paste0("\nThis section contains the results tables of the pathway analysis which was done with the R package gprofiler2. The differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
           ifelse(params$gprofiler2_significant, paste0(" Enrichment was only considered if significant, i.e. adjusted p-value <= ", params$gprofiler2_max_qval, "."), "Enrichment was also considered if not significant."), "\n"))
 
-          for (html in rev(grep(".html", enrichment_files, value=T, fixed=T))) {
-            contrast <- unlist(strsplit(html, "gprofiler2.", fixed=T))[2]
-            contrast <- unlist(strsplit(contrast, ".gostplot", fixed=T))[1]
-            if (! file.exists(file.path(params$input_dir, html))){
-              stop(paste("gprofiler2 gost plot", file.path(params$input_dir, html), "does not exist"))
+          for (i in 1:nrow(contrasts)) {
+            contrast_name <- paste(contrasts$variable[i], contrasts$reference[i], contrasts$target[i], sep = '_')
+            contrast_name <- paste('gprofiler2', contrast_name, sep = '.')
+            if (contrasts$blocking[i] != "") {
+                blocking_variables <- paste(make.names(unlist(strsplit(contrasts$blocking[i], split = ','))), collapse = '_')
+                contrast_name <- paste(contrast_name, blocking_variables, sep= '_')
             }
-
-            cat(paste0("\n##### ", contrast, "\n"))
-            cat(paste0('<embed type="text/html" src="', file.path(params$input_dir, html), '" width="800" height="400">'))
-
-            table <- paste0("gprofiler2.", contrast, ".all_enriched_pathways.tsv")
-
+            table <- paste0(contrast_name, ".all_enriched_pathways.tsv")
             if (! file.exists(file.path(params$input_dir, table))){
               stop(paste("gprofiler2 enrichment table", file.path(params$input_dir, table), "does not exist"))
             }
+
             all_enriched <- read.table(file.path(params$input_dir, table), header=T, sep="\t", quote="\"")
             all_enriched <- data.frame("Pathway name" = all_enriched$term_name, "Pathway code" = all_enriched$term_id,
               "Differential features" = all_enriched$intersection_size, "Pathway size" = all_enriched$term_size,
               "Differential fraction" = (all_enriched$intersection_size/all_enriched$term_size),
               "Adjusted p value" = all_enriched$p_value, check.names = FALSE)
-            all_enriched <- round_dataframe_columns(all_enriched, digits=params$mods_round_digits)
-            print(htmltools::tagList(datatable(all_enriched, caption = paste('Enriched pathways in', contrast, " (check", table, "for more detail)"), rownames = FALSE)))
+            all_enriched <- round_dataframe_columns(all_enriched, digits=params$report_round_digits)
+
+            cat(paste0("\n##### ", contrasts$id[i], "\n"))            
+            print(htmltools::tagList(datatable(all_enriched, caption = paste('Enriched pathways in', contrast_name, " (check", table, "for more detail)"), rownames = FALSE)))
             cat("\n")
           }
         } else {
@@ -999,7 +1000,7 @@ make_params_table('downstream differential analysis', 'differential_', remove_pa
 <!-- If any gene set methods have been activated show their params -->
 
 ```{r, echo=FALSE, results='asis'}
-possible_gene_set_methods <- c('gsea')
+possible_gene_set_methods <- c('gsea', 'gprofiler2')
 
 if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
   cat("\n### Gene set analysis\n")
@@ -1014,12 +1015,6 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
 }
 ```
 
-```{r, echo=FALSE, results='asis', eval=params$gprofiler2_run}
-cat("\n### Pathway enrichment analysis\n")
-cat("\n#### gprofiler2\n")
-make_params_table("gprofiler2", 'gprofiler2_', remove_pattern = TRUE)
-```
-
 # Appendices
 
 ## All parameters

diff --git a/conf/modules.config b/conf/modules.config
@@ -133,7 +133,7 @@ process {
             "--plotsd_method $params.proteus_plotsd_method",
             "--plotmv_loess $params.proteus_plotmv_loess",
             "--palette_name $params.proteus_palette_name",
-            "--round_digits $params.mods_round_digits"
+            "--round_digits $params.report_round_digits"
         ].join(' ').trim() }
     }
 
@@ -281,6 +281,17 @@ process {
         ].join(' ').trim() }
     }
 
+    withName: FILTER_DIFFTABLE {
+        ext.prefix = { "${meta.id}" }
+        publishDir = [
+            [
+                path: { "${params.outdir}/tables/differential" },
+                mode: params.publish_dir_mode,
+                pattern: '*_filtered.tsv'
+            ]
+        ]
+    }
+
     withName: GSEA_GSEA {
         ext.prefix = { "${meta.id}.${gene_sets.baseName}." }
         publishDir = [
@@ -347,7 +358,7 @@ process {
             "--pval_threshold \"${params.gprofiler2_max_qval}\"",
             "--domain_scope ${params.gprofiler2_domain_scope}",
             "--min_diff \"${params.gprofiler2_min_diff}\"",
-            "--round_digits ${params.mods_round_digits}",
+            "--round_digits ${params.report_round_digits}",
             "--palette_name \"${params.gprofiler2_palette_name}\"",
             ((meta.blocking == null) ? '' : "--blocking_variables $meta.blocking"),
             ((params.differential_feature_id_column == null) ? '' : "--de_id_column \"${params.differential_feature_id_column}\""),

diff --git a/conf/test.config b/conf/test.config
@@ -48,4 +48,5 @@ params {
     // Activate gprofiler2
     gprofiler2_run = true
     gprofiler2_organism = 'mmusculus'
+    gene_set_files = '/home-link/iivow01/git/differentialabundance/testdata/combo_gprofiler_hallmark_mmusculus.gmt'
 }
diff --git a/conf/test_affy.config b/conf/test_affy.config
@@ -42,5 +42,5 @@ params {
 
     // Activate GSEA
     gsea_run = true
-    mods_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt'
+    gene_set_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt'
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -33,8 +33,8 @@ params {
     report_description = "This is a full-sized test dataset contributed by Oskar Wacker"
 
     // Activate GSEA
-    gsea_run = true
-    mods_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt'
+    gsea_run = false
+//    gene_set_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt'
 
     // Activate gprofiler2
     gprofiler2_run = true

diff --git a/docs/usage.md b/docs/usage.md
@@ -290,7 +290,7 @@ The organism (mmusculus for Mus musculus, hsapiens for Homo sapiens etc.) is req
 
 ```bash
 --gsea_run true \
---mods_gene_sets gene_sets.gmt
+--gene_set_files gene_sets.gmt
 ```
 
 ## Running the pipeline

diff --git a/modules/local/filter_difftable.nf b/modules/local/filter_difftable.nf
@@ -0,0 +1,61 @@
+process FILTER_DIFFTABLE {
+
+    label 'process_single'
+
+    conda "conda-forge::gawk=5.1.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
+        'biocontainers/gawk:5.1.0' }"
+
+    input:
+    tuple val(meta), path(tsv)
+    tuple val(logFC_column), val(logFC_threshold)
+    tuple val(padj_column), val(padj_threshold)
+
+    output:
+    tuple val(meta), path("*_filtered.tsv") , emit: filtered
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def VERSION = '9.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    """
+    function find_column_number {
+        file=\$1
+        column=\$2
+
+        head -n 1 \$file | tr '\\t' '\\n' | grep -n "^\${column}\$" | awk -F':' '{print \$1}'
+    }
+
+    # export the two threshold as otherwise, awk will for some reason interpret them incorrectly; padj_threshold 0.05 then becomes 0
+    export logFC_threshold
+    export padj_threshold
+
+
+    logFC_threshold=$logFC_threshold
+    padj_threshold=$padj_threshold
+
+    logFC_col=\$(find_column_number $tsv $logFC_column)
+    padj_col=\$(find_column_number $tsv $padj_column)
+    outfile=\$(echo $tsv | sed 's/\\(.*\\)\\..*/\\1/')_filtered.tsv
+
+    head -n 1 $tsv > \${outfile}.tmp
+
+    # The following snippet performs the following checks on each row (add +0.0 to the numbers so that they are definitely treated as numerics):
+    # 1. Check that the current logFC/padj is not NA
+    # 2. Check that the current logFC is >= threshold (abs does not work, so use a workaround)
+    # 3. Check that the current padj is <= threshold
+    # If this is true, the row is written to the new file, otherwise not
+    tail -n +2 $tsv | awk -F'\\t' -v logFC_col=\$logFC_col -v padj_col=\$padj_col '
+    \$logFC_col != "NA" && \$padj_col != "NA" && (\$logFC_col+0.0 >= 0 ? \$logFC_col+0.0 >= ENVIRON["logFC_threshold"]+0.0 : -(\$logFC_col+0.0) >= ENVIRON["logFC_threshold"]+0.0) && \$padj_col+0.0 <= ENVIRON["padj_threshold"]+0.0 {print}' >> \${outfile}.tmp
+
+    mv \${outfile}.tmp \${outfile}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(echo \$(bash --version | grep -Eo 'version [[:alnum:].]+' | sed 's/version //'))
+    END_VERSIONS
+    """
+}