Skip to content

Commit

Permalink
added difftable filtering module, added short docu section for gsea, …
Browse files Browse the repository at this point in the history
…renamed params
  • Loading branch information
WackerO committed Dec 8, 2023
1 parent 5250c22 commit 8519c11
Show file tree
Hide file tree
Showing 12 changed files with 313 additions and 231 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`

- [[#203](https://github.com/nf-core/differentialabundance/pull/203)] - Transcript lengths for DESeq2 ([@pinin4fjords](https://github.com/pinin4fjords), review by [@maxulysse](https://github.com/maxulysse))
- [[#199](https://github.com/nf-core/differentialabundance/pull/199)] - Add gprofiler2 module([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
- [[#199](https://github.com/nf-core/differentialabundance/pull/199)] - Add gprofiler2 module and local differential table filtering module ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
- [[#193](https://github.com/nf-core/differentialabundance/pull/193)] - Add DESeq2 text to report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
- [[#192](https://github.com/nf-core/differentialabundance/pull/192)] - Add scree plot in report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
- [[#189](https://github.com/nf-core/differentialabundance/pull/189)] - Add DE models to report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
Expand Down
45 changes: 20 additions & 25 deletions assets/differentialabundance_report.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ params:
report_author: NULL,
report_description: NULL,
report_scree: NULL
mods_gene_sets: NULL
mods_round_digits: NULL
gene_set_files: NULL
report_round_digits: NULL
observations_type: NULL
observations: NULL # GSE156533.samplesheet.csv
observations_id_col: NULL
Expand Down Expand Up @@ -894,7 +894,7 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
if (unlist(params[paste0(gene_set_method, '_run')])){
cat("\n#### ", toupper(gene_set_method) ," {.tabset}\n")
if (gene_set_method == 'gsea') {
for (gmt_file in simpleSplit(params$mods_gene_sets)) {
for (gmt_file in simpleSplit(params$gene_set_files)) {
gmt_name <- basename(tools::file_path_sans_ext(gmt_file))
cat("\n##### ", gmt_name ," {.tabset}\n")
Expand All @@ -911,32 +911,33 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
} else if (gene_set_method == 'gprofiler2') {
enrichment_files <- grep("gprofiler2", list.files(params$input_dir), value=T, fixed=T)
if (length(grep(".html", enrichment_files, fixed=T))) {
cat(paste0("\nThis section contains the results of the pathway analysis which was done with the R package gprofiler2. The plots show the -log10 adjusted p values of each pathway which was found to be enriched for differential genes (if necessary determined by converting feature IDs to gene IDs); if possible, pathways are grouped by their source database. The tables below give additional info for each pathway; the differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
tsv_files <- grep(".tsv", enrichment_files, fixed=T)
if (length(tsv_files)) {
cat(paste0("\nThis section contains the results tables of the pathway analysis which was done with the R package gprofiler2. The differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
ifelse(params$gprofiler2_significant, paste0(" Enrichment was only considered if significant, i.e. adjusted p-value <= ", params$gprofiler2_max_qval, "."), "Enrichment was also considered if not significant."), "\n"))
for (html in rev(grep(".html", enrichment_files, value=T, fixed=T))) {
contrast <- unlist(strsplit(html, "gprofiler2.", fixed=T))[2]
contrast <- unlist(strsplit(contrast, ".gostplot", fixed=T))[1]
if (! file.exists(file.path(params$input_dir, html))){
stop(paste("gprofiler2 gost plot", file.path(params$input_dir, html), "does not exist"))
for (i in 1:nrow(contrasts)) {
contrast_name <- paste(contrasts$variable[i], contrasts$reference[i], contrasts$target[i], sep = '_')
contrast_name <- paste('gprofiler2', contrast_name, sep = '.')
if (contrasts$blocking[i] != "") {
blocking_variables <- paste(make.names(unlist(strsplit(contrasts$blocking[i], split = ','))), collapse = '_')
contrast_name <- paste(contrast_name, blocking_variables, sep= '_')
}
cat(paste0("\n##### ", contrast, "\n"))
cat(paste0('<embed type="text/html" src="', file.path(params$input_dir, html), '" width="800" height="400">'))
table <- paste0("gprofiler2.", contrast, ".all_enriched_pathways.tsv")
table <- paste0(contrast_name, ".all_enriched_pathways.tsv")
if (! file.exists(file.path(params$input_dir, table))){
stop(paste("gprofiler2 enrichment table", file.path(params$input_dir, table), "does not exist"))
}
all_enriched <- read.table(file.path(params$input_dir, table), header=T, sep="\t", quote="\"")
all_enriched <- data.frame("Pathway name" = all_enriched$term_name, "Pathway code" = all_enriched$term_id,
"Differential features" = all_enriched$intersection_size, "Pathway size" = all_enriched$term_size,
"Differential fraction" = (all_enriched$intersection_size/all_enriched$term_size),
"Adjusted p value" = all_enriched$p_value, check.names = FALSE)
all_enriched <- round_dataframe_columns(all_enriched, digits=params$mods_round_digits)
print(htmltools::tagList(datatable(all_enriched, caption = paste('Enriched pathways in', contrast, " (check", table, "for more detail)"), rownames = FALSE)))
all_enriched <- round_dataframe_columns(all_enriched, digits=params$report_round_digits)
cat(paste0("\n##### ", contrasts$id[i], "\n"))
print(htmltools::tagList(datatable(all_enriched, caption = paste('Enriched pathways in', contrast_name, " (check", table, "for more detail)"), rownames = FALSE)))
cat("\n")
}
} else {
Expand Down Expand Up @@ -999,7 +1000,7 @@ make_params_table('downstream differential analysis', 'differential_', remove_pa
<!-- If any gene set methods have been activated show their params -->

```{r, echo=FALSE, results='asis'}
possible_gene_set_methods <- c('gsea')
possible_gene_set_methods <- c('gsea', 'gprofiler2')
if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
cat("\n### Gene set analysis\n")
Expand All @@ -1014,12 +1015,6 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
}
```

```{r, echo=FALSE, results='asis', eval=params$gprofiler2_run}
cat("\n### Pathway enrichment analysis\n")
cat("\n#### gprofiler2\n")
make_params_table("gprofiler2", 'gprofiler2_', remove_pattern = TRUE)
```

# Appendices

## All parameters
Expand Down
15 changes: 13 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ process {
"--plotsd_method $params.proteus_plotsd_method",
"--plotmv_loess $params.proteus_plotmv_loess",
"--palette_name $params.proteus_palette_name",
"--round_digits $params.mods_round_digits"
"--round_digits $params.report_round_digits"
].join(' ').trim() }
}

Expand Down Expand Up @@ -281,6 +281,17 @@ process {
].join(' ').trim() }
}

withName: FILTER_DIFFTABLE {
ext.prefix = { "${meta.id}" }
publishDir = [
[
path: { "${params.outdir}/tables/differential" },
mode: params.publish_dir_mode,
pattern: '*_filtered.tsv'
]
]
}

withName: GSEA_GSEA {
ext.prefix = { "${meta.id}.${gene_sets.baseName}." }
publishDir = [
Expand Down Expand Up @@ -347,7 +358,7 @@ process {
"--pval_threshold \"${params.gprofiler2_max_qval}\"",
"--domain_scope ${params.gprofiler2_domain_scope}",
"--min_diff \"${params.gprofiler2_min_diff}\"",
"--round_digits ${params.mods_round_digits}",
"--round_digits ${params.report_round_digits}",
"--palette_name \"${params.gprofiler2_palette_name}\"",
((meta.blocking == null) ? '' : "--blocking_variables $meta.blocking"),
((params.differential_feature_id_column == null) ? '' : "--de_id_column \"${params.differential_feature_id_column}\""),
Expand Down
1 change: 1 addition & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@ params {
// Activate gprofiler2
gprofiler2_run = true
gprofiler2_organism = 'mmusculus'
gene_set_files = '/home-link/iivow01/git/differentialabundance/testdata/combo_gprofiler_hallmark_mmusculus.gmt'
}
2 changes: 1 addition & 1 deletion conf/test_affy.config
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ params {

// Activate GSEA
gsea_run = true
mods_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt'
gene_set_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt'
}
4 changes: 2 additions & 2 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ params {
report_description = "This is a full-sized test dataset contributed by Oskar Wacker"

// Activate GSEA
gsea_run = true
mods_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt'
gsea_run = false
// gene_set_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt'

// Activate gprofiler2
gprofiler2_run = true
Expand Down
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ The organism (mmusculus for Mus musculus, hsapiens for Homo sapiens etc.) is req

```bash
--gsea_run true \
--mods_gene_sets gene_sets.gmt
--gene_set_files gene_sets.gmt
```

## Running the pipeline
Expand Down
61 changes: 61 additions & 0 deletions modules/local/filter_difftable.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
process FILTER_DIFFTABLE {

label 'process_single'

conda "conda-forge::gawk=5.1.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
'biocontainers/gawk:5.1.0' }"

input:
tuple val(meta), path(tsv)
tuple val(logFC_column), val(logFC_threshold)
tuple val(padj_column), val(padj_threshold)

output:
tuple val(meta), path("*_filtered.tsv") , emit: filtered
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def VERSION = '9.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
"""
function find_column_number {
file=\$1
column=\$2
head -n 1 \$file | tr '\\t' '\\n' | grep -n "^\${column}\$" | awk -F':' '{print \$1}'
}
# export the two threshold as otherwise, awk will for some reason interpret them incorrectly; padj_threshold 0.05 then becomes 0
export logFC_threshold
export padj_threshold
logFC_threshold=$logFC_threshold
padj_threshold=$padj_threshold
logFC_col=\$(find_column_number $tsv $logFC_column)
padj_col=\$(find_column_number $tsv $padj_column)
outfile=\$(echo $tsv | sed 's/\\(.*\\)\\..*/\\1/')_filtered.tsv
head -n 1 $tsv > \${outfile}.tmp
# The following snippet performs the following checks on each row (add +0.0 to the numbers so that they are definitely treated as numerics):
# 1. Check that the current logFC/padj is not NA
# 2. Check that the current logFC is >= threshold (abs does not work, so use a workaround)
# 3. Check that the current padj is <= threshold
# If this is true, the row is written to the new file, otherwise not
tail -n +2 $tsv | awk -F'\\t' -v logFC_col=\$logFC_col -v padj_col=\$padj_col '
\$logFC_col != "NA" && \$padj_col != "NA" && (\$logFC_col+0.0 >= 0 ? \$logFC_col+0.0 >= ENVIRON["logFC_threshold"]+0.0 : -(\$logFC_col+0.0) >= ENVIRON["logFC_threshold"]+0.0) && \$padj_col+0.0 <= ENVIRON["padj_threshold"]+0.0 {print}' >> \${outfile}.tmp
mv \${outfile}.tmp \${outfile}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bash: \$(echo \$(bash --version | grep -Eo 'version [[:alnum:].]+' | sed 's/version //'))
END_VERSIONS
"""
}
Loading

0 comments on commit 8519c11

Please sign in to comment.