From 3268ac056f9d414f2c9c8b1af457fec52b4cb6cd Mon Sep 17 00:00:00 2001 From: Alexey Sergushichev Date: Mon, 5 Aug 2024 11:06:33 -0500 Subject: [PATCH] check for empty strings in gene names, fixes #149 --- DESCRIPTION | 2 +- NEWS | 3 ++ R/fgsea.R | 36 ++--------------------- R/fgseaORA.R | 25 ++++------------ R/geseca-utils.R | 13 ++------- R/util.R | 45 +++++++++++++++++++++++++++++ tests/testthat/test_gsea_analysis.R | 4 +++ 7 files changed, 63 insertions(+), 65 deletions(-) create mode 100644 R/util.R diff --git a/DESCRIPTION b/DESCRIPTION index fe06714..3f3dcb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: fgsea Title: Fast Gene Set Enrichment Analysis -Version: 1.31.2 +Version: 1.31.3 Authors@R: c(person("Gennady", "Korotkevich", role = "aut"), person("Vladimir", "Sukhov", role = "aut"), person("Nikolay", "Budin", role = "ctb"), diff --git a/NEWS b/NEWS index 3ca2aa2..38ab4c3 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +CHANGES IN VERSION 1.31.3 +* check for empty strings in gene names + CHANGES IN VERSION 1.31.2 * fora() provides fold enrichment scores for more effective prioritisation of results diff --git a/R/fgsea.R b/R/fgsea.R index 7bd512b..7865541 100644 --- a/R/fgsea.R +++ b/R/fgsea.R @@ -39,26 +39,6 @@ fgsea <- function(pathways, stats, minSize = 1, maxSize = length(stats)-1, gseaP preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam, scoreType){ - # Error if pathways is not a list - if (!is.list(pathways)) { - stop("pathways should be a list with each element containing names of the stats argument") - } - - # Error if stats is not named - if (is.null(names(stats))) { - stop("stats should be named") - } - - # Error if stats names are NA - if (any(is.na(names(stats)))) { - stop("NAs in names(stats) are not allowed") - } - - # Error for duplicate gene names - if (any(duplicated(names(stats)))) { - stop("Duplicate names(stats) are not allowed") - } - # Error if stats are non-finite if (any(!is.finite(stats))){ stop("Not all stats values are finite numbers") @@ -81,21 +61,11 @@ preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam stats <- sort(stats, decreasing=TRUE) stats <- abs(stats) ^ gseaParam + res <- preparePathways(pathways, universe=names(stats), minSize, maxSize) - minSize <- max(minSize, 1) - maxSize <- min(maxSize, length(stats)-1) + res$stats <- stats - pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, names(stats)))) }) - pathwaysSizes <- sapply(pathwaysFiltered, length) - - toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize) - - pathwaysFiltered <- pathwaysFiltered[toKeep] - pathwaysSizes <- pathwaysSizes[toKeep] - - list(filtered=pathwaysFiltered, - sizes=pathwaysSizes, - stats=stats) + res } diff --git a/R/fgseaORA.R b/R/fgseaORA.R index c80339a..3376e00 100644 --- a/R/fgseaORA.R +++ b/R/fgseaORA.R @@ -22,25 +22,12 @@ #' data(exampleRanks) #' foraRes <- fora(examplePathways, genes=tail(names(exampleRanks), 200), universe=names(exampleRanks)) fora <- function(pathways, genes, universe, minSize=1, maxSize=length(universe)-1) { - # Error if pathways is not a list - if (!is.list(pathways)) { - stop("pathways should be a list with each element containing genes from the universe") - } - - # Warning message for duplicate gene names - if (any(duplicated(universe))) { - warning("There were duplicate genes in universe, they were collapsed") - universe <- unique(universe) - } - - minSize <- max(minSize, 1) + pp <- preparePathways(pathways, universe, minSize, maxSize) + pathwaysFiltered <- pp$filtered + pathwaysSizes <- pp$sizes - pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, universe))) }) - pathwaysSizes <- sapply(pathwaysFiltered, length) - toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize) - - if (length(toKeep) == 0){ + if (length(pathwaysFiltered) == 0){ return(data.table(pathway=character(), pval=numeric(), padj=numeric(), @@ -50,13 +37,11 @@ fora <- function(pathways, genes, universe, minSize=1, maxSize=length(universe)- overlapGenes=list())) } - pathwaysFiltered <- pathwaysFiltered[toKeep] - pathwaysSizes <- pathwaysSizes[toKeep] - if (!all(genes %in% universe)) { warning("Not all of the input genes belong to the universe, such genes were removed") } + genesFiltered <- unique(na.omit(fmatch(genes, universe))) overlaps <- lapply(pathwaysFiltered, intersect, genesFiltered) diff --git a/R/geseca-utils.R b/R/geseca-utils.R index e9c21fa..8880fc8 100644 --- a/R/geseca-utils.R +++ b/R/geseca-utils.R @@ -26,18 +26,9 @@ checkGesecaArgs <- function(E, pathways){ } gesecaPreparePathways <- function(E, pathways, minSize, maxSize){ - minSize <- max(minSize, 1) - maxSize <- min(nrow(E) - 1, maxSize) + res <- preparePathways(pathways, universe = rownames(E), minSize, maxSize) - pathwaysFiltered <- lapply(pathways, function(p) {unique(na.omit(fmatch(p, rownames(E))))}) - pathwaysSizes <- sapply(pathwaysFiltered, length) - - toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize) - pathwaysFiltered <- pathwaysFiltered[toKeep] - pathwaysSizes <- pathwaysSizes[toKeep] - - return(list(filtered=pathwaysFiltered, - sizes=pathwaysSizes)) + return(res) } diff --git a/R/util.R b/R/util.R new file mode 100644 index 0000000..140dda1 --- /dev/null +++ b/R/util.R @@ -0,0 +1,45 @@ +preparePathways <- function(pathways, universe, minSize, maxSize){ + universeArg <- deparse(match.call()$universe) + # Error if pathways is not a list + if (!is.list(pathways)) { + stop("pathways should be a list with each element containing gene identifiers") + } + + # Error if stats is not named + if (is.null(universe)) { + .stopf("%s should not be null", universeArg) + } + + # Error if stats names are NA + if (any(is.na(universe))) { + .stopf("NAs in %s are not allowed", universeArg) + } + + # Error if stats names are empty string + if (any(universe == "")) { + .stopf("Empty strings are not allowed in %s", universeArg) + } + + # Error for duplicate gene names + if (any(duplicated(universe))) { + .stopf("Duplicate values in %s not allowed", universeArg) + } + + minSize <- max(minSize, 1) + maxSize <- min(maxSize, length(universe)-1) + + pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, universe))) }) + pathwaysSizes <- lengths(pathwaysFiltered) + + toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize) + + pathwaysFiltered <- pathwaysFiltered[toKeep] + pathwaysSizes <- pathwaysSizes[toKeep] + + list(filtered=pathwaysFiltered, + sizes=pathwaysSizes) +} + +.stopf <- function(fmt, ...) { + stop(sprintf(fmt, ...)) +} diff --git a/tests/testthat/test_gsea_analysis.R b/tests/testthat/test_gsea_analysis.R index 931414c..d7f2e23 100644 --- a/tests/testthat/test_gsea_analysis.R +++ b/tests/testthat/test_gsea_analysis.R @@ -126,6 +126,10 @@ test_that("fgseaSimple correctly checks gene names", { names(ranks)[41] <- NA expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1)) + ranks <- exampleRanks + names(ranks)[41] <- "" + expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1)) + ranks <- unname(exampleRanks) expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))