From 72215e3e0fa2ffb1f3d163f6844a3a68e36273b1 Mon Sep 17 00:00:00 2001 From: mahendra-mariadassou Date: Fri, 31 May 2019 00:13:54 +0200 Subject: [PATCH] Add fast_tax_glom and have ggformat and top_taxa depend on it. --- R/deprecated.R | 49 ++++++++++++++++++++++ R/graphical_methods.R | 96 +++++++++++++++++++++---------------------- man/fast_tax_glom.Rd | 35 ++++++++++++++++ 3 files changed, 131 insertions(+), 49 deletions(-) create mode 100644 man/fast_tax_glom.Rd diff --git a/R/deprecated.R b/R/deprecated.R index 734fcfe..6519094 100644 --- a/R/deprecated.R +++ b/R/deprecated.R @@ -53,3 +53,52 @@ rarecurve2 <- function (physeq, step = 1, sample, xlab = "Sample Size", ylab = " } invisible(out) } + + +## Plotting fonction once library sizes have been estimated +ggnorm <- function(physeq, cds, x = "X.SampleID", color = NULL, title = NULL) { + ## Args: + ## - cds: Count data set (class eSet of BioConductor) + ## - x: ggplot x aesthetics, used for plotting + ## - color: ggplot color aes, used for plotting + ## - title: optional, plot title + ## - physeq: phyloseq class object from which metadata are extracted + ## + ## Returns: + ## - ggplot2 figure with distribution of normalized log2(counts+1) on left, colored by + ## color and mean/variance function on right panel. + countdf <- counts(cds, normalize = TRUE) + countdf <- log2(countdf+1) + countdf[countdf == 0] <- NA + countdf <- melt(countdf, varnames = c("OTU", "X.SampleID")) + countdf <- countdf[!is.na(countdf$value), ] + countdf <- merge(countdf, as(sample_data(physeq), "data.frame")) + p <- ggplot(countdf, aes_string(x = x, y = "value", color = color)) + geom_boxplot() + p <- p + theme(axis.text.x = element_text(angle = 90)) + scale_x_discrete("Sample") + p <- p + scale_y_discrete("log2(Normalized counts + 1)") + if (!is.null(title)) { p <- p + ggtitle(title) } + ## Open graphical device + par(no.readonly=TRUE) + plot.new() + ## setup layout + gl <- grid.layout(nrow=1, ncol=2, widths=unit(c(1,1), 'null')) + ## grid.show.layout(gl) + ## setup viewports + vp.1 <- viewport(layout.pos.col=1) # boxplot + vp.2 <- viewport(layout.pos.col=2) # mean - sd plot + ## init layout + pushViewport(viewport(layout=gl)) + ## Access the first viewport + pushViewport(vp.1) + ## print our ggplot graphics here + print(p, newpage=FALSE) + ## done with the viewport + popViewport() + ## Move to the second viewport + pushViewport(vp.2) + ## start new base graphics in second viewport + par(new=TRUE, fig=gridFIG()) + meanSdPlot(log2(counts(cds, normalized = TRUE)[, ] + 1), ranks = FALSE, main = title) + ## done with the viewport + popViewport() +} diff --git a/R/graphical_methods.R b/R/graphical_methods.R index 5c9ad79..11d53df 100644 --- a/R/graphical_methods.R +++ b/R/graphical_methods.R @@ -216,7 +216,7 @@ top_taxa <- function(physeq, taxaRank = NULL, numberOfTaxa = 9) { transform_sample_counts(function(x) {x / sum(x)}) ## Manual tax glom if (taxaRank != "OTU") { - physeq <- tax_glom(physeq, taxrank = taxaRank) + physeq <- fast_tax_glom(physeq, taxrank = taxaRank) } ## Most abundant taxa top_taxa <- taxa_sums(physeq) %>% @@ -227,6 +227,51 @@ top_taxa <- function(physeq, taxaRank = NULL, numberOfTaxa = 9) { tax_table(physeq)[top_taxa, 1:match(taxaRank, rank_names(physeq))] %>% as('matrix') } +#' Fast alternative to \code{\link{tax_glom}} +#' +#' @param physeq Required. \code{\link{phyloseq-class}} object +#' @param taxrank A character string specifying the taxonomic level that you want to agglomerate over. Should be among the results of \code{rank_names(physeq)}. The default value is \code{rank_names(physeq)[1]}, which may agglomerate too broadly for a given experiment. You are strongly encouraged to try different values for this argument. +#' @param bad_empty (Optional). Character vector. Default: c("", " ", "\t"). Defines the bad/empty values that should be replaced by "Unknown". +#' +#' @return A taxonomically-agglomerated \code{\link{phyloseq-class}} object. +#' @export +#' +#' @details fast_tax_glom differs from \code{\link{tax_glom}} in three important ways: +#' * It is based on dplyr function and thus much faster +#' * It does not preserve the phy_tree and refseq slots +#' * It only preserves taxonomic ranks up to \code{taxrank} and discard all other ones. +#' +#' @seealso \code{\link{tax_glom}} +#' @importFrom tibble column_to_rownames +#' @examples +#' data(food) +#' fast_tax_glom(food, "Species") +fast_tax_glom <- function(physeq, taxrank = rank_names(physeq)[1], bad_empty = c("", " ", "\t")) { + rank_number <- match(taxrank, rank_names(physeq)) + if (is.na(rank_number)) { + stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") + } + ranks <- rank_names(physeq)[1:rank_number] + ## change NA and empty to Unknown before merging + tax <- as(tax_table(physeq), "matrix") + tax[is.na(tax)] <- "Unknown" + tax[tax %in% bad_empty] <- "Unknown" + ## create groups + tax <- as_tibble(tax) %>% select(one_of(ranks)) %>% group_by_all() %>% + mutate(group = group_indices()) + ## create new_taxonomy + new_tax <- tax %>% slice(1) %>% arrange(group) %>% + tibble::column_to_rownames(var = "group") %>% as.matrix() + ## create new count table + otutab <- otu_table(physeq) + if (!taxa_are_rows(physeq)) otutab <- t(otutab) + otutab <- rowsum(otutab, group = tax$group, reorder = TRUE) + ## return merged phyloseq + phyloseq(sample_data(physeq), + tax_table(new_tax), + otu_table(otutab, taxa_are_rows = TRUE)) +} + ## Find numberOfConditions most abundant conditions in variable top_conditions <- function(physeq, variable, numberOfConditions = 9) { stopifnot(!is.null(sample_data(physeq, FALSE))) @@ -430,53 +475,6 @@ gg_color_hue <- function(n) { scales::hue_pal()(n) } -## Plotting fonction once library sizes have been estimated -ggnorm <- function(physeq, cds, x = "X.SampleID", color = NULL, title = NULL) { - ## Args: - ## - cds: Count data set (class eSet of BioConductor) - ## - x: ggplot x aesthetics, used for plotting - ## - color: ggplot color aes, used for plotting - ## - title: optional, plot title - ## - physeq: phyloseq class object from which metadata are extracted - ## - ## Returns: - ## - ggplot2 figure with distribution of normalized log2(counts+1) on left, colored by - ## color and mean/variance function on right panel. - countdf <- counts(cds, normalize = TRUE) - countdf <- log2(countdf+1) - countdf[countdf == 0] <- NA - countdf <- melt(countdf, varnames = c("OTU", "X.SampleID")) - countdf <- countdf[!is.na(countdf$value), ] - countdf <- merge(countdf, as(sample_data(physeq), "data.frame")) - p <- ggplot(countdf, aes_string(x = x, y = "value", color = color)) + geom_boxplot() - p <- p + theme(axis.text.x = element_text(angle = 90)) + scale_x_discrete("Sample") - p <- p + scale_y_discrete("log2(Normalized counts + 1)") - if (!is.null(title)) { p <- p + ggtitle(title) } - ## Open graphical device - par(no.readonly=TRUE) - plot.new() - ## setup layout - gl <- grid.layout(nrow=1, ncol=2, widths=unit(c(1,1), 'null')) - ## grid.show.layout(gl) - ## setup viewports - vp.1 <- viewport(layout.pos.col=1) # boxplot - vp.2 <- viewport(layout.pos.col=2) # mean - sd plot - ## init layout - pushViewport(viewport(layout=gl)) - ## Access the first viewport - pushViewport(vp.1) - ## print our ggplot graphics here - print(p, newpage=FALSE) - ## done with the viewport - popViewport() - ## Move to the second viewport - pushViewport(vp.2) - ## start new base graphics in second viewport - par(new=TRUE, fig=gridFIG()) - meanSdPlot(log2(counts(cds, normalized = TRUE)[, ] + 1), ranks = FALSE, main = title) - ## done with the viewport - popViewport() -} #' Format phyloseq data for easy composition plots #' @@ -519,7 +517,7 @@ ggformat <- function(physeq, taxaRank1 = "Phylum", taxaSet1 = "Proteobacteria", tax[tax %in% c("", "unclassified", "Unclassified", "NA")] <- "Unknown" tax_table(physeq) <- tax if (taxaRank2 != "OTU_rank") { - physeq <- tax_glom(physeq, taxrank = taxaRank2) + physeq <- fast_tax_glom(physeq, taxrank = taxaRank2) } ## Keep only numberOfTaxa top taxa and aggregate the rest as "Other" diff --git a/man/fast_tax_glom.Rd b/man/fast_tax_glom.Rd new file mode 100644 index 0000000..1f0a076 --- /dev/null +++ b/man/fast_tax_glom.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/graphical_methods.R +\name{fast_tax_glom} +\alias{fast_tax_glom} +\title{Fast alternative to \code{\link{tax_glom}}} +\usage{ +fast_tax_glom(physeq, taxrank = rank_names(physeq)[1], + bad_empty = c("", " ", "\\t")) +} +\arguments{ +\item{physeq}{Required. \code{\link{phyloseq-class}} object} + +\item{taxrank}{A character string specifying the taxonomic level that you want to agglomerate over. Should be among the results of \code{rank_names(physeq)}. The default value is \code{rank_names(physeq)[1]}, which may agglomerate too broadly for a given experiment. You are strongly encouraged to try different values for this argument.} + +\item{bad_empty}{(Optional). Character vector. Default: c("", " ", "\t"). Defines the bad/empty values that should be replaced by "Unknown".} +} +\value{ +A taxonomically-agglomerated \code{\link{phyloseq-class}} object. +} +\description{ +Fast alternative to \code{\link{tax_glom}} +} +\details{ +fast_tax_glom differs from \code{\link{tax_glom}} in three important ways: +* It is based on dplyr function and thus much faster +* It does not preserve the phy_tree and refseq slots +* It only preserves taxonomic ranks up to \code{taxrank} and discard all other ones. +} +\examples{ +data(food) +fast_tax_glom(food, "Species") +} +\seealso{ +\code{\link{tax_glom}} +}