From 72215e3e0fa2ffb1f3d163f6844a3a68e36273b1 Mon Sep 17 00:00:00 2001
From: mahendra-mariadassou <mahendra.mariadassou@gmail.com>
Date: Fri, 31 May 2019 00:13:54 +0200
Subject: [PATCH] Add fast_tax_glom and have ggformat and top_taxa depend on
 it.

---
 R/deprecated.R        | 49 ++++++++++++++++++++++
 R/graphical_methods.R | 96 +++++++++++++++++++++----------------------
 man/fast_tax_glom.Rd  | 35 ++++++++++++++++
 3 files changed, 131 insertions(+), 49 deletions(-)
 create mode 100644 man/fast_tax_glom.Rd

diff --git a/R/deprecated.R b/R/deprecated.R
index 734fcfe..6519094 100644
--- a/R/deprecated.R
+++ b/R/deprecated.R
@@ -53,3 +53,52 @@ rarecurve2 <- function (physeq, step = 1, sample, xlab = "Sample Size", ylab = "
   }
   invisible(out)
 }
+
+
+## Plotting fonction once library sizes have been estimated
+ggnorm <- function(physeq, cds, x = "X.SampleID", color = NULL, title = NULL) {
+  ## Args:
+  ## - cds: Count data set (class eSet of BioConductor)
+  ## - x:  ggplot x aesthetics, used for plotting
+  ## - color: ggplot color aes, used for plotting
+  ## - title: optional, plot title
+  ## - physeq: phyloseq class object from which metadata are extracted
+  ##
+  ## Returns:
+  ## - ggplot2 figure with distribution of normalized log2(counts+1) on left, colored by
+  ##   color and mean/variance function on right panel.
+  countdf <- counts(cds, normalize = TRUE)
+  countdf <- log2(countdf+1)
+  countdf[countdf == 0] <- NA
+  countdf <- melt(countdf, varnames = c("OTU", "X.SampleID"))
+  countdf <- countdf[!is.na(countdf$value), ]
+  countdf <- merge(countdf, as(sample_data(physeq), "data.frame"))
+  p <- ggplot(countdf, aes_string(x = x, y = "value", color = color)) + geom_boxplot()
+  p <- p + theme(axis.text.x = element_text(angle = 90)) + scale_x_discrete("Sample")
+  p <- p + scale_y_discrete("log2(Normalized counts + 1)")
+  if (!is.null(title)) {  p <- p + ggtitle(title) }
+  ## Open graphical device
+  par(no.readonly=TRUE)
+  plot.new()
+  ## setup layout
+  gl <- grid.layout(nrow=1, ncol=2, widths=unit(c(1,1), 'null'))
+  ## grid.show.layout(gl)
+  ## setup viewports
+  vp.1 <- viewport(layout.pos.col=1) # boxplot
+  vp.2 <- viewport(layout.pos.col=2) # mean - sd plot
+  ## init layout
+  pushViewport(viewport(layout=gl))
+  ## Access the first viewport
+  pushViewport(vp.1)
+  ## print our ggplot graphics here
+  print(p, newpage=FALSE)
+  ## done with the viewport
+  popViewport()
+  ## Move to the second viewport
+  pushViewport(vp.2)
+  ##  start new base graphics in second viewport
+  par(new=TRUE, fig=gridFIG())
+  meanSdPlot(log2(counts(cds, normalized = TRUE)[, ] + 1), ranks = FALSE, main = title)
+  ##  done with the viewport
+  popViewport()
+}
diff --git a/R/graphical_methods.R b/R/graphical_methods.R
index 5c9ad79..11d53df 100644
--- a/R/graphical_methods.R
+++ b/R/graphical_methods.R
@@ -216,7 +216,7 @@ top_taxa <- function(physeq, taxaRank = NULL, numberOfTaxa = 9) {
     transform_sample_counts(function(x) {x / sum(x)})
   ## Manual tax glom
   if (taxaRank != "OTU") {
-    physeq <- tax_glom(physeq, taxrank = taxaRank)
+    physeq <- fast_tax_glom(physeq, taxrank = taxaRank)
   }
   ## Most abundant taxa
   top_taxa <- taxa_sums(physeq) %>%
@@ -227,6 +227,51 @@ top_taxa <- function(physeq, taxaRank = NULL, numberOfTaxa = 9) {
   tax_table(physeq)[top_taxa, 1:match(taxaRank, rank_names(physeq))] %>% as('matrix')
 }
 
+#' Fast alternative to \code{\link{tax_glom}}
+#'
+#' @param physeq Required. \code{\link{phyloseq-class}} object
+#' @param taxrank A character string specifying the taxonomic level that you want to agglomerate over. Should be among the results of \code{rank_names(physeq)}. The default value is \code{rank_names(physeq)[1]}, which may agglomerate too broadly for a given experiment. You are strongly encouraged to try different values for this argument.
+#' @param bad_empty (Optional). Character vector. Default: c("", " ", "\t"). Defines the bad/empty values that should be replaced by "Unknown".
+#'
+#' @return A taxonomically-agglomerated \code{\link{phyloseq-class}} object.
+#' @export
+#'
+#' @details fast_tax_glom differs from \code{\link{tax_glom}} in three important ways:
+#' * It is based on dplyr function and thus much faster
+#' * It does not preserve the phy_tree and refseq slots
+#' * It only preserves taxonomic ranks up to \code{taxrank} and discard all other ones.
+#'
+#' @seealso \code{\link{tax_glom}}
+#' @importFrom tibble column_to_rownames
+#' @examples
+#' data(food)
+#' fast_tax_glom(food, "Species")
+fast_tax_glom <- function(physeq, taxrank = rank_names(physeq)[1], bad_empty = c("", " ", "\t")) {
+  rank_number <- match(taxrank, rank_names(physeq))
+  if (is.na(rank_number)) {
+    stop("Bad taxrank argument. Must be among the values of rank_names(physeq)")
+  }
+  ranks <- rank_names(physeq)[1:rank_number]
+  ## change NA and empty to Unknown before merging
+  tax <- as(tax_table(physeq), "matrix")
+  tax[is.na(tax)] <- "Unknown"
+  tax[tax %in% bad_empty] <- "Unknown"
+  ## create groups
+  tax <- as_tibble(tax) %>% select(one_of(ranks)) %>% group_by_all() %>%
+    mutate(group = group_indices())
+  ## create new_taxonomy
+  new_tax <- tax %>% slice(1) %>% arrange(group) %>%
+    tibble::column_to_rownames(var = "group") %>% as.matrix()
+  ## create new count table
+  otutab <- otu_table(physeq)
+  if (!taxa_are_rows(physeq)) otutab  <- t(otutab)
+  otutab <- rowsum(otutab, group = tax$group, reorder = TRUE)
+  ## return merged phyloseq
+  phyloseq(sample_data(physeq),
+           tax_table(new_tax),
+           otu_table(otutab, taxa_are_rows = TRUE))
+}
+
 ## Find numberOfConditions most abundant conditions in variable
 top_conditions <- function(physeq, variable, numberOfConditions = 9) {
   stopifnot(!is.null(sample_data(physeq, FALSE)))
@@ -430,53 +475,6 @@ gg_color_hue <- function(n) {
   scales::hue_pal()(n)
 }
 
-## Plotting fonction once library sizes have been estimated
-ggnorm <- function(physeq, cds, x = "X.SampleID", color = NULL, title = NULL) {
-  ## Args:
-  ## - cds: Count data set (class eSet of BioConductor)
-  ## - x:  ggplot x aesthetics, used for plotting
-  ## - color: ggplot color aes, used for plotting
-  ## - title: optional, plot title
-  ## - physeq: phyloseq class object from which metadata are extracted
-  ##
-  ## Returns:
-  ## - ggplot2 figure with distribution of normalized log2(counts+1) on left, colored by
-  ##   color and mean/variance function on right panel.
-  countdf <- counts(cds, normalize = TRUE)
-  countdf <- log2(countdf+1)
-  countdf[countdf == 0] <- NA
-  countdf <- melt(countdf, varnames = c("OTU", "X.SampleID"))
-  countdf <- countdf[!is.na(countdf$value), ]
-  countdf <- merge(countdf, as(sample_data(physeq), "data.frame"))
-  p <- ggplot(countdf, aes_string(x = x, y = "value", color = color)) + geom_boxplot()
-  p <- p + theme(axis.text.x = element_text(angle = 90)) + scale_x_discrete("Sample")
-  p <- p + scale_y_discrete("log2(Normalized counts + 1)")
-  if (!is.null(title)) {  p <- p + ggtitle(title) }
-  ## Open graphical device
-  par(no.readonly=TRUE)
-  plot.new()
-  ## setup layout
-  gl <- grid.layout(nrow=1, ncol=2, widths=unit(c(1,1), 'null'))
-  ## grid.show.layout(gl)
-  ## setup viewports
-  vp.1 <- viewport(layout.pos.col=1) # boxplot
-  vp.2 <- viewport(layout.pos.col=2) # mean - sd plot
-  ## init layout
-  pushViewport(viewport(layout=gl))
-  ## Access the first viewport
-  pushViewport(vp.1)
-  ## print our ggplot graphics here
-  print(p, newpage=FALSE)
-  ## done with the viewport
-  popViewport()
-  ## Move to the second viewport
-  pushViewport(vp.2)
-  ##  start new base graphics in second viewport
-  par(new=TRUE, fig=gridFIG())
-  meanSdPlot(log2(counts(cds, normalized = TRUE)[, ] + 1), ranks = FALSE, main = title)
-  ##  done with the viewport
-  popViewport()
-}
 
 #' Format phyloseq data for easy composition plots
 #'
@@ -519,7 +517,7 @@ ggformat <- function(physeq, taxaRank1 = "Phylum", taxaSet1 = "Proteobacteria",
     tax[tax %in% c("", "unclassified", "Unclassified", "NA")] <- "Unknown"
     tax_table(physeq) <- tax
     if (taxaRank2 != "OTU_rank") {
-      physeq <- tax_glom(physeq, taxrank = taxaRank2)
+      physeq <- fast_tax_glom(physeq, taxrank = taxaRank2)
     }
 
     ## Keep only numberOfTaxa top taxa and aggregate the rest as "Other"
diff --git a/man/fast_tax_glom.Rd b/man/fast_tax_glom.Rd
new file mode 100644
index 0000000..1f0a076
--- /dev/null
+++ b/man/fast_tax_glom.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/graphical_methods.R
+\name{fast_tax_glom}
+\alias{fast_tax_glom}
+\title{Fast alternative to \code{\link{tax_glom}}}
+\usage{
+fast_tax_glom(physeq, taxrank = rank_names(physeq)[1],
+  bad_empty = c("", " ", "\\t"))
+}
+\arguments{
+\item{physeq}{Required. \code{\link{phyloseq-class}} object}
+
+\item{taxrank}{A character string specifying the taxonomic level that you want to agglomerate over. Should be among the results of \code{rank_names(physeq)}. The default value is \code{rank_names(physeq)[1]}, which may agglomerate too broadly for a given experiment. You are strongly encouraged to try different values for this argument.}
+
+\item{bad_empty}{(Optional). Character vector. Default: c("", " ", "\t"). Defines the bad/empty values that should be replaced by "Unknown".}
+}
+\value{
+A taxonomically-agglomerated \code{\link{phyloseq-class}} object.
+}
+\description{
+Fast alternative to \code{\link{tax_glom}}
+}
+\details{
+fast_tax_glom differs from \code{\link{tax_glom}} in three important ways:
+* It is based on dplyr function and thus much faster
+* It does not preserve the phy_tree and refseq slots
+* It only preserves taxonomic ranks up to \code{taxrank} and discard all other ones.
+}
+\examples{
+data(food)
+fast_tax_glom(food, "Species")
+}
+\seealso{
+\code{\link{tax_glom}}
+}