Merge pull request #320 from ncborcherding/dev

BorchLab · Feb 16, 2024 · e51695a · e51695a
2 parents 3f25230 + 1e0d8ca
commit e51695a
Show file tree

Hide file tree

Showing 27 changed files with 10,595 additions and 1,613 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: scRepertoire is a toolkit for processing and analyzing single-cell
 License: MIT + file LICENSE 
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 biocViews: Software, ImmunoOncology, SingleCell, Classification, Annotation, Sequencing
 Depends: 
 	ggplot2, 
@@ -62,4 +62,3 @@ LinkingTo:
     Rcpp
 URL: https://www.borch.dev/uploads/screpertoire/
 BugReports: https://github.com/ncborcherding/scRepertoire/issues
-
diff --git a/NAMESPACE b/NAMESPACE
@@ -32,6 +32,7 @@ export(percentGenes)
 export(percentKmer)
 export(percentVJ)
 export(positionalEntropy)
+export(positionalProperty)
 export(subsetClones)
 export(vizGenes)
 import(dplyr)
@@ -51,6 +52,7 @@ importFrom(dplyr,bind_rows)
 importFrom(dplyr,count)
 importFrom(dplyr,group_by)
 importFrom(dplyr,mutate)
+importFrom(dplyr,mutate_at)
 importFrom(dplyr,sample_n)
 importFrom(dplyr,select)
 importFrom(dplyr,summarise)
@@ -98,6 +100,7 @@ importFrom(stats,mad)
 importFrom(stats,na.omit)
 importFrom(stats,optim)
 importFrom(stats,pgamma)
+importFrom(stats,qt)
 importFrom(stats,quantile)
 importFrom(stats,sd)
 importFrom(stats,setNames)

diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,8 @@
 * Added ```percentVJ()```
 * Added ```percentKmer()```
 * Added ```exportClones()``` 
+* Added ```positionalEntropy()``` 
+* Added ```positionalProperty()``` 
 * Changed compareClonotypes to ```clonalCompare()```
 * Changed clonotypeSizeDistribution to ```clonalSizeDistribution()```
 * Changed scatterClonotypes to ```clonalScatter()```
@@ -38,14 +40,14 @@
 * ```clonalDiversity()``` no longer automatically orders samples.
 * Remove **order** parameter from ```clonalQuant()```, ```clonalLength()```, and ```clonalAbundance()```
 * **x.axis** parameter in ```clonalDiversity()``` separated from **group.by** parameter
+* Filtering chains will not eliminate non-matching chains.
 
 ## DEPRECATED AND DEFUNCT
 
 * Deprecate stripBarcodes()
 * Deprecate expression2List() (now only an internal function).
 * Deprecate checkContigs()
 
-
 # scRepertoire VERSION 1.11.0
 
 * Rebasing for the purposes of bioconductor version

diff --git a/R/combineExpression.R b/R/combineExpression.R
@@ -69,11 +69,14 @@ combineExpression <- function(input.data,
         stop("Adjust the cloneSize parameter - there are groupings < 1")
     }
     cloneSize <- c(None = 0, cloneSize)
+
+    cloneCall <- .theCall(input.data, cloneCall)
     if (chain != "both") {
-      input.data[[i]] <- .off.the.chain(input.data[[i]], chain, cloneCall)
+      for(i in seq_along(input.data)) {
+        input.data[[i]] <- .off.the.chain(input.data[[i]], chain, cloneCall)
+      }
     }
     input.data <- .checkList(input.data)
-    cloneCall <- .theCall(input.data, cloneCall)
 
     #Getting Summaries of clones from combineTCR() or combineBCR()
     Con.df <- NULL

diff --git a/R/exportClones.R b/R/exportClones.R
@@ -60,11 +60,11 @@ exportClones <- function(input.data,
 
 .TCRmatchExport<- function(input.data) {
 
-  input.data <- .data.wrangle(input.data, group.by, "CTgene", "TRB")
+  input.data <- .data.wrangle(input.data, NULL, "CTgene", "TRB")
 
   for(i in seq_along(input.data)) {
-    input.data[[i]] <- .off.the.chain(input.data[[i]], "TRB", "CTaa")
-    input.data[[i]] <- .off.the.chain(input.data[[i]], "TRB", "CTnt")
+    input.data[[i]] <- .off.the.chain(input.data[[i]], "TRB", "CTaa", check = FALSE)
+    input.data[[i]] <- .off.the.chain(input.data[[i]], "TRB", "CTnt", check = FALSE)
   }
 
   input.data <- bind_rows(input.data, .id = "group")

diff --git a/R/global.R b/R/global.R
@@ -43,5 +43,10 @@
     utils::globalVariables ("group")
     utils::globalVariables ("chain2_aa")
     utils::globalVariables ("dotSize")
+    utils::globalVariables ("ci_lower")
+    utils::globalVariables ("ci_upper") 
+    utils::globalVariables ("mat_melt") 
+    utils::globalVariables ("position") 
+    utils::globalVariables ("se")
     invisible ()
 }
diff --git a/R/percentAA.R b/R/percentAA.R
@@ -21,6 +21,7 @@
 #' @param palette Colors to use in visualization - input any \link[grDevices]{hcl.pals}.
 #' @import ggplot2
 #' @importFrom reshape2 melt
+#' @importFrom dplyr mutate_at %>%
 #' @export
 #' @concept Summarize_Repertoire
 #' @return ggplot of stacked bar graphs of amino acid proportions
@@ -37,29 +38,17 @@ percentAA <- function(input.data,
     input.data <- .groupList(input.data, group.by)
   }
 
-  res.list <- list()
-  for (i in seq_along(input.data)) {
-    strings <- input.data[[i]][,"CTaa"]
-    strings <- do.call(c,str_split(strings, ";"))
-    strings <- strings[strings != "NA"]
-    strings <- strings[nchar(strings) < aa.length]
-    strings <- na.omit(strings)
-    strings <- .padded_strings(strings, aa.length)
-    strings <- do.call(rbind, strings)
-
-    #Summarizing the % of each position
-    aa.output <- apply(strings, 2, function(x) {
-      summary <- as.data.frame(prop.table(table(x, useNA = "always")))
-    })
-
-    #Forming a matrix of % across each position and formatting
-    res <- suppressWarnings(Reduce(function(...) merge(..., all = TRUE, by="x"), aa.output))
-    colnames(res) <- c("AA", paste0("pos.", seq_len(aa.length)))
-    res[seq_len(20),][is.na(res[seq_len(20),])] <- 0
-    melt.res <- suppressMessages(melt(res))
-    melt.res$group <- names(input.data)[i]
-    res.list[[i]] <- melt.res
-  }
+  #Getting AA Counts
+  aa.count.list <- .aa.counter(input.data, "CTaa", aa.length)
+
+  #Calculating proportion and melting data
+  lapply(seq_along(aa.count.list), function(x) {
+    aa.count.list[[x]] <- aa.count.list[[x]] %>% mutate_if(is.numeric, list(~ ./sum(.)))
+    melt.res <- suppressMessages(melt(aa.count.list[[x]]))
+    melt.res$group <- names(input.data)[x]
+    melt.res
+  }) -> res.list
+
   mat_melt <- do.call(rbind, res.list)
   plot <- ggplot(mat_melt, aes(x=as.factor(variable), y = value, fill=AA)) +
     geom_bar(stat = "identity", position="fill", lwd= 0.25, color = "black") +
@@ -78,15 +67,3 @@ percentAA <- function(input.data,
   return(plot)
 }    
 
-.padded_strings <- function(strings, max_length) {
-
-      x <- lapply(strings, function(str) {
-        str_len <- nchar(str)
-        str <- strsplit(str, split = "")[[1]]
-        if (str_len < max_length) {
-          c(str, rep(NA, max_length - str_len))
-        } else {
-          str
-        }
-    })
-  }
diff --git a/R/positionalEntropy.R b/R/positionalEntropy.R
@@ -23,8 +23,6 @@
 #' @param aa.length The maximum length of the CDR3 amino acid sequence. 
 #' @param method The method to calculate the entropy/diversity - 
 #' "shannon", "inv.simpson", "norm.entropy".
-#' @param n.boots number of bootstraps to down sample in order to 
-#' get mean diversity.
 #' @param exportTable Returns the data frame used for forming the graph.
 #' @param palette Colors to use in visualization - input any \link[grDevices]{hcl.pals}.
 #' @import ggplot2
@@ -36,8 +34,7 @@ positionalEntropy <- function(input.data,
                               chain = "TRB", 
                               group.by = NULL, 
                               aa.length = 20,
-                              method = "shannon",
-                              n.boots = 20,
+                              method = "norm.entropy",
                               exportTable = FALSE, 
                               palette = "inferno")  {
 
@@ -55,52 +52,33 @@ positionalEntropy <- function(input.data,
     input.data <- .groupList(input.data, group.by)
   }
 
-  #Selecting Diversit Function
+  #Selecting Diversity Function
   diversityFunc <- switch(method,
-                          "norm.entropy" = .shannon,
+                          "norm.entropy" = .normentropy,
                           "inv.simpson" = .invsimpson,
-                          "shannon" = .normentropy,
+                          "shannon" = .shannon,
                           stop("Invalid method provided"))
 
-  min <- .short.check(input.data, cloneCall)
+  aa.count.list <- .aa.counter(input.data, "CTaa", aa.length)
 
-  lapply(input.data, function(x) {
-      lapply(seq_len(n.boots), function(y) {
-       strings <- x[,cloneCall]
-       strings <- do.call(c,str_split(strings, ";"))
-       strings <- strings[strings != "NA"]
-       strings <- na.omit(strings)
-       strings <- strings[nchar(strings) < aa.length]
-       strings <- strings[sample(seq_len(length(strings)), min)]
-       strings <- .padded_strings(strings, aa.length)
-       strings <- do.call(rbind, strings)
-       aa.output <- apply(strings, 2, function(z) {
-         summary <- as.data.frame(table(z, useNA = "always"))
-       })
-       res <- suppressWarnings(Reduce(function(...) merge(..., all = TRUE, by="z"), aa.output))
-       colnames(res) <- c("AA", paste0("pos.", seq_len(aa.length)))
-       res[seq_len(20),][is.na(res[seq_len(20),])] <- 0
-       diversity <- sapply(res[,2:ncol(res)], diversityFunc)
-       diversity[is.nan(diversity)] <- 0
-       diversity
-    }) -> diversity.calculations
-    diversity.calculations <- do.call(rbind, diversity.calculations)
-    diversity.means <- colMeans(diversity.calculations)
-    diversity.means
-    }) -> positional.diversity
-
-    mat <- do.call(rbind, positional.diversity)
-    mat_melt <- suppressMessages(melt(mat))
+  lapply(aa.count.list, function(x){
+      diversity <- sapply(x[,2:ncol(x)], diversityFunc)
+      diversity[is.nan(diversity)] <- 0
+      diversity
+  }) -> group.results
+
+  mat <- do.call(rbind, group.results)
+  mat_melt <- suppressMessages(melt(mat))
 
-    plot <- ggplot(mat_melt, aes(x=Var2, y = value, group= Var1, color = Var1)) +
-      geom_line(stat = "identity") +
-      geom_point() + 
-      scale_color_manual(name = "Groups", 
-                        values = rev(.colorizer(palette,nrow(mat)))) +
-      xlab("Amino Acid Residues") +
-      ylab("Relative Diversity") +
-      theme_classic() + 
-      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+  plot <- ggplot(mat_melt, aes(x=Var2, y = value, group= Var1, color = Var1)) +
+          geom_line(stat = "identity") +
+          geom_point() + 
+          scale_color_manual(name = "Groups", 
+                            values = rev(.colorizer(palette,nrow(mat)))) +
+          xlab("Amino Acid Residues") +
+          ylab("Relative Diversity") +
+          theme_classic() + 
+          theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
     if (exportTable == TRUE) { 
       return(mat_melt) 
     }