Merge pull request #15 from julia-wrobel/alex-dev

Update for Manuscript Reviewers
julia-wrobel · Oct 4, 2024 · 260764c · 260764c
2 parents 34a9416 + a34d9e8
commit 260764c
Show file tree

Hide file tree

Showing 367 changed files with 3,497 additions and 439 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,8 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^docs$
+^data-raw$
+^pkgdown$
+^LICENSE\.md$
+^_pkgdown\.yml$
+^\.github$
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,4 @@
 inst/doc
-.Rbuildignore
 *DS_Store
 *.Rproj.user/
 .Rproj.user/shared/notebooks/paths

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mxfda
 Title: A Functional Data Analysis Package for Spatial Single Cell Data
-Version: 0.2.1-1
+Version: 0.2.2
 Date: 2024-05-06
 Authors@R: 
     c(
@@ -21,7 +21,7 @@ URL: https://github.com/julia-wrobel/mxfda/, http://juliawrobel.com/mxfda/
 BugReports: https://github.com/julia-wrobel/mxfda/issues/
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Suggests: 
     knitr,
     rmarkdown,
@@ -32,7 +32,9 @@ Suggests:
     spatialTIME,
     tibble,
     broom,
-    refund.shiny
+    refund.shiny,
+    Seurat,
+    SeuratObject
 Imports: 
     magrittr,
     rlang,

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,8 +14,10 @@ export(Lcross)
 export(Lest)
 export(add_summary_function)
 export(bivariate)
+export(can_permute)
 export(entropy)
 export(extract_c)
+export(extract_entropy)
 export(extract_fpca_object)
 export(extract_fpca_scores)
 export(extract_model)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,13 @@
 # mxfda (development version)
 
+# mxfda v0.2.2
+
+-   added vignette for working with spatial transcriptomic data as provided in a seurat object from a clear cell renal cell carcinoma CosMx SMI study. Data can be downloaded [here](https://doi.org/10.5281/zenodo.12730227).
+-   from original v0.1.0 submitted to [CRAN](https://cran.r-project.org/web/packages/mxfda/index.html).
+-   implemented Vu et al bivariate entropy measure.
+-   ther bug fixes. See [Commit History](https://github.com/julia-wrobel/mxfda/commits/main/) for complete list of commits associated with the main branch of mxFDA.
+-   added ability to calculate an empirical estimate for complete spatial randomness (CSR) when extracting summary functions
+
 # mxfda v0.1.0
 
-* starting package out
+-   starting package out
diff --git a/R/data.R b/R/data.R
@@ -39,6 +39,7 @@
 #'   \item{sample_key}{column in both `Metadata` and `Spatial` that is a 1:1 with the samples (unique per sample)}
 #'   \item{univariate_summaries}{univariate summary slot with nearest neighbor G calculared}
 #'   \item{bivariate_summaries}{empty slot available for bivariate summaries}
+#'   \item{multiivariate_summaries}{empty slot available for multivariate summaries}
 #'   \item{functional_pca}{empty slot for functional PCA data of summaries}
 #'   \item{functional_cox}{empty slot for functional models}
 #' }
@@ -66,6 +67,7 @@
 #'   \item{sample_key}{column in both `Metadata` and `Spatial` that is a 1:1 with the samples (unique per sample)}
 #'   \item{univariate_summaries}{univariate summary slot with nearest neighbor G calculared}
 #'   \item{bivariate_summaries}{empty slot available for bivariate summaries}
+#'   \item{multiivariate_summaries}{empty slot available for multivariate summaries}
 #'   \item{functional_pca}{empty slot for functional PCA data of summaries}
 #'   \item{functional_cox}{empty slot for functional models}
 #' }

diff --git a/R/extract_bivariate.R b/R/extract_bivariate.R
@@ -11,7 +11,8 @@
 #' @param r_vec Numeric vector of radii over which to evaluate spatial summary functions. Must begin at 0.
 #' @param func Spatial summary function to calculate. Options are c(Kcross, Lcross, Gcross) which denote Ripley's K, Besag's L, and nearest neighbor G function, respectively, or entropy from Vu et al, 2023.
 #' @param edge_correction Character string that denotes the edge correction method for spatial summary function. For Kcross and Lcross choose one of c("border", "isotropic", "Ripley", "translate", "none"). For Gcross choose one of c("rs", "km", "han")
-#' @param breaks an integer for the number of breaks used for entropy
+#' @param empirical_CSR logical to indicate whether to use the permutations to identify the sample-specific complete spatial randomness (CSR) estimation.
+#' @param permutations integer for the number of permtuations to use if empirical_CSR is `TRUE` and exact CSR not calculable
 #'
 #' @details `r lifecycle::badge('stable')`
 #'
@@ -22,61 +23,87 @@
 #' \item{fundiff}{sumfun - csr, positive values indicate clustering and negative values repulsion}
 #'
 #' @author Julia Wrobel \email{`r juliawrobel_email`}
+#' @author Alex Soupir \email{`r alexsoupir_email`}
 #'
-#' @references Xiao, L., Ruppert, D., Zipunnikov, V., and Crainiceanu, C. (2016).
+#' @references
+#'
+#' Xiao, L., Ruppert, D., Zipunnikov, V., and Crainiceanu, C. (2016).
 #' Fast covariance estimation for high-dimensional functional data.
 #' \emph{Statistics and Computing}, 26, 409-421.
 #' DOI: 10.1007/s11222-014-9485-x.
-#' @references Vu, T., Seal, S., Ghosh, T., Ahmadian, M., Wrobel, J., & Ghosh, D. (2023).
+#'
+#' Vu, T., Seal, S., Ghosh, T., Ahmadian, M., Wrobel, J., & Ghosh, D. (2023).
 #' FunSpace: A functional and spatial analytic approach to cell imaging data using entropy measures.
 #' \emph{PLOS Computational Biology}, 19(9), e1011490.
 #'
+#' Creed, J. H., Wilson, C. M., Soupir, A. C., Colin-Leitzinger, C. M., Kimmel, G. J.,
+#' Ospina, O. E., Chakiryan, N. H., Markowitz, J., Peres, L. C., Coghill, A., & Fridley, B. L. (2021).
+#' spatialTIME and iTIME: R package and Shiny application for visualization and analysis of
+#' immunofluorescence data. \emph{Bioinformatics} (Oxford, England), 37(23), 4584–4586.
+#' https://doi.org/10.1093/bioinformatics/btab757
+#'
 #' @importFrom spatstat.geom ppp convexhull.xy
 #' @import dplyr
 #'
 #' @export
 bivariate = function(mximg,
-                             markvar,
-                             mark1,
-                             mark2,
-                             r_vec,
-                             func = c(Kcross, Lcross, Gcross, entropy),
-                             edge_correction,
-                             breaks = NULL){
+                     markvar,
+                     mark1,
+                     mark2,
+                     r_vec,
+                     func = c(Kcross, Lcross, Gcross, entropy),
+                     edge_correction,
+                     empirical_CSR = FALSE,
+                     permutations = 1000){
   #### note to switch edge correction based on choice of func, this should be automated
 
   # subset data to only cells if interest
-  #mximg = filter(mximg, get(markvar) %in% c(mark1, mark2))
-  # mximg = filter(mximg, markvar == mark1)
-  n = mximg %>% count(get(markvar)) %>% pull(n)
+  #some of the samples will have other cell types, as well as missing cell type of interest
+  n = mximg %>% count(get(markvar)) %>% pull(n, name = 1)
   if(any(n < 2)) return(NA)
-  if(length(n) < 2) return(NA)
+  if(sum(names(n) %in% c(mark1, mark2)) < 2) return(NA) #make sure the marks wanted are in n vector
 
   # create a convex hull as the observation window
   w = convexhull.xy(mximg[["x"]], mximg[["y"]])
 
   # create ppp object
   pp_obj = ppp(mximg[["x"]], mximg[["y"]], window = w,
                marks = as.factor(mximg[[markvar]]), checkdup = FALSE)
-  if(!is.null(breaks)){
-    r_vec = exp(seq(log(0.05*max(r_vec)), log(max(r_vec)), length.out = breaks))
-  }
+
   # estimate L using spatstat
   sumfun = func(pp_obj,
                 mark1,
                 mark2,
                 r = r_vec,
                 correction = edge_correction)
-  if(is.null(breaks)){
-    df = data.frame(sumfun) %>%
-      select(r, sumfun = all_of(edge_correction), csr = theo) %>%
-      mutate(fundiff = sumfun - csr) %>%
-      select(r, sumfun, csr, fundiff)
-    return(df)
-  } else {
-    return(data.frame(sumfun))
+
+  #if the user wants permutations to estimate CSR then will have to run them
+  if(empirical_CSR == TRUE){
+    if(identical(Kcross, func)){
+      #message("Using Exact Complete Spatial Randomness for Ripley's K")
+      sumfun[[2]] = func(pp_obj, #exact CSR is across all points
+                         r = r_vec,
+                         correction = edge_correction)[[3]]
+    } else {
+      sumfun[[2]] = sapply(1:permutations, function(x){
+        pp_obj_tmp = pp_obj
+        pp_obj_tmp$marks = sample(pp_obj_tmp$marks)
+        func(pp_obj_tmp, #sample ppp to length of the correct marks
+             mark1,
+             mark2,
+             r = r_vec,
+             correction = edge_correction)[[3]]
+      }) %>%
+        rowMeans(na.rm = TRUE) #take average of all permutations
+    }
+
   }
 
+  df = data.frame(sumfun) %>%
+    select(r, sumfun = all_of(edge_correction), csr = theo) %>%
+    mutate(fundiff = sumfun - csr) %>%
+    select(r, sumfun, csr, fundiff)
+
 
 
 }

diff --git a/R/extract_entropy.R b/R/extract_entropy.R
@@ -1,10 +1,69 @@
+#' extract_entropy
+#'
+#' The extract_entropy() is used to compute spatial entropy at each distance
+#' interval for all cell types of interest. The goal is to capture the diversity
+#' in cellular composition, such as similar proportions across cell types or
+#' dominance of a single type, at a specific distance range. Additionally,
+#' spatial patterns, including clustered, independent, or regular, among cell
+#' types can also be acquired. In this example, we will look at the spatial
+#' heterogeneity across T cells, macrophages, and others. To focus on the local
+#' cell-to-cell interactions, we set the default maximum of the distance range
+#' (i.e., rmax) to be 400 microns. The default number of distance
+#' breaks/intervals is set to 50. Then, a sequence of distance breaks is
+#' generated by linearly decreasing from rmax to 0 on a log scale. At each
+#' distance range, partial spatial entropy and residual entropy are
+#' calculated as in Vu et al. (2023), Altieri et al. (2018). These spatial
+#' entropy functions can then be used as input functions for FPCA.
+#'
+#' @param mxFDAobject object of class `mxFDA`
+#' @param markvar The name of the variable that denotes cell type(s) of interest. Character.
+#' @param marks Character vector that denotes cell types of interest.
+#' @param n_break Total number of distance ranges/intervals of interest made from 0 to `rmax` for calculating entropy
+#' @param rmax Max distance between pairs of cells
+#'
+#' @return object of class `mxFDA` with a dataframe in the `multivariate_summaries` slot
+#' @export
+#'
+extract_entropy = function(mxFDAobject,
+                           markvar,
+                           marks,
+                           n_break = 50,
+                           rmax = 400){
+  #make sure input object is of right class
+  if(!inherits(mxFDAobject, "mxFDA"))
+    stop("Object must be of class `mxFDA`.")
+  #need spatial data to calculate spatial summary functions
+  if(nrow(mxFDAobject@Spatial) == 0)
+    stop("No summary function to be calculated - missing spatial")
+  #check to make sure that there are more than 1 level in the markvar column
+  if(!all(marks %in% unique(mxFDAobject@Spatial[[markvar]])))
+    stop("One or marks not in markvar column of Spatial data")
+  #check to make sure that there are more than 1 level in the markvar column
+  if(length(unique(mxFDAobject@Spatial[[markvar]]))<2)
+    stop("Calculating entropy requires at least 2 cell types")
+  #get only those columns and marks needed
+  df_nest = mxFDAobject@Spatial %>%
+    dplyr::filter(!!dplyr::sym(markvar) %in% marks) %>%
+    dplyr::select(dplyr::all_of(mxFDAobject@sample_key), x, y, dplyr::all_of(markvar)) %>%
+    tidyr::nest(data = c(x, y, dplyr::all_of(markvar)))
+  r_vec = exp(seq(log(0.05*max(rmax)), log(max(rmax)), length.out = n_break))
+
+  ndat = df_nest %>% dplyr::mutate(sumfuns = purrr::map(df_nest$data, entropy,
+                                                        r_vec = r_vec,
+                                                        markvar = markvar,
+                                                        .progress = TRUE)) %>%
+    select(-data) %>%
+    unnest(sumfuns)
+
+  mxFDAobject@multivariate_summaries$entropy = ndat
+  return(mxFDAobject)
+}
+
 #' Entropy
 #'
-#' @param X object of class `ppp` from spatstat with 2 marks
-#' @param i ignored
-#' @param j ignored
+#' @param df data frame with x and y columns, along with a column for point marks
 #' @param r_vec vector of length wanted for breaks (will be rescaled) with max value at max for measuring entropy
-#' @param correction ignored
+#' @param markvar The name of the variable that denotes cell type(s) of interest. Character.
 #'
 #' @details `r lifecycle::badge('experimental')`
 #'
@@ -21,30 +80,55 @@
 #' \emph{Environmental and ecological statistics}, 25, 95-110.
 #'
 #' @export
-entropy = function(X,
-                   i,
-                   j,
+entropy = function(df,
                    r_vec,
-                   correction
-                   #n_break = 50,
-                   #rmax = 400
-                             ){
+                   markvar){
+  #if not enough points return NAs
+  if(nrow(df) < 3)
+    return(data.frame(r = r_vec, spatial_entropy = NA, residual_entropy = NA))
+  win = spatstat.geom::convexhull.xy(df$x, df$y)
+  X = spatstat.geom::ppp(df$x, df$y, window = win, marks = df[[markvar]])
   breaks = length(r_vec)
+  #nnumber of cells for marks
+  cells = df %>%
+    dplyr::group_by(!!dplyr::sym(markvar)) %>%
+    dplyr::summarise(counts = dplyr::n()) %>%
+    dplyr::mutate(!!markvar := paste0(get(markvar), " cells")) %>%
+    tidyr::spread(key = markvar, value = 'counts')
+
   # Compute spatial entropy
-  spa_entropy = SimDesign::quiet(
-    SpatEntropy::altieri(X, distbreak = r_vec[-breaks],
-                         verbose = FALSE, plotout = FALSE)
-  )
+  spa_entropy <- tryCatch({
+    # Try to calculate spatial entropy using SpatEntropy::altieri
+    SimDesign::quiet(
+      SpatEntropy::altieri(X, distbreak = r_vec[-breaks], verbose = FALSE, plotout = FALSE)
+    ) %>% suppressWarnings()
+  }, error = function(e) {
+    # In case of error, return a fallback data frame
+    data.frame(
+      r = r_vec,
+      spatial_entropy = NA,
+      residual_entropy = NA
+    )
+  })
+
+  if(inherits(spa_entropy, "data.frame")){
+    return(spa_entropy)
+  }
 
   if (length(spa_entropy$SPI.terms) < breaks){
     breaks = length(spa_entropy$SPI.terms)
     r = unique(c(spa_entropy$distance.breaks[,1],spa_entropy$distance.breaks[,2]))[-c(1)]
     r[breaks] = max(r_vec)
     r_vec = r
-    }
+  }
+  r_vec = round(r_vec, 2)
 
   df = data.frame(r = r_vec, spatial_entropy = spa_entropy$SPI.terms,
-                  residual_entropy  =  spa_entropy$RES.terms)
+                  residual_entropy  =  spa_entropy$RES.terms) %>%
+    dplyr::full_join(cells %>%
+                       dplyr::slice(rep(1:n(), each = length(r_vec))) %>%  # Repeat the row
+                       dplyr::mutate(r = r_vec),
+                     by = dplyr::join_by(r))
 
   df
 
@@ -54,4 +138,3 @@ entropy = function(X,
 
 
 
-
diff --git a/R/extract_model.R b/R/extract_model.R
@@ -48,6 +48,7 @@ extract_model = function(mxFDAobject, metric, type, model_name){
     if(grepl("[U|u]", metric[1]) & grepl("[K|k]", metric[2])) fit = mxFDAobject@functional_cox$Kest[[model_name]]
     if(grepl("[U|u]", metric[1]) & grepl("[G|g]", metric[2])) fit = mxFDAobject@functional_cox$Gest[[model_name]]
     if(grepl("[U|u]", metric[1]) & grepl("[L|l]", metric[2])) fit = mxFDAobject@functional_cox$Lest[[model_name]]
+    if(grepl("[M|m]", metric[1]) & grepl("[E|e]", metric[2])) fit = mxFDAobject@`functional_cox`$entropy[[model_name]]
   } else if(type == 'mcox'){
     #get mixed effects
     if(grepl("[B|b]", metric[1]) & grepl("[K|k]", metric[2])) fit = mxFDAobject@functional_mcox$Kcross[[model_name]]
@@ -56,6 +57,7 @@ extract_model = function(mxFDAobject, metric, type, model_name){
     if(grepl("[U|u]", metric[1]) & grepl("[K|k]", metric[2])) fit = mxFDAobject@functional_mcox$Kest[[model_name]]
     if(grepl("[U|u]", metric[1]) & grepl("[G|g]", metric[2])) fit = mxFDAobject@functional_mcox$Gest[[model_name]]
     if(grepl("[U|u]", metric[1]) & grepl("[L|l]", metric[2])) fit = mxFDAobject@functional_mcox$Lest[[model_name]]
+    if(grepl("[M|m]", metric[1]) & grepl("[E|e]", metric[2])) fit = mxFDAobject@functional_mcox$entropy[[model_name]]
   } else if(type == 'sofr'){
     #get scalar on functional regression
     if(grepl("[B|b]", metric[1]) & grepl("[K|k]", metric[2])) fit = mxFDAobject@scalar_on_functional$Kcross[[model_name]]
@@ -64,6 +66,7 @@ extract_model = function(mxFDAobject, metric, type, model_name){
     if(grepl("[U|u]", metric[1]) & grepl("[K|k]", metric[2])) fit = mxFDAobject@scalar_on_functional$Kest[[model_name]]
     if(grepl("[U|u]", metric[1]) & grepl("[G|g]", metric[2])) fit = mxFDAobject@scalar_on_functional$Gest[[model_name]]
     if(grepl("[U|u]", metric[1]) & grepl("[L|l]", metric[2])) fit = mxFDAobject@scalar_on_functional$Lest[[model_name]]
+    if(grepl("[M|m]", metric[1]) & grepl("[E|e]", metric[2])) fit = mxFDAobject@scalar_on_functional$entropy[[model_name]]
   }
 
   if(is.null(fit))