Merge branch 'feature/PSO' into develop

nickbrazeau · May 11, 2024 · 5b8731c · 5b8731c
2 parents 5240b33 + 973ae0d
commit 5b8731c
Show file tree

Hide file tree

Showing 26 changed files with 1,226 additions and 354 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,7 +4,8 @@ S3method(print,DISCresult)
 S3method(summary,DISCresult)
 S3method(tidyout,DISCresult)
 export("%>%")
-export(deme_inbreeding_spcoef)
+export(deme_inbreeding_spcoef_pso)
+export(deme_inbreeding_spcoef_vanilla)
 export(is.DISCresult)
 export(tidyout)
 importFrom(Rcpp,sourceCpp)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,7 +1,11 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-deme_inbreeding_coef_cpp <- function(args, args_functions, args_progress) {
-    .Call(`_discent_deme_inbreeding_coef_cpp`, args, args_functions, args_progress)
+pso_deme_inbreeding_coef_cpp <- function(args) {
+    .Call(`_discent_pso_deme_inbreeding_coef_cpp`, args)
+}
+
+vanilla_deme_inbreeding_coef_cpp <- function(args) {
+    .Call(`_discent_vanilla_deme_inbreeding_coef_cpp`, args)
 }
 
diff --git a/R/main_pso.R b/R/main_pso.R
@@ -0,0 +1,261 @@
+#' @title Identify Deme Inbreeding Spatial Coefficients in Continuous Space with
+#' Particle Swarm Meta-Optimization
+#' @inheritParams deme_inbreeding_spcoef_vanilla
+#' @param  fi_lowerbound
+#' @param  fi_upperbound
+#' @param  flearn_lowerbound
+#' @param  flearn_upperbound
+#' @param  mlearn_lowerbound
+#' @param  mlearn_upperbound
+#' @param  c1
+#' @param  c2
+#' @param  w
+#' @param  swarmsize
+#' @param  swarmsteps
+#' @param  searchsteps
+#' @param report_sd_progress boolean; search chain
+#' @param report_fd_progress boolean; final chain
+#' @inherit description deme_inbreeding_spcoef_vanilla
+#' @details The gen.geo.dist dataframe must be named with the following columns:
+#'          "smpl1"; "smpl2"; "deme1"; "deme2"; "gendist"; "geodist"; which corresponds to:
+#'          Sample 1 Name; Sample 2 Name; Sample 1 Location; Sample 2 Location;
+#'          Pairwise Genetic Distance; Pairwise Geographpic Distance. Note, the order of the
+#'          columns do not matter but the names of the columns must match.
+#' @details We have implemented coding decisions to not allow the "f" inbreeding coefficients to be negative by using a
+#' logit transformation internally in the code.
+#' @details Gradient descent is performed using the Adam (adaptive moment estimation) optimization approach. Default values
+#' for moment decay rates, epsilon, and learning rates are taken from \cite{DP Kingma, 2014}.
+#' @details The Particle Swarm Optimization is a meta-optimization (meta-heuristic) approach that attempts to find optimal
+#' start parameters for the user to avoid a grid-search approach as would be best practices for fine-tuning the gradient descent
+#' approach.
+#' @export
+
+deme_inbreeding_spcoef_pso <- function(discdat,
+                                       m_lowerbound = 1e-10,
+                                       m_upperbound = Inf,
+                                       fi_lowerbound= 1e-3,
+                                       fi_upperbound = 0.3,
+                                       flearn_lowerbound = 1e-10,
+                                       flearn_upperbound = 1e-2,
+                                       mlearn_lowerbound = 1e-15,
+                                       mlearn_upperbound = 1e-8,
+                                       c1 = 0.1,
+                                       c2 = 0.1,
+                                       w = 0.25,
+                                       b1 = 0.9,
+                                       b2 = 0.999,
+                                       e = 1e-8,
+                                       steps = 1e3,
+                                       searchsteps = 1e2,
+                                       swarmsteps = 50,
+                                       swarmsize = 25,
+                                       thin = 1,
+                                       normalize_geodist = TRUE,
+                                       report_sd_progress = TRUE,
+                                       report_fd_progress = TRUE,
+                                       return_verbose = FALSE){
+
+  #..............................................................
+  # Assertions & Catches
+  #..............................................................
+  if (!all(colnames(discdat) %in% c("smpl1", "smpl2", "deme1", "deme2", "gendist", "geodist"))) {
+    stop("The discdat dataframe must contain columns with names: smpl1, smpl2, deme1, deme2, gendist, geodist")
+  }
+  # make sure correct order
+  for (i in 1:6) {
+    if (colnames(discdat)[i] != c("smpl1", "smpl2", "deme1", "deme2", "gendist", "geodist")[i]) {
+      stop("The discdat dataframe must contain columns with names in the exact order of: smpl1, smpl2, deme1, deme2, gendist, geodist")
+    }
+  }
+
+  assert_dataframe(discdat)
+  assert_single_numeric(b1)
+  assert_single_numeric(b2)
+  assert_single_numeric(e)
+  assert_single_numeric(c1)
+  assert_single_numeric(c2)
+  assert_single_numeric(w)
+  assert_single_numeric(m_lowerbound)
+  assert_single_numeric(m_upperbound)
+  assert_gr(m_upperbound, m_lowerbound)
+  assert_single_numeric(fi_lowerbound)
+  assert_single_numeric(fi_upperbound)
+  assert_gr(fi_upperbound, fi_lowerbound)
+  assert_single_numeric(flearn_lowerbound)
+  assert_single_numeric(flearn_upperbound)
+  assert_gr(flearn_upperbound, flearn_lowerbound)
+  assert_single_numeric(mlearn_lowerbound)
+  assert_single_numeric(mlearn_upperbound)
+  assert_gr(mlearn_upperbound, mlearn_lowerbound)
+  assert_single_int(steps)
+  assert_single_int(thin)
+  assert_greq(thin, 1, message = "Must be at least 1")
+  ssert_single_logical(report_sd_progress)
+  assert_single_logical(report_fd_progress)
+  assert_single_logical(normalize_geodist)
+
+  # no missing
+  if(sum(is.na(discdat)) != 0) {
+    stop("discdat dataframe cannot have missing values")
+  }
+
+  # catch accidental bad F and M bound
+  if ( any(round(c(fi_lowerbound, m_lowerbound), digits = 1e200) == 0) ) {
+    warning("The Fi or M lower-bound is zero (or essentially zero), which will result in unstable behavior in the Gradient-Descent algorithm. Consider increasing the lower-bound limit")
+  }
+
+  #......................
+  # check for self comparisons
+  #......................
+  sapply(discdat$geodist, assert_neq, y = 0,
+         message = "No within-deme sample comparisons allowed. Geodistance should not be 0")
+  mapply(assert_neq, discdat$deme1, discdat$deme2,
+         message = "No within-deme sample comparisons allowed. Locat names should not be the same")
+
+  #............................................................
+  # R manipulations before C++
+  #...........................................................
+  # use efficient R functions to group pairs and wrangle data for faster C++ manipulation
+  # get deme names and lift over sorted names for i and j
+  demes <- sort(unique(c(discdat$deme1, discdat$deme2)))
+  keyi <- data.frame(deme1 = demes, i = 1:length(demes))
+  keyj <- data.frame(deme2 = demes, j = 1:length(demes))
+
+  # transform data w/ logit
+  discdat <- discdat %>%
+    dplyr::mutate(gendist = ifelse(gendist > 0.999, 0.999,
+                                   ifelse(gendist < 0.001, 0.001,
+                                          gendist))) %>% # reasonable bounds on logit
+    dplyr::mutate(gendist = logit(gendist))
+
+  # get genetic data by pairs through efficient nest
+  gendist <- discdat %>%
+    expand_pairwise(.) %>% # get all pairwise for full matrix
+    dplyr::select(c("deme1", "deme2", "gendist")) %>%
+    dplyr::group_by_at(c("deme1", "deme2")) %>%
+    tidyr::nest(.) %>%
+    dplyr::left_join(., keyi, by = "deme1") %>%
+    dplyr::left_join(., keyj, by = "deme2") %>%
+    dplyr::arrange_at(c("i", "j"))
+
+
+  # put gendist into an array
+  # NB we are filling an array with dimension of size:
+  #   locations x locations x max K-pairs
+  # Will fill in w/ -1 to indicate missing/skip where demes do not
+  # have max pairs.
+  # This approach wastes memory but allows for a structured array
+  # versus a list with varying sizes (and eventually a more efficient for-loop)
+  n_Kpairmax <- max(purrr::map_dbl(gendist$data, nrow))
+  gendist_arr <- array(data = -1, dim = c(length(demes), length(demes), n_Kpairmax))
+  for (i in 1:nrow(gendist)) {
+    gendist_arr[gendist$i[i], gendist$j[i], 1:nrow(gendist$data[[i]])] <- unname(unlist(gendist$data[[i]]))
+  }
+
+  # normalize geodistances per user; NB have already removed self comparisons, so no 0s
+  if (normalize_geodist) {
+    mingeodist <- min(discdat$geodist)
+    maxgeodist <- max(discdat$geodist)
+    discdat <- discdat %>%
+      dplyr::mutate(geodist = (geodist - mingeodist)/(maxgeodist - mingeodist))
+  }
+
+  # put geo information into distance matrix
+  geodist <- discdat %>%
+    expand_pairwise(.) %>% # get all pairwise for full matrix
+    dplyr::select(c("deme1", "deme2", "geodist")) %>%
+    dplyr::group_by_at(c("deme1", "deme2")) %>%
+    tidyr::nest(.) %>%
+    dplyr::left_join(., keyi, by = "deme1") %>%
+    dplyr::left_join(., keyj, by = "deme2") %>%
+    dplyr::arrange_at(c("i", "j"))
+
+  # simplify geodistance data storage
+  geodist$data <- purrr::map_dbl(geodist$data, function(x){
+    if (length(unique(unlist(x))) != 1) {
+      stop("deme1 and deme2 have different geodistances among P-sample combinations. Distances should all be same among samples")
+    }
+    return( unique(unlist(x)) ) # all same by unique
+  }
+  )
+
+  # upper tri
+  geodist_mat <- matrix(data = -1, nrow = length(demes), ncol = length(demes))
+  for (i in 1:nrow(geodist)) {
+    geodist_mat[geodist$i[i], geodist$j[i]] <- geodist$data[i]
+  }
+  diag(geodist_mat) <- 0
+
+  #..............................................................
+  # run efficient C++ function
+  #..............................................................
+
+  args <- list(gendist = as.vector(gendist_arr),
+               geodist = as.vector(geodist_mat),
+               n_Demes = length(demes),
+               n_Kpairmax = n_Kpairmax,
+               m_lowerbound = m_lowerbound,
+               m_upperbound = m_upperbound,
+               fi_lowerbound = logit( fi_lowerbound ),
+               fi_upperbound = logit( fi_upperbound ),
+               flearn_lowerbound = flearn_lowerbound,
+               flearn_upperbound = flearn_upperbound,
+               mlearn_lowerbound = mlearn_lowerbound,
+               mlearn_upperbound = mlearn_upperbound,
+               b1 = b1,
+               b2 = b2,
+               e = e,
+               c1 = c1,
+               c2 = c2,
+               w = w,
+               swarmsteps = swarmsteps,
+               swarmsize = swarmsize,
+               searchsteps = searchsteps,
+               steps = steps,
+               report_sd_progress = report_sd_progress,
+               report_fd_progress = report_fd_progress,
+               return_verbose = return_verbose
+  )
+
+  # run
+  output_raw <- pso_deme_inbreeding_coef_cpp(args)
+
+
+  # set up thinning
+  thin_its <- seq(1, steps, by = thin)
+  thin_its <- unique(c(thin_its, steps)) # always include last iteration
+
+  # process output
+  colnames(keyi) <- c("Deme", "key")
+  if (return_verbose) {
+    output <- list(
+      deme_key = keyi,
+      m_run = output_raw$m_run[thin_its],
+      fi_run = expit(do.call("rbind", output_raw$fi_run))[thin_its, ],
+      m_gradtraj = output_raw$m_gradtraj[thin_its],
+      fi_gradtraj = do.call("rbind", output_raw$fi_gradtraj)[thin_its, ],
+      m_1moment = output_raw$m_firstmoment[thin_its],
+      m_2moment = output_raw$m_secondmoment[thin_its],
+      fi_1moment = do.call("rbind", output_raw$fi_firstmoment)[thin_its, ],
+      fi_2moment = do.call("rbind", output_raw$fi_secondmoment)[thin_its, ],
+      cost = output_raw$cost[thin_its],
+      Final_Fis = expit(output_raw$Final_Fis),
+      Final_m = output_raw$Final_m,
+      swarm = output_raw$swarm
+    )
+
+  } else {
+    output <- list(
+      deme_key = keyi,
+      cost = output_raw$cost[thin_its],
+      m_run = output_raw$m_run[thin_its],
+      fi_run = expit(do.call("rbind", output_raw$fi_run))[thin_its, ],
+      Final_Fis = expit(output_raw$Final_Fis),
+      Final_m = output_raw$Final_m)
+  }
+
+  # add S3 class structure
+  attr(output, "class") <- "psoDISCresult"
+  return(output)
+}
+