From 7d040f0a0dd8828d23fa15c93f4ca61be1aba442 Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Thu, 25 Feb 2021 10:00:48 +1000 Subject: [PATCH 1/7] Adding updates to description and other notes for next CRAN release (v1.04) --- DESCRIPTION | 4 ++-- NEWS.md | 1 + R/MRFcov.R | 11 ++++++++++- R/MRFcov_spatial.R | 11 ++++++++++- R/predict_MRF.R | 12 +++++++++++- R/predict_MRFnetworks.R | 9 ++++++--- cran-comments.md | 2 +- docs/README.Rmd | 2 +- man/MRFcov.Rd | 4 +++- man/MRFcov_spatial.Rd | 5 ++++- man/predict_MRF.Rd | 5 ++++- man/predict_MRFnetworks.Rd | 5 ++++- tests/testthat/test-MRFcov.R | 2 +- 13 files changed, 58 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 150acdc..3c66b43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: MRFcov Type: Package Title: Markov Random Fields with Additional Covariates -Version: 1.0.37 -Date/Publication: 2019-11-04 +Version: 1.0.4 +Date/Publication: 2021-02-25 URL: https://github.com/nicholasjclark/MRFcov Authors@R: c( person("Nicholas J", "Clark", email = "nicholas.j.clark1214@gmail.com", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 91400ff..6711afa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # MRFcov 1.0.40 * Improved nonparanormal mapping of predictions for discrete variables +* Option to remove progress bar in multicore estimation, significantly speeding up analysis runs # MRFcov 1.0.39 * Improvement for the detection of infinite values in data and coordinates diff --git a/R/MRFcov.R b/R/MRFcov.R index 8e103e7..bd1af18 100644 --- a/R/MRFcov.R +++ b/R/MRFcov.R @@ -42,6 +42,7 @@ #'coefficients are estimated on the identity scale AFTER using a nonparanormal transformation. #'See \code{vignette("Gaussian_Poisson_CRFs")} for details of interpretation #'@param bootstrap Logical. Used by \code{\link{bootstrap_MRF}} to reduce memory usage +#'@param progress_bar Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation. #' #'@return A \code{list} containing: #'\itemize{ @@ -120,7 +121,7 @@ #' MRFcov <- function(data, symmetrise, prep_covariates, n_nodes, n_cores, n_covariates, - family, bootstrap = FALSE) { + family, bootstrap = FALSE, progress_bar = FALSE) { #### Specify default parameter values and initiate warnings #### if(!(family %in% c('gaussian', 'poisson', 'binomial'))) @@ -364,6 +365,14 @@ MRFcov <- function(data, symmetrise, #### If n_cores > 1, check for parallel loading before initiating parallel clusters #### if(n_cores > 1){ + + # Progress bar significantly slows estimation, set to FALSE unless specified + if(progress_bar){ + pbapply::pboptions(style = 1, char = "+", type = 'timer') + } else { + pbapply::pboptions(type="none") + } + #Initiate the n_cores parallel clusters cl <- makePSOCKcluster(n_cores) setDefaultCluster(cl) diff --git a/R/MRFcov_spatial.R b/R/MRFcov_spatial.R index dc5147e..f190218 100644 --- a/R/MRFcov_spatial.R +++ b/R/MRFcov_spatial.R @@ -61,6 +61,7 @@ #'@param prep_splines Logical. If spatial splines are already included in \code{data}, set to #'\code{FALSE}. Default is \code{TRUE} #'@param bootstrap Logical. Used by \code{\link{bootstrap_MRF}} to reduce memory usage +#'@param progress_bar Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation. #' #'@return A \code{list} of all elements contained in a returned \code{\link{MRFcov}} object, with #'the inclusion of a \code{dataframe} called \code{mrf_data}. This contains all prepped covariates @@ -86,7 +87,7 @@ #'@export #' MRFcov_spatial <- function(data, symmetrise, prep_covariates, n_nodes, n_cores, n_covariates, - family, coords, prep_splines = TRUE, bootstrap = FALSE) { + family, coords, prep_splines = TRUE, bootstrap = FALSE, progress_bar = FALSE) { #### Specify default parameter values and initiate warnings #### if(!(family %in% c('gaussian', 'poisson', 'binomial'))) @@ -381,6 +382,14 @@ MRFcov_spatial <- function(data, symmetrise, prep_covariates, n_nodes, n_cores, #### If n_cores > 1, check for parallel loading before initiating parallel clusters #### if(n_cores > 1){ + + # Progress bar significantly slows estimation, set to FALSE unless specified + if(progress_bar){ + pbapply::pboptions(style = 1, char = "+", type = 'timer') + } else { + pbapply::pboptions(type="none") + } + #Initiate the n_cores parallel clusters cl <- makePSOCKcluster(n_cores) setDefaultCluster(cl) diff --git a/R/predict_MRF.R b/R/predict_MRF.R index 52d1ac5..dd50875 100755 --- a/R/predict_MRF.R +++ b/R/predict_MRF.R @@ -15,6 +15,7 @@ #'by cross-multiplication (\code{TRUE} by default; \code{FALSE} when used in other functions) #'@param n_cores Positive integer stating the number of processing cores to split the job across. #'Default is \code{1} (no parallelisation) +#'@param progress_bar Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation. #'@return A \code{matrix} containing predictions for each observation in \code{data}. If #'\code{family = "binomial"}, a second element containing binary #'predictions for nodes is returned. @@ -59,7 +60,8 @@ #' #'@export #' -predict_MRF <- function(data, MRF_mod, prep_covariates = TRUE, n_cores){ +predict_MRF <- function(data, MRF_mod, prep_covariates = TRUE, n_cores, + progress_bar = FALSE){ if(missing(n_cores)){ n_cores <- 1 @@ -67,6 +69,14 @@ predict_MRF <- function(data, MRF_mod, prep_covariates = TRUE, n_cores){ #### If n_cores > 1, check parallel library loading #### if(n_cores > 1){ + + # Progress bar significantly slows estimation, set to FALSE unless specified + if(progress_bar){ + pbapply::pboptions(style = 1, char = "+", type = 'timer') + } else { + pbapply::pboptions(type="none") + } + #Initiate the n_cores parallel clusters cl <- makePSOCKcluster(n_cores) setDefaultCluster(cl) diff --git a/R/predict_MRFnetworks.R b/R/predict_MRFnetworks.R index 08d6a74..95a8df5 100644 --- a/R/predict_MRFnetworks.R +++ b/R/predict_MRFnetworks.R @@ -30,6 +30,7 @@ #'networks from \code{\link{MRFcov_spatial}} objects) #'@param n_cores Positive integer stating the number of processing cores to split the job across. #'Default is \code{1} (no parallelisation) +#'@param progress_bar Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation. #' #'@return Either a \code{matrix} with \code{nrow = nrow(data)}, #'containing each species' predicted network metric at each observation in \code{data}, or @@ -64,7 +65,7 @@ #' predict_MRFnetworks = function(data, MRF_mod, cutoff, omit_zeros, metric, cached_predictions = NULL, prep_covariates, - n_cores){ + n_cores, progress_bar = FALSE){ if(missing(n_cores)){ n_cores <- 1 @@ -157,7 +158,8 @@ predict_MRFnetworks = function(data, MRF_mod, cutoff, omit_zeros, metric, if(MRF_mod$mod_family == "binomial"){ if(is.null(cached_predictions)){ preds <- predict_MRF(pred.prepped.dat, MRF_mod_booted, - prep_covariates = FALSE, n_cores = n_cores)$Probability_predictions + prep_covariates = FALSE, n_cores = n_cores, + progress_bar = progress_bar)$Probability_predictions } else { preds <- cached_predictions$Probability_predictions } @@ -165,7 +167,8 @@ predict_MRFnetworks = function(data, MRF_mod, cutoff, omit_zeros, metric, } else { if(is.null(cached_predictions)){ preds <- predict_MRF(pred.prepped.dat, MRF_mod_booted, - prep_covariates = FALSE, n_cores = n_cores) + prep_covariates = FALSE, n_cores = n_cores, + progress_bar = progress_bar) } else { preds <- cached_predictions } diff --git a/cran-comments.md b/cran-comments.md index 76d0e96..f6306b6 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,4 +1,4 @@ -## Patch updates for release v1.0.40 +## Patch updates for release v1.1 This update contains improvments to the nonparanormal mapping of predictions for discrete variables ## Test environments diff --git a/docs/README.Rmd b/docs/README.Rmd index 2164065..396b4e3 100644 --- a/docs/README.Rmd +++ b/docs/README.Rmd @@ -94,7 +94,7 @@ MRF_mod$key_coefs$Plas MRF_mod$key_coefs$Microfilaria ``` -To work through more in-depth tutorials and examples, see the vignettes in the package and check out papers that have been published using the method +To work through more in-depth tutorials and examples, see the vignettes in the package and check out some of the papers that have been published using the method ```{r eval=FALSE} vignette("Bird_Parasite_CRF") vignette("Gaussian_Poisson_CRFs") diff --git a/man/MRFcov.Rd b/man/MRFcov.Rd index be8b47a..3167423 100644 --- a/man/MRFcov.Rd +++ b/man/MRFcov.Rd @@ -5,7 +5,7 @@ \title{Markov Random Fields with covariates} \usage{ MRFcov(data, symmetrise, prep_covariates, n_nodes, n_cores, n_covariates, - family, bootstrap = FALSE) + family, bootstrap = FALSE, progress_bar = FALSE) } \arguments{ \item{data}{A \code{dataframe}. The input data where the \code{n_nodes} @@ -45,6 +45,8 @@ coefficients are estimated on the identity scale AFTER using a nonparanormal tra See \code{vignette("Gaussian_Poisson_CRFs")} for details of interpretation} \item{bootstrap}{Logical. Used by \code{\link{bootstrap_MRF}} to reduce memory usage} + +\item{progress_bar}{Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation.} } \value{ A \code{list} containing: diff --git a/man/MRFcov_spatial.Rd b/man/MRFcov_spatial.Rd index ee4037f..c12d11e 100644 --- a/man/MRFcov_spatial.Rd +++ b/man/MRFcov_spatial.Rd @@ -5,7 +5,8 @@ \title{Spatially structured Markov Random Fields with covariates} \usage{ MRFcov_spatial(data, symmetrise, prep_covariates, n_nodes, n_cores, - n_covariates, family, coords, prep_splines = TRUE, bootstrap = FALSE) + n_covariates, family, coords, prep_splines = TRUE, bootstrap = FALSE, + progress_bar = FALSE) } \arguments{ \item{data}{A \code{dataframe}. The input data where the \code{n_nodes} @@ -66,6 +67,8 @@ the \code{\link{predict_MRF}} function} \code{FALSE}. Default is \code{TRUE}} \item{bootstrap}{Logical. Used by \code{\link{bootstrap_MRF}} to reduce memory usage} + +\item{progress_bar}{Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation.} } \value{ A \code{list} of all elements contained in a returned \code{\link{MRFcov}} object, with diff --git a/man/predict_MRF.Rd b/man/predict_MRF.Rd index 3b015bf..38021f2 100644 --- a/man/predict_MRF.Rd +++ b/man/predict_MRF.Rd @@ -4,7 +4,8 @@ \alias{predict_MRF} \title{Predict training observations from fitted MRFcov models} \usage{ -predict_MRF(data, MRF_mod, prep_covariates = TRUE, n_cores) +predict_MRF(data, MRF_mod, prep_covariates = TRUE, n_cores, + progress_bar = FALSE) } \arguments{ \item{data}{Dataframe. The input data to be predicted, where the \code{n_nodes} @@ -20,6 +21,8 @@ by cross-multiplication (\code{TRUE} by default; \code{FALSE} when used in other \item{n_cores}{Positive integer stating the number of processing cores to split the job across. Default is \code{1} (no parallelisation)} + +\item{progress_bar}{Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation.} } \value{ A \code{matrix} containing predictions for each observation in \code{data}. If diff --git a/man/predict_MRFnetworks.Rd b/man/predict_MRFnetworks.Rd index 51c9d29..7c61b5c 100644 --- a/man/predict_MRFnetworks.Rd +++ b/man/predict_MRFnetworks.Rd @@ -6,7 +6,8 @@ equations from a fitted \code{MRFcov} object} \usage{ predict_MRFnetworks(data, MRF_mod, cutoff, omit_zeros, metric, - cached_predictions = NULL, prep_covariates, n_cores) + cached_predictions = NULL, prep_covariates, n_cores, + progress_bar = FALSE) } \arguments{ \item{data}{Dataframe. The sample data where the @@ -40,6 +41,8 @@ networks from \code{\link{MRFcov_spatial}} objects)} \item{n_cores}{Positive integer stating the number of processing cores to split the job across. Default is \code{1} (no parallelisation)} + +\item{progress_bar}{Logical. Progress bar in pbapply is used if \code{TRUE}, but this slows estimation.} } \value{ Either a \code{matrix} with \code{nrow = nrow(data)}, diff --git a/tests/testthat/test-MRFcov.R b/tests/testthat/test-MRFcov.R index d298ab7..d6bdf4c 100644 --- a/tests/testthat/test-MRFcov.R +++ b/tests/testthat/test-MRFcov.R @@ -26,7 +26,7 @@ test_that("binary data should only have two possible values", { }) inf.dat <- Bird.parasites -inf.dat[1,1] <- 'Inf' +inf.dat[1,1] <- Inf test_that("infinite values are not allowed", { expect_success(expect_error(MRFcov(data = inf.dat, n_nodes = 4, From 09ca6c657d3fc39fd89b43db532f7f11759d6755 Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Thu, 25 Feb 2021 10:55:23 +1000 Subject: [PATCH 2/7] Added package url --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 3c66b43..98a4fcc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,6 +34,7 @@ Imports: purrr, mgcv, MASS License: GPL-3 +URL: https://github.com/nicholasjclark/MRFcov Encoding: UTF-8 RoxygenNote: 6.1.1 LazyData: true From 41511739ce449105d3569a562150a97b3d24e603 Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Thu, 25 Feb 2021 11:20:04 +1000 Subject: [PATCH 3/7] editing description duplicates --- .Rbuildignore | 6 ------ CRAN-RELEASE | 2 ++ DESCRIPTION | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) create mode 100644 CRAN-RELEASE diff --git a/.Rbuildignore b/.Rbuildignore index c8e4094..e69de29 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,6 +0,0 @@ -^.*\.Rproj$ -^\.Rproj\.user$ -^README\.Rmd$ -^README-.*\.png$ -^docs$ -^cran-comments.md$ diff --git a/CRAN-RELEASE b/CRAN-RELEASE new file mode 100644 index 0000000..f97ee7f --- /dev/null +++ b/CRAN-RELEASE @@ -0,0 +1,2 @@ +This package was submitted to CRAN on 2021-02-25. +Once it is accepted, delete this file and tag the release (commit 09ca6c6). diff --git a/DESCRIPTION b/DESCRIPTION index 98a4fcc..3c66b43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,7 +34,6 @@ Imports: purrr, mgcv, MASS License: GPL-3 -URL: https://github.com/nicholasjclark/MRFcov Encoding: UTF-8 RoxygenNote: 6.1.1 LazyData: true From 2be47b984561b9c248ccdf6228ea58b3fa3da5c9 Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Thu, 25 Feb 2021 20:36:48 +1000 Subject: [PATCH 4/7] Remove buildignore and other leftovers --- .Rbuildignore | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .Rbuildignore diff --git a/.Rbuildignore b/.Rbuildignore deleted file mode 100644 index e69de29..0000000 From b09bbcf312187fadbc862b240050e11130e0daac Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Tue, 9 Mar 2021 10:40:57 +1000 Subject: [PATCH 5/7] minor edits for CRAN release v1.0.38 --- .Rbuildignore | 5 + .gitignore | 2 + CRAN-RELEASE | 4 +- DESCRIPTION | 4 +- R/Bird.parasites.R | 2 +- cran-comments.md | 19 +- man/Bird.parasites.Rd | 8 +- man/MRFcov.Rd | 13 +- man/MRFcov_spatial.Rd | 16 +- man/bootstrap_MRF.Rd | 16 +- man/cv_MRF_diag.Rd | 55 ++- man/predict_MRF.Rd | 9 +- man/predict_MRFnetworks.Rd | 14 +- vignettes/Bird_Parasite_CRF.R | 107 ------ vignettes/Bird_Parasite_CRF.html | 487 --------------------------- vignettes/CRF_data_prep.R | 31 -- vignettes/CRF_data_prep.html | 348 ------------------- vignettes/Gaussian_Poisson_CRFs.R | 63 ---- vignettes/Gaussian_Poisson_CRFs.Rmd | 178 +++++----- vignettes/Gaussian_Poisson_CRFs.html | 410 ---------------------- 20 files changed, 221 insertions(+), 1570 deletions(-) create mode 100644 .Rbuildignore delete mode 100644 vignettes/Bird_Parasite_CRF.R delete mode 100644 vignettes/Bird_Parasite_CRF.html delete mode 100644 vignettes/CRF_data_prep.R delete mode 100644 vignettes/CRF_data_prep.html delete mode 100644 vignettes/Gaussian_Poisson_CRFs.R delete mode 100644 vignettes/Gaussian_Poisson_CRFs.html diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..c9d9bd8 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,5 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^doc$ +^Meta$ +^CRAN-RELEASE$ diff --git a/.gitignore b/.gitignore index 2dfe80f..0584128 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ README-unnamed-chunk-6-1.png README-unnamed-chunk-7-1.png README-unnamed-chunk-8-1.png README-unnamed-chunk-9-1.png +doc +Meta diff --git a/CRAN-RELEASE b/CRAN-RELEASE index f97ee7f..1a152a0 100644 --- a/CRAN-RELEASE +++ b/CRAN-RELEASE @@ -1,2 +1,2 @@ -This package was submitted to CRAN on 2021-02-25. -Once it is accepted, delete this file and tag the release (commit 09ca6c6). +This package was submitted to CRAN on 2021-03-09. +Once it is accepted, delete this file and tag the release (commit 2be47b9). diff --git a/DESCRIPTION b/DESCRIPTION index 3c66b43..1e25e58 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MRFcov Type: Package Title: Markov Random Fields with Additional Covariates -Version: 1.0.4 +Version: 1.0.38 Date/Publication: 2021-02-25 URL: https://github.com/nicholasjclark/MRFcov Authors@R: c( @@ -35,7 +35,7 @@ Imports: purrr, MASS License: GPL-3 Encoding: UTF-8 -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.1 LazyData: true Suggests: testthat, knitr, diff --git a/R/Bird.parasites.R b/R/Bird.parasites.R index f76d9e3..23af0d7 100644 --- a/R/Bird.parasites.R +++ b/R/Bird.parasites.R @@ -17,5 +17,5 @@ #' @references Clark, N.J., Wells, K., Dimitrov, D. & Clegg, S.M. (2016) #' Co-infections and environmental conditions drive the distributions of blood #' parasites in wild birds. Journal of Animal Ecology, 85, 1461-1470. -#' @source \url{http://dx.doi.org/10.5061/dryad.pp6k4} +#' @source \doi{10.5061/dryad.pp6k4} "Bird.parasites" diff --git a/cran-comments.md b/cran-comments.md index f6306b6..0402dca 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,4 +1,4 @@ -## Patch updates for release v1.1 +## Patch updates for release v1.0.38 This update contains improvments to the nonparanormal mapping of predictions for discrete variables ## Test environments @@ -8,8 +8,19 @@ This update contains improvments to the nonparanormal mapping of predictions for ## R CMD check results There were no ERRORs or WARNINGs. -The following NOTE appeared in the win-builder check: -* checking CRAN incoming feasibility ... NOTE +An ERROR appears in the win-builder check for r-devel-windows-ix86+x86_64, likely owing to failures on r-devel-windows-ix86+x86_64 due to missing dependencies that need compilation. A test using `rhub::check( platform="windows-x86_64-devel", env_vars=c(R_COMPILE_AND_INSTALL_PACKAGES = "always"))` passes the win-builder check + +Previously the version of this update was not sufficient for CRAN requirements: Insufficient package version (submitted: 1.0.4, existing: 1.0.37). This has been changed (new version is 1.0.38) + +A URL in the one of the vignettes previously did not have https. The CRAN note was +* Found the following (possibly) invalid URLs: URL: http://www.jmlr.org/papers/v10/liu09a.html (moved to https://www.jmlr.org/papers/v10/liu09a.html) From: inst/doc/Gaussian_Poisson_CRFs.html Status: 200 + +This has been changed to the suggested https format + +Previously a DOI was not formatted according to CRAN requirements: Found the following URLs which should use \doi (with the DOI name only): File 'Bird.parasites.Rd': http://dx.doi.org/10.5061/dryad.pp6k4 + +This has been fixed + + Maintainer: 'Nicholas J Clark ' -No action was taken, as this note apparently reminds CRAN maintainers to check that the submission actually comes from myself and not anybody else. diff --git a/man/Bird.parasites.Rd b/man/Bird.parasites.Rd index 5528283..2401a19 100644 --- a/man/Bird.parasites.Rd +++ b/man/Bird.parasites.Rd @@ -4,7 +4,8 @@ \name{Bird.parasites} \alias{Bird.parasites} \title{Blood parasite occurrences in New Caledonian birds.} -\format{A data frame with 449 rows and 5 variables: +\format{ +A data frame with 449 rows and 5 variables: \describe{ \item{Hzosteropis}{binary occurrence of \emph{Haemoproteus zosteropis}} \item{Hkillangoi}{binary occurrence of \emph{Haemoproteus killangoi}} @@ -12,9 +13,10 @@ \item{Microfilaria}{binary occurrence of Microfilaria species} \item{scale.prop.zos}{scaled numeric variable representing relative abundance of \emph{Zosterops} host species} -}} +} +} \source{ -\url{http://dx.doi.org/10.5061/dryad.pp6k4} +\doi{10.5061/dryad.pp6k4} } \usage{ Bird.parasites diff --git a/man/MRFcov.Rd b/man/MRFcov.Rd index 3167423..8b7577f 100644 --- a/man/MRFcov.Rd +++ b/man/MRFcov.Rd @@ -4,8 +4,17 @@ \alias{MRFcov} \title{Markov Random Fields with covariates} \usage{ -MRFcov(data, symmetrise, prep_covariates, n_nodes, n_cores, n_covariates, - family, bootstrap = FALSE, progress_bar = FALSE) +MRFcov( + data, + symmetrise, + prep_covariates, + n_nodes, + n_cores, + n_covariates, + family, + bootstrap = FALSE, + progress_bar = FALSE +) } \arguments{ \item{data}{A \code{dataframe}. The input data where the \code{n_nodes} diff --git a/man/MRFcov_spatial.Rd b/man/MRFcov_spatial.Rd index c12d11e..deb1c1b 100644 --- a/man/MRFcov_spatial.Rd +++ b/man/MRFcov_spatial.Rd @@ -4,9 +4,19 @@ \alias{MRFcov_spatial} \title{Spatially structured Markov Random Fields with covariates} \usage{ -MRFcov_spatial(data, symmetrise, prep_covariates, n_nodes, n_cores, - n_covariates, family, coords, prep_splines = TRUE, bootstrap = FALSE, - progress_bar = FALSE) +MRFcov_spatial( + data, + symmetrise, + prep_covariates, + n_nodes, + n_cores, + n_covariates, + family, + coords, + prep_splines = TRUE, + bootstrap = FALSE, + progress_bar = FALSE +) } \arguments{ \item{data}{A \code{dataframe}. The input data where the \code{n_nodes} diff --git a/man/bootstrap_MRF.Rd b/man/bootstrap_MRF.Rd index ccbccfa..0894d0d 100644 --- a/man/bootstrap_MRF.Rd +++ b/man/bootstrap_MRF.Rd @@ -4,9 +4,19 @@ \alias{bootstrap_MRF} \title{Bootstrap observations to estimate MRF parameter coefficients} \usage{ -bootstrap_MRF(data, n_bootstraps, sample_seed, symmetrise, n_nodes, - n_cores, n_covariates, family, sample_prop, spatial = FALSE, - coords = NULL) +bootstrap_MRF( + data, + n_bootstraps, + sample_seed, + symmetrise, + n_nodes, + n_cores, + n_covariates, + family, + sample_prop, + spatial = FALSE, + coords = NULL +) } \arguments{ \item{data}{Dataframe. The input data where the \code{n_nodes} diff --git a/man/cv_MRF_diag.Rd b/man/cv_MRF_diag.Rd index d2666ac..aa45f09 100644 --- a/man/cv_MRF_diag.Rd +++ b/man/cv_MRF_diag.Rd @@ -6,16 +6,51 @@ \alias{cv_MRF_diag_rep_spatial} \title{MRF cross validation and assessment of predictive performance} \usage{ -cv_MRF_diag(data, symmetrise, n_nodes, n_cores, sample_seed, n_folds, - n_fold_runs, n_covariates, compare_null, family, plot = TRUE, - cached_model, cached_predictions, mod_labels = NULL) - -cv_MRF_diag_rep(data, symmetrise, n_nodes, n_cores, sample_seed, n_folds, - n_fold_runs, n_covariates, compare_null, family, plot = TRUE) - -cv_MRF_diag_rep_spatial(data, coords, symmetrise, n_nodes, n_cores, - sample_seed, n_folds, n_fold_runs, n_covariates, compare_null, family, - plot = TRUE) +cv_MRF_diag( + data, + symmetrise, + n_nodes, + n_cores, + sample_seed, + n_folds, + n_fold_runs, + n_covariates, + compare_null, + family, + plot = TRUE, + cached_model, + cached_predictions, + mod_labels = NULL +) + +cv_MRF_diag_rep( + data, + symmetrise, + n_nodes, + n_cores, + sample_seed, + n_folds, + n_fold_runs, + n_covariates, + compare_null, + family, + plot = TRUE +) + +cv_MRF_diag_rep_spatial( + data, + coords, + symmetrise, + n_nodes, + n_cores, + sample_seed, + n_folds, + n_fold_runs, + n_covariates, + compare_null, + family, + plot = TRUE +) } \arguments{ \item{data}{Dataframe. The input data where the \code{n_nodes} diff --git a/man/predict_MRF.Rd b/man/predict_MRF.Rd index 38021f2..99d38fb 100644 --- a/man/predict_MRF.Rd +++ b/man/predict_MRF.Rd @@ -4,8 +4,13 @@ \alias{predict_MRF} \title{Predict training observations from fitted MRFcov models} \usage{ -predict_MRF(data, MRF_mod, prep_covariates = TRUE, n_cores, - progress_bar = FALSE) +predict_MRF( + data, + MRF_mod, + prep_covariates = TRUE, + n_cores, + progress_bar = FALSE +) } \arguments{ \item{data}{Dataframe. The input data to be predicted, where the \code{n_nodes} diff --git a/man/predict_MRFnetworks.Rd b/man/predict_MRFnetworks.Rd index 7c61b5c..074a40a 100644 --- a/man/predict_MRFnetworks.Rd +++ b/man/predict_MRFnetworks.Rd @@ -5,9 +5,17 @@ \title{Extract predicted network metrics for observations in a given dataset using equations from a fitted \code{MRFcov} object} \usage{ -predict_MRFnetworks(data, MRF_mod, cutoff, omit_zeros, metric, - cached_predictions = NULL, prep_covariates, n_cores, - progress_bar = FALSE) +predict_MRFnetworks( + data, + MRF_mod, + cutoff, + omit_zeros, + metric, + cached_predictions = NULL, + prep_covariates, + n_cores, + progress_bar = FALSE +) } \arguments{ \item{data}{Dataframe. The sample data where the diff --git a/vignettes/Bird_Parasite_CRF.R b/vignettes/Bird_Parasite_CRF.R deleted file mode 100644 index 9f24918..0000000 --- a/vignettes/Bird_Parasite_CRF.R +++ /dev/null @@ -1,107 +0,0 @@ -## ---- message=FALSE, warning=FALSE--------------------------------------- -library(MRFcov) -data("Bird.parasites") - -## ----message=F, warning=FALSE, eval = FALSE------------------------------ -# #Not run -# #install.packages(dplyr) -# data.paras = data.frame(data.paras) %>% -# dplyr::group_by(Capturesession,Genus) %>% -# dplyr::summarise(count = dlyr::n()) %>% -# dplyr::mutate(prop.zos = count / sum(count)) %>% -# dplyr::left_join(data.paras) %>% -# dplyr::ungroup() %>% dplyr::filter(Genus == 'Zosterops') %>% -# dplyr::mutate(scale.prop.zos = as.vector(scale(prop.zos))) -# data.paras <- data.paras[, c(12:15, 23)] - -## ----eval=FALSE---------------------------------------------------------- -# help("Bird.parasites") -# View(Bird.parasites) - -## ------------------------------------------------------------------------ -MRF_fit <- MRFcov(data = Bird.parasites[, c(1:4)], n_nodes = 4, family = 'binomial') - -## ------------------------------------------------------------------------ -plotMRF_hm(MRF_mod = MRF_fit, main = 'MRF (no covariates)', - node_names = c('H. zosteropis', 'H. killangoi', - 'Plasmodium', 'Microfilaria')) - -## ------------------------------------------------------------------------ -net <- igraph::graph.adjacency(MRF_fit$graph, weighted = T, mode = "undirected") -igraph::plot.igraph(net, layout = igraph::layout.circle, - edge.width = abs(igraph::E(net)$weight), - edge.color = ifelse(igraph::E(net)$weight < 0, - 'blue', - 'red')) - -## ------------------------------------------------------------------------ -MRF_mod <- MRFcov(data = Bird.parasites, n_nodes = 4, family = 'binomial') - -## ------------------------------------------------------------------------ -plotMRF_hm(MRF_mod = MRF_mod) - -## ------------------------------------------------------------------------ -MRF_mod$key_coefs$Hzosteropis - -## ------------------------------------------------------------------------ -fake.dat <- Bird.parasites -fake.dat$Microfilaria <- rbinom(nrow(Bird.parasites), 1, 0.8) -fake.preds <- predict_MRF(data = fake.dat, MRF_mod = MRF_mod) - -## ------------------------------------------------------------------------ -H.zos.pred.prev <- sum(fake.preds$Binary_predictions[, 'Hzosteropis']) / nrow(fake.preds$Binary_predictions) -Plas.pred.prev <- sum(fake.preds$Binary_predictions[, 'Plas']) / nrow(fake.preds$Binary_predictions) -Plas.pred.prev - -## ------------------------------------------------------------------------ -mod_fits <- cv_MRF_diag_rep(data = Bird.parasites, n_nodes = 4, - n_cores = 1, family = 'binomial', plot = F, - compare_null = T, - n_folds = 10) - -# CRF (with covariates) model sensitivity -quantile(mod_fits$mean_sensitivity[mod_fits$model == 'CRF'], probs = c(0.05, 0.95)) - -# MRF (no covariates) model sensitivity -quantile(mod_fits$mean_sensitivity[mod_fits$model != 'CRF'], probs = c(0.05, 0.95)) - -## ------------------------------------------------------------------------ -booted_MRF <- bootstrap_MRF(data = Bird.parasites, n_nodes = 4, family = 'binomial', n_bootstraps = 10, n_cores = 1) - -## ------------------------------------------------------------------------ -booted_MRF$mean_key_coefs$Hzosteropis - -## ------------------------------------------------------------------------ -booted_MRF$mean_key_coefs$Hkillangoi - -## ------------------------------------------------------------------------ -booted_MRF$mean_key_coefs$Plas - -## ------------------------------------------------------------------------ -booted_MRF$mean_key_coefs$Microfilaria - -## ------------------------------------------------------------------------ -adj_mats <- predict_MRFnetworks(data = Bird.parasites, - MRF_mod = booted_MRF, - metric = 'eigencentrality', - cutoff = 0.33) -colnames(adj_mats) <- colnames(Bird.parasites[, 1:4]) -apply(adj_mats, 2, summary) - -## ----eval = FALSE-------------------------------------------------------- -# Latitude <- sample(seq(120, 140, length.out = 100), nrow(Bird.parasites), TRUE) -# Longitude <- sample(seq(-19, -22, length.out = 100), nrow(Bird.parasites), TRUE) -# coords <- data.frame(Latitude = Latitude, Longitude = Longitude) - -## ----eval = FALSE-------------------------------------------------------- -# CRFmod_spatial <- MRFcov_spatial(data = Bird.parasites, n_nodes = 4, -# family = 'binomial', coords = coords) - -## ----eval = FALSE-------------------------------------------------------- -# CRFmod_spatial$key_coefs$Hzosteropis - -## ----eval = FALSE-------------------------------------------------------- -# cv_MRF_diag_rep_spatial(data = Bird.parasites, n_nodes = 4, -# n_cores = 3, family = 'binomial', plot = T, compare_null = T, -# coords = coords) - diff --git a/vignettes/Bird_Parasite_CRF.html b/vignettes/Bird_Parasite_CRF.html deleted file mode 100644 index 7c25505..0000000 --- a/vignettes/Bird_Parasite_CRF.html +++ /dev/null @@ -1,487 +0,0 @@ - - - - - - - - - - - - - - -MRFs and CRFs for Bird.parasites data - - - - - - - - - - - - - - - - - - - - - -

MRFs and CRFs for Bird.parasites data

- - - -

We can explore the model’s primary functions using a test dataset that is available with the package. Load the Bird.parasites dataset, which contains binary occurrences of four avian blood parasites in New Caledonian Zosterops species (available in its original form at Dryad; Clark et al 2016). A single continuous covariate is also included (scale.prop.zos), which reflects the relative abundance of Zosterops species (compared to other avian species) among different sample sites

- -

The Bird.parasites dataset is already in the appropriate structure for running MRFcov models; however, it is useful to understand how this was done. Using the raw dataset (downloaded from Dryad at the above link), we created a scaled continuous covariate to represent Zosterops spp. proportional captures in each sample site (as an estimate of relative abundance)

- -

You can visualise the dataset to see how analysis data needs to be structured. In short, for (family = "binomial") models, node variable (i.e. species) occurrences should be included as binary variables (1s and 0s) as the n_nodes left-most variables in data. Note that Gaussian continuous variables (family = "gaussian") and Poisson non-negative counts (family = "poisson") are also supported in MRFcov. Any covariates can be included as the right-most variables. It is recommended that these covariates all be on a similar scale, ideally using the scale function for continuous covariates (or similar) so that covariates have roughly mean = 0 and sd = 1, as this makes LASSO variable selection and comparisons of effect sizes very straightforward

- -
-

Running MRFs and visualising interaction coefficients

-

We first test for interactions using a Markov Random Fields (MRF) model without covariates. We set n_nodes as the number of species to be represented in the graphical model (4). We also need to specify the family for the model (binomial in this case). We do not specify the level of penalisation used by the LASSO algorithm. Instead, this is optimised separately for each species through cross validation (using function cv.glmnet in the glmnet package). This ensures the log-likelihood of each species is properly maximised before unifying them into an undirected graph. Finally, it is important to note that for MRF model functions, we can specify the number of processing cores to split the job across (i.e. n_cores). In this case we will only use 1 core, but higher numbers may increase speed (if more cores are available). Check available cores using the detectCores() function in the parallel package

- -
## Leave-one-out cv used for the following low-occurrence (rare) nodes:
-##  Microfilaria ...
-## Fitting MRF models in sequence using 1 core ...
-

The message above can be useful for identifying nodes (i.e. species) that are very rare or very common, as these can be difficult to properly model. In this case, the Microfilaria node is fairly rare, and so the function automatically reverts to using leave-one-out cross-validation to optimise parameters. This can be slow, so be aware when attempting to use large datasets that contain many very rare or very common species. Now that the model has converged, we can plot the estimated interaction coefficients as a heatmap using plotMRF_hm. Note that we can specify the names of nodes should we want to change them

- -

-

Note that other plotting methods an be used as desired. For instance, to plot these as a network instead, we can simply extract the adjacency matrix and plot it using standard igraph functions

- -

-
-
-

Running CRFs using additional covariates

-

We can now run a Conditional Random Fields (CRF) model using the provided continuous covariate (scale.prop.zos). Again, each species’ regression is optimised separately using LASSO regularization. Note that any columns in data to the right of column n_nodes will be presumed to represent covariates if we don’t specify an n_covariates argument

- -
## Leave-one-out cv used for the following low-occurrence (rare) nodes:
-##  Microfilaria ...
-## Fitting MRF models in sequence using 1 core ...
-

Visualise the estimated species interaction coefficients as a heatmap. These represent predicted interactions when the covariate is set to its mean (i.e. in this case, when scale.prop.zos = 0). If we had included other covariates, then this graph would represent interactions predicted when all covariates were set at their means

- -

-

Regression coefficients and their relative importances can be accessed as well. This call returns a matrix of the raw coefficient, as well as standardised coefficients (standardised by the sd of the covariate). Standardisation in this way helps to compare the relative influences of each parameter on the target species’ occurrence probability, but in general the two coefficients will be identical (unless users have not pre-scaled their covariates). The list also contains contains each variable’s relative importance (calculated using the formula B^2 / sum(B^2), where the vector of Bs represents regression coefficients for predictor variables). Variables with an underscore (_) indicate an interaction between a covariate and another node, suggesting that conditional dependencies of the two nodes vary across environmental gradients. Because of this, it is recommended to avoid using column names with _ in them

- -
##                      Variable Rel_importance Standardised_coef   Raw_coef
-## 1                  Hkillangoi     0.66675120        -2.4108758 -2.4108758
-## 5 scale.prop.zos_Microfilaria     0.12537417        -1.0454349 -1.0454349
-## 3                Microfilaria     0.09679433         0.9185819  0.9185819
-## 4              scale.prop.zos     0.09538282        -0.9118597 -0.9118597
-## 2                        Plas     0.01232180        -0.3277405 -0.3277405
-

Finally, a useful capability is to generate some fake data and test predictions. For instance, say we want to know how frequently malaria parasite infections are predicted to occur in sites with high occurrence of microfilaria

- -

The returned object from predict_MRF depends on the family of the model. For family = "binomial", we get a list including both linear prediction probabilities and binary predictions (where a linear prediction probability > 0.5 equates to a binary prediction of 1). These binary predictions can be used to estimate parasite prevalence

- -
## [1] 0.363029
-
-
-

Comparing fits of MRF and CRF models

-

A key step in the process of model exploration is to determine whether inclusion of covariates actually improves model fit and is warranted when estimating interaction parameters. This is straightforward for binomial models, as we can compare classification accuracy quickly and easily. The cv_diag functions will fit models and then determine their predictive performances against the supplied data. We can also compare MRF and CRF models directly to determine whether covariates are warranted. For this dataset, considering that parasite infections are quite rare, we are primarily interested in maximising model Sensitivity (capacity to successfully predict positive infections)

- -
## Generating node-optimised Conditional Random Fields model
-## 
-## Generating Markov Random Fields model (no covariates)
-## 
-## Calculating model predictions of the supplied data
-## Generating CRF predictions ...
-## Generating null MRF predictions ...
-## 
-## Calculating predictive performance across test folds
-## Processing cross-validation run 1 of 10 ...
-## Processing cross-validation run 2 of 10 ...
-## Processing cross-validation run 3 of 10 ...
-## Processing cross-validation run 4 of 10 ...
-## Processing cross-validation run 5 of 10 ...
-## Processing cross-validation run 6 of 10 ...
-## Processing cross-validation run 7 of 10 ...
-## Processing cross-validation run 8 of 10 ...
-## Processing cross-validation run 9 of 10 ...
-## Processing cross-validation run 10 of 10 ...
- -
##        5%       95% 
-## 0.1333333 0.4195992
- -
##        5%       95% 
-## 0.0532032 0.2369565
-
-
-

Bootstrapping the data and running models

-

We now may want to fit models to bootstrapped subsets of the data to account for parameter uncertainty. Users can change the proportion of observations to include in each bootstrap run with the sample_prop option.

- -
## Fitting bootstrap_MRF models in sequence using 1 core...
-
-
-

Exploring regression coefficients and interpreting results

-

Finally, we can explore regression coefficients to get a better understanding of just how important interactions are for predicting species’ occurrence probabilities (in comparison to other covariates). This is perhaps the strongest property of CRFs, as comparing the relative importances of interactions and fixed covariates using competing methods (such as Joint Species Distribution Models) is difficult. The bootstrap_MRF function conveniently returns a matrix of important coefficients for each node in the graph, as well as their relative importances

- -
##                      Variable Rel_importance  Mean_coef
-## 1                  Hkillangoi     0.66925507 -2.4022141
-## 5 scale.prop.zos_Microfilaria     0.12424176 -1.0350222
-## 3                Microfilaria     0.09711082  0.9150602
-## 4              scale.prop.zos     0.09365872 -0.8986488
-## 2                        Plas     0.01245306 -0.3276830
- -
##         Variable Rel_importance  Mean_coef
-## 1    Hzosteropis     0.78477609 -2.4022141
-## 2   Microfilaria     0.12794422 -0.9699496
-## 3 scale.prop.zos     0.08726237 -0.8010365
- -
##                      Variable Rel_importance  Mean_coef
-## 2                Microfilaria     0.65030129  1.5423905
-## 3              scale.prop.zos     0.26039725 -0.9760122
-## 4 scale.prop.zos_Microfilaria     0.05218264  0.4369181
-## 1                 Hzosteropis     0.02935173 -0.3276830
- -
##                     Variable Rel_importance  Mean_coef
-## 3                       Plas     0.35639207  1.5423905
-## 4             scale.prop.zos     0.18814126 -1.1206561
-## 5 scale.prop.zos_Hzosteropis     0.16048656 -1.0350222
-## 2                 Hkillangoi     0.14094109 -0.9699496
-## 1                Hzosteropis     0.12544076  0.9150602
-## 6        scale.prop.zos_Plas     0.02859825  0.4369181
-

Users can also use the predict_MRFnetworks function to calculate network statistics for each node in each observation or to generate adjacency matrices for each observation. By default, this function generates a list of igraph adjacency matrices, one for each row in data, which can be used to make network plots using a host of other packages. Note, both this function and the predict_MRF function rely on data that has a structure exactly matching to the data that was used to fit the model. In other words, the column names and column order need to be identical. The cutoff argument is important for binary problems, as this specifies the probability threshold for stating whether or not a species should be considered present at a site (and thus, whether their interactions will be present). Here, we state that a predicted occurrence above 0.33 is sufficient

- -
##         Hzosteropis Hkillangoi      Plas Microfilaria
-## Min.      0.0000000 0.00000000 0.0000000   0.00000000
-## 1st Qu.   0.0000000 0.00000000 0.0000000   0.00000000
-## Median    0.0000000 0.00000000 0.0000000   0.00000000
-## Mean      0.1846961 0.07572383 0.1908756   0.08908686
-## 3rd Qu.   0.4757349 0.00000000 0.1637703   0.00000000
-## Max.      1.0000000 1.00000000 1.0000000   1.00000000
-
-
-

Accounting for possible spatial autocorrelation

-

Lastly, MRFcov has the capability to account for possible spatial autocorrelation when estimating interaction parameters. To do this, we incorporate functions from the mgcv package to include smoothed Gaussian process spatial regression splines in each node-wise regression. The user must supply a two-column data.frame called coords, which will ideally contain Latitude and Longitude values for each row in data. We don’t have these coordinates for the Bird.parasites dataset, so we will instead create some fake coordinates to showcase the model. Note, these commands were not run here, but feel free to move through them as you did for the above examples

- -

The syntax for the MRFcov_spatial function is nearly identical to MRFcov, with the exception that coords must be supplied

- -

Interpretation is also identical to MRFcov objects. Here, key coefficients are those that are retained after accounting for spatial influences

- -

Finally, we can compare fits of spatial and non-spatial models just as we did for MRFs and CRFs above

- -
- - - - - - - - diff --git a/vignettes/CRF_data_prep.R b/vignettes/CRF_data_prep.R deleted file mode 100644 index 6f9da1f..0000000 --- a/vignettes/CRF_data_prep.R +++ /dev/null @@ -1,31 +0,0 @@ -## ----eval=FALSE---------------------------------------------------------- -# temp <- tempfile() -# download.file('https://ndownloader.figshare.com/files/2075362', -# temp) -# dataset <- read.csv(temp, as.is = T) -# unlink(temp) - -## ----eval=FALSE---------------------------------------------------------- -# dataset$dipping_round <- as.factor(dataset$dipping_round) -# dataset$field_site <- as.factor(dataset$field_site) -# dataset[,c(1,2,5,6)] <- NULL - -## ----eval=FALSE---------------------------------------------------------- -# levels(dataset$dipping_round)[1] -# levels(dataset$field_site)[1] - -## ----eval=FALSE---------------------------------------------------------- -# library(dplyr) -# analysis.data = dataset %>% -# cbind(.,data.frame(model.matrix(~.[,'field_site'], -# .)[,-1])) %>% -# cbind(.,data.frame(model.matrix(~.[,'dipping_round'], -# .)[,-1])) %>% -# dplyr::select(-field_site,-dipping_round) %>% -# dplyr::rename_all(funs(gsub("\\.|model.matrix", "", .))) -# - -## ----eval=FALSE---------------------------------------------------------- -# analysis.data[, 1:16] <- ifelse(analysis.data[, 1:16] > 0, 1, 0) -# analysis.data[, 17:20] <- scale(analysis.data[, 17:20], center = T, scale = T) - diff --git a/vignettes/CRF_data_prep.html b/vignettes/CRF_data_prep.html deleted file mode 100644 index e3430d1..0000000 --- a/vignettes/CRF_data_prep.html +++ /dev/null @@ -1,348 +0,0 @@ - - - - - - - - - - - - - - -Prepping datasets for CRF models - - - - - - - - - - - - - - - - - - - - - -

Prepping datasets for CRF models

- - - -
-

Running CRFs with categorical covariates requires expansion to model matrix format

-

The mosquito occurrence data from Golding et al 2015 (published in Parasites & Vectors) available from figshare here is useful for exploring how datasets need to be prepped for running Conditional Random Fields (CRF) models. Here, we will download the raw data from figshare (note, an internet connection will be needed for this step) change ‘dipping_round’ to a factor variable and remove un-needed columns

- -

We can now change the categorical dipping_round and field_site variables to factors and remove some un-needed variables

- -

It is important here to examine the level names of factor variables, as the 1st level (i.e. the dummy level) will be dropped from the dataset during conversion to model matrix format (as in standard lme4 analysis of factor covariates)

- -

The next step is to convert any factor variables into model matrix format. As mentioned above, this step will drop the first level of a factor and then create an additional column for each additional level (i.e. dipping_round levels "3", "5" and "6" will all be assigned their own unique columns, while dipping_round level "2" will be dropped and treated as the reference level). It is also convenient to change names of the new covariate columns so they are easier to view and interpret (done here using dplyr::rename_all)

- -

Finally, we need to convert species abundances to binary presence-absence format (as we are only estimating co-occurrences, not co-abundances). It is also highly advisable to scale any continuous variables so they all have mean = 0 and sd = 1

- -
- - - - - - - - diff --git a/vignettes/Gaussian_Poisson_CRFs.R b/vignettes/Gaussian_Poisson_CRFs.R deleted file mode 100644 index 8c04193..0000000 --- a/vignettes/Gaussian_Poisson_CRFs.R +++ /dev/null @@ -1,63 +0,0 @@ -## ------------------------------------------------------------------------ -cov <- rnorm(500, 0.2) -cov2 <- rnorm(500, 4) -sp.2 <- ceiling(rnorm(500, 1) + (cov * 2)) -sp.2[sp.2 < 0] <- 0 -poiss.dat <- data.frame(sp.1 = ceiling(rnorm(500, 4) + (cov2 * 15.5) + (sp.2 * -0.15)), - sp.2 = sp.2, sp.3 = ceiling((sp.2 * 1.5) + rnorm(500, 0.1))) -poiss.dat[poiss.dat < 0] <- 0 -poiss.dat$cov <- cov -poiss.dat$cov2 <- cov2 - -## ------------------------------------------------------------------------ -apply(poiss.dat[, c(1:3)], 2, range) - -## ------------------------------------------------------------------------ -sp.1.vs.sp.2 <- coef(glm(sp.1 ~ sp.2, family = 'poisson', data = poiss.dat))[2] -sp.2.vs.sp.1 <- coef(glm(sp.2 ~ sp.1, family = 'poisson', data = poiss.dat))[2] -mean.coef <- mean(sp.1.vs.sp.2, sp.2.vs.sp.1) - -# Exponentiate, due to the log link -mean.coef <- exp(mean.coef) - -# Make plots of predicted vs observed -# First, predict sp.1 (the common species) abundances -plot(poiss.dat$sp.1, poiss.dat$sp.2 * mean.coef, - xlab = 'Observed', - ylab = 'Predicted') - -# Now predict sp.2 (the more rare species) abundances -plot(poiss.dat$sp.2, poiss.dat$sp.1 * mean.coef, - xlab = 'Observed', - ylab = 'Predicted') - -## ------------------------------------------------------------------------ -library(MRFcov) -poiss.crf <- MRFcov(data = poiss.dat, n_nodes = 3, family = 'poisson') -poiss.crf$poiss_sc_factors - -## ------------------------------------------------------------------------ -sd(poiss.dat[,1]) * poiss.crf$direct_coefs$cov2[1] - -## ------------------------------------------------------------------------ -poiss.preds <- predict_MRF(data = poiss.dat, MRF_mod = poiss.crf, - n_cores = 1) -plot(poiss.dat$sp.1, poiss.preds[,1], - xlab = 'Observed', - ylab = 'Predicted') -plot(poiss.dat$sp.2, poiss.preds[,2], - xlab = 'Observed', - ylab = 'Predicted') - -## ------------------------------------------------------------------------ -poiss.cv <- cv_MRF_diag(data = poiss.dat, n_nodes = 3, - n_folds = 5, - n_cores = 1, family = 'poisson', - compare_null = TRUE, plot = FALSE) - -# CRF (with covariates) model deviance -range(poiss.cv$Deviance[poiss.cv$model == 'CRF']) - -# MRF (no covariates) model deviance -range(poiss.cv$Deviance[poiss.cv$model != 'CRF']) - diff --git a/vignettes/Gaussian_Poisson_CRFs.Rmd b/vignettes/Gaussian_Poisson_CRFs.Rmd index d0700c8..a7f17e3 100644 --- a/vignettes/Gaussian_Poisson_CRFs.Rmd +++ b/vignettes/Gaussian_Poisson_CRFs.Rmd @@ -1,89 +1,89 @@ ---- -title: "Gaussian and Poisson CRFs" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Gaussian and Poisson CRFs} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -The theory behind using nodewise regression to approximate Markov Random Fields and Conditional Random Fields models was developed with binary presence-absence data in mind. However, the same principle can hold for other data structures, as long as we recognise the key limitations. For Gaussian and Poisson variables, the primary issue with fitting MRF or CRF models in this fashion is that the outcome variables are not necessarily on the same scale. Consider a community where we have three species that show some associations in their abundances, but the ranges of these abundances are quite different (this could perhaps apply to a quickly-reproducing prey species and some wide-ranging predator species, for example). Here, we simulate a dataset with two fake covariates and three species to illustrate the problem -```{r} -cov <- rnorm(500, 0.2) -cov2 <- rnorm(500, 4) -sp.2 <- ceiling(rnorm(500, 1) + (cov * 2)) -sp.2[sp.2 < 0] <- 0 -poiss.dat <- data.frame(sp.1 = ceiling(rnorm(500, 4) + (cov2 * 15.5) + (sp.2 * -0.15)), - sp.2 = sp.2, sp.3 = ceiling((sp.2 * 1.5) + rnorm(500, 0.1))) -poiss.dat[poiss.dat < 0] <- 0 -poiss.dat$cov <- cov -poiss.dat$cov2 <- cov2 -``` - -By investigating the ranges of the species' abundance vectors, we can see that they are quite different, with `sp.1` generally showing higher abundance than the other two species -```{r} -apply(poiss.dat[, c(1:3)], 2, range) -``` - -This can cause a problem for fitting MRF / CRF models, as we are essentially regressing each species' abundance against the abundances of all other species (and covariates) and then *symmetrising* the respective parameter estimates (taking the `mean` of the two estimates, by default). This will undoubtedly lead to severe overestimates for the species with lower abundance ranges and underestimates for the common species. We can illustrate this here for `sp.1` and `sp.2` by running a simple `glm` for each species by including the other species as a predictor -```{r} -sp.1.vs.sp.2 <- coef(glm(sp.1 ~ sp.2, family = 'poisson', data = poiss.dat))[2] -sp.2.vs.sp.1 <- coef(glm(sp.2 ~ sp.1, family = 'poisson', data = poiss.dat))[2] -mean.coef <- mean(sp.1.vs.sp.2, sp.2.vs.sp.1) - -# Exponentiate, due to the log link -mean.coef <- exp(mean.coef) - -# Make plots of predicted vs observed -# First, predict sp.1 (the common species) abundances -plot(poiss.dat$sp.1, poiss.dat$sp.2 * mean.coef, - xlab = 'Observed', - ylab = 'Predicted') - -# Now predict sp.2 (the more rare species) abundances -plot(poiss.dat$sp.2, poiss.dat$sp.1 * mean.coef, - xlab = 'Observed', - ylab = 'Predicted') -``` - -To overcome the issue of possible differences in ranges for Poisson variables, `MRFcov` uses a nonparanormal transformation method whereby each variable is `log2(x + 0.1)` transformed and mapped to a normal distribution via a copula approach prior to fitting the model([see here for more information](http://www.jmlr.org/papers/v10/liu09a.html)). The model is then fitted by considering the variables as Gaussian (e.g., there is no link function for these models). In doing so, the variables are all normalised without adding too much noise (though this transformation should always be recognised as a limitation). Any Poisson models that are fit will return an additional item called `poiss_sc_factors`, which will be necessary to back-transform outputs back to the variables' original distributions. This is done by estimating each variable's negative binomial or poisson parameters, depending on which distribution was more appropriate -```{r} -library(MRFcov) -poiss.crf <- MRFcov(data = poiss.dat, n_nodes = 3, family = 'poisson') -poiss.crf$poiss_sc_factors -``` - -Our coefficient for the effect of `cov2` on `sp.1` above was `15.5`. To see how the model did in isolating this effect, we can get an *approximation* by converting the effect using the `sd` of the raw variable (this is because the model was fitted to normalised variables that were at unit variance) -```{r} -sd(poiss.dat[,1]) * poiss.crf$direct_coefs$cov2[1] -``` - -Not bad, but again this is an approximation. However for generating predictions, `MRFcov` will first generate the normalised predictions and then map these to each raw variable's appropriate distribution (either negative binomial or poisson, depending on which was more appropriate). This is a better way to ensure that outcomes are converted to the correct range. For example, we can use our fitted model to predict the raw data and generate similar plots to those that we created above -```{r} -poiss.preds <- predict_MRF(data = poiss.dat, MRF_mod = poiss.crf, - n_cores = 1) -plot(poiss.dat$sp.1, poiss.preds[,1], - xlab = 'Observed', - ylab = 'Predicted') -plot(poiss.dat$sp.2, poiss.preds[,2], - xlab = 'Observed', - ylab = 'Predicted') -``` - -These plots should look much better. We can also inspect the outputs of the `cv` functions for Poisson data. `MRFcov` can use cross-validation (fitting models to specific subsets of the data and using resulting equations to predict observations for the remaining subset) to describe model fit and compare models with and without covariates. For `family = binomial` models, as seen at the end of the `MRFs and CRFs for Bird.parasites data` vignette, the functions return information on classification accuracy. But for Poisson data, it instead returns information on model deviance and mean squared error. We can showcase this functionality here using the fake data we have generated. We would expect that a CRF would fit the data better in this case, as this model includes the two covariates that we know are influential -```{r} -poiss.cv <- cv_MRF_diag(data = poiss.dat, n_nodes = 3, - n_folds = 5, - n_cores = 1, family = 'poisson', - compare_null = TRUE, plot = FALSE) - -# CRF (with covariates) model deviance -range(poiss.cv$Deviance[poiss.cv$model == 'CRF']) - -# MRF (no covariates) model deviance -range(poiss.cv$Deviance[poiss.cv$model != 'CRF']) -``` - -Lower deviance for the CRF model confirms the idea that inclusion of covariates is supported. - -Please note that the type of normalisation used above **only applies to Poisson models!!** Gaussian variables are not standardised before fitting CRFs / MRFs, so users should inspect the ranges of their variables and consider using the `scale` function to standardise beforehand if need be. The reason Gaussian models do not do this by default is that there are many options for normalisation / standardisation for Gaussian data and so users may wish to have more control over how this is done before fitting models +--- +title: "Gaussian and Poisson CRFs" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Gaussian and Poisson CRFs} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +The theory behind using nodewise regression to approximate Markov Random Fields and Conditional Random Fields models was developed with binary presence-absence data in mind. However, the same principle can hold for other data structures, as long as we recognise the key limitations. For Gaussian and Poisson variables, the primary issue with fitting MRF or CRF models in this fashion is that the outcome variables are not necessarily on the same scale. Consider a community where we have three species that show some associations in their abundances, but the ranges of these abundances are quite different (this could perhaps apply to a quickly-reproducing prey species and some wide-ranging predator species, for example). Here, we simulate a dataset with two fake covariates and three species to illustrate the problem +```{r} +cov <- rnorm(500, 0.2) +cov2 <- rnorm(500, 4) +sp.2 <- ceiling(rnorm(500, 1) + (cov * 2)) +sp.2[sp.2 < 0] <- 0 +poiss.dat <- data.frame(sp.1 = ceiling(rnorm(500, 4) + (cov2 * 15.5) + (sp.2 * -0.15)), + sp.2 = sp.2, sp.3 = ceiling((sp.2 * 1.5) + rnorm(500, 0.1))) +poiss.dat[poiss.dat < 0] <- 0 +poiss.dat$cov <- cov +poiss.dat$cov2 <- cov2 +``` + +By investigating the ranges of the species' abundance vectors, we can see that they are quite different, with `sp.1` generally showing higher abundance than the other two species +```{r} +apply(poiss.dat[, c(1:3)], 2, range) +``` + +This can cause a problem for fitting MRF / CRF models, as we are essentially regressing each species' abundance against the abundances of all other species (and covariates) and then *symmetrising* the respective parameter estimates (taking the `mean` of the two estimates, by default). This will undoubtedly lead to severe overestimates for the species with lower abundance ranges and underestimates for the common species. We can illustrate this here for `sp.1` and `sp.2` by running a simple `glm` for each species by including the other species as a predictor +```{r} +sp.1.vs.sp.2 <- coef(glm(sp.1 ~ sp.2, family = 'poisson', data = poiss.dat))[2] +sp.2.vs.sp.1 <- coef(glm(sp.2 ~ sp.1, family = 'poisson', data = poiss.dat))[2] +mean.coef <- mean(sp.1.vs.sp.2, sp.2.vs.sp.1) + +# Exponentiate, due to the log link +mean.coef <- exp(mean.coef) + +# Make plots of predicted vs observed +# First, predict sp.1 (the common species) abundances +plot(poiss.dat$sp.1, poiss.dat$sp.2 * mean.coef, + xlab = 'Observed', + ylab = 'Predicted') + +# Now predict sp.2 (the more rare species) abundances +plot(poiss.dat$sp.2, poiss.dat$sp.1 * mean.coef, + xlab = 'Observed', + ylab = 'Predicted') +``` + +To overcome the issue of possible differences in ranges for Poisson variables, `MRFcov` uses a nonparanormal transformation method whereby each variable is `log2(x + 0.1)` transformed and mapped to a normal distribution via a copula approach prior to fitting the model([see here for more information](https://www.jmlr.org/papers/v10/liu09a.html)). The model is then fitted by considering the variables as Gaussian (e.g., there is no link function for these models). In doing so, the variables are all normalised without adding too much noise (though this transformation should always be recognised as a limitation). Any Poisson models that are fit will return an additional item called `poiss_sc_factors`, which will be necessary to back-transform outputs back to the variables' original distributions. This is done by estimating each variable's negative binomial or poisson parameters, depending on which distribution was more appropriate +```{r} +library(MRFcov) +poiss.crf <- MRFcov(data = poiss.dat, n_nodes = 3, family = 'poisson') +poiss.crf$poiss_sc_factors +``` + +Our coefficient for the effect of `cov2` on `sp.1` above was `15.5`. To see how the model did in isolating this effect, we can get an *approximation* by converting the effect using the `sd` of the raw variable (this is because the model was fitted to normalised variables that were at unit variance) +```{r} +sd(poiss.dat[,1]) * poiss.crf$direct_coefs$cov2[1] +``` + +Not bad, but again this is an approximation. However for generating predictions, `MRFcov` will first generate the normalised predictions and then map these to each raw variable's appropriate distribution (either negative binomial or poisson, depending on which was more appropriate). This is a better way to ensure that outcomes are converted to the correct range. For example, we can use our fitted model to predict the raw data and generate similar plots to those that we created above +```{r} +poiss.preds <- predict_MRF(data = poiss.dat, MRF_mod = poiss.crf, + n_cores = 1) +plot(poiss.dat$sp.1, poiss.preds[,1], + xlab = 'Observed', + ylab = 'Predicted') +plot(poiss.dat$sp.2, poiss.preds[,2], + xlab = 'Observed', + ylab = 'Predicted') +``` + +These plots should look much better. We can also inspect the outputs of the `cv` functions for Poisson data. `MRFcov` can use cross-validation (fitting models to specific subsets of the data and using resulting equations to predict observations for the remaining subset) to describe model fit and compare models with and without covariates. For `family = binomial` models, as seen at the end of the `MRFs and CRFs for Bird.parasites data` vignette, the functions return information on classification accuracy. But for Poisson data, it instead returns information on model deviance and mean squared error. We can showcase this functionality here using the fake data we have generated. We would expect that a CRF would fit the data better in this case, as this model includes the two covariates that we know are influential +```{r} +poiss.cv <- cv_MRF_diag(data = poiss.dat, n_nodes = 3, + n_folds = 5, + n_cores = 1, family = 'poisson', + compare_null = TRUE, plot = FALSE) + +# CRF (with covariates) model deviance +range(poiss.cv$Deviance[poiss.cv$model == 'CRF']) + +# MRF (no covariates) model deviance +range(poiss.cv$Deviance[poiss.cv$model != 'CRF']) +``` + +Lower deviance for the CRF model confirms the idea that inclusion of covariates is supported. + +Please note that the type of normalisation used above **only applies to Poisson models!!** Gaussian variables are not standardised before fitting CRFs / MRFs, so users should inspect the ranges of their variables and consider using the `scale` function to standardise beforehand if need be. The reason Gaussian models do not do this by default is that there are many options for normalisation / standardisation for Gaussian data and so users may wish to have more control over how this is done before fitting models diff --git a/vignettes/Gaussian_Poisson_CRFs.html b/vignettes/Gaussian_Poisson_CRFs.html deleted file mode 100644 index 792fa67..0000000 --- a/vignettes/Gaussian_Poisson_CRFs.html +++ /dev/null @@ -1,410 +0,0 @@ - - - - - - - - - - - - - - -Gaussian and Poisson CRFs - - - - - - - - - - - - - - - - - - - - - -

Gaussian and Poisson CRFs

- - - -

The theory behind using nodewise regression to approximate Markov Random Fields and Conditional Random Fields models was developed with binary presence-absence data in mind. However, the same principle can hold for other data structures, as long as we recognise the key limitations. For Gaussian and Poisson variables, the primary issue with fitting MRF or CRF models in this fashion is that the outcome variables are not necessarily on the same scale. Consider a community where we have three species that show some associations in their abundances, but the ranges of these abundances are quite different (this could perhaps apply to a quickly-reproducing prey species and some wide-ranging predator species, for example). Here, we simulate a dataset with two fake covariates and three species to illustrate the problem

- -

By investigating the ranges of the species’ abundance vectors, we can see that they are quite different, with sp.1 generally showing higher abundance than the other two species

- -
##      sp.1 sp.2 sp.3
-## [1,]   19    0    0
-## [2,]  108    9   13
-

This can cause a problem for fitting MRF / CRF models, as we are essentially regressing each species’ abundance against the abundances of all other species (and covariates) and then symmetrising the respective parameter estimates (taking the mean of the two estimates, by default). This will undoubtedly lead to severe overestimates for the species with lower abundance ranges and underestimates for the common species. We can illustrate this here for sp.1 and sp.2 by running a simple glm for each species by including the other species as a predictor

- -

- -

-

To overcome the issue of possible differences in ranges for Poisson variables, MRFcov uses a nonparanormal transformation method whereby each variable is log2(x + 0.1) transformed and mapped to a normal distribution via a copula approach prior to fitting the model(see here for more information). The model is then fitted by considering the variables as Gaussian (e.g., there is no link function for these models). In doing so, the variables are all normalised without adding too much noise (though this transformation should always be recognised as a limitation). Any Poisson models that are fit will return an additional item called poiss_sc_factors, which will be necessary to back-transform outputs back to the variables’ original distributions. This is done by estimating each variable’s negative binomial or poisson parameters, depending on which distribution was more appropriate

- -
## Poisson variables will be transformed using a nonparanormal...
-
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
-## Please use a list of either functions or lambdas: 
-## 
-##   # Simple named list: 
-##   list(mean = mean, median = median)
-## 
-##   # Auto named with `tibble::lst()`: 
-##   tibble::lst(mean, median)
-## 
-##   # Using lambdas
-##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
-## This warning is displayed once every 8 hours.
-## Call `lifecycle::last_warnings()` to see where this warning was generated.
-
## Fitting MRF models in sequence using 1 core ...
- -
##          sp.1     sp.2     sp.3
-## size 24.38518 2.550468 2.338238
-## mu   65.74800 2.108000 3.828000
-

Our coefficient for the effect of cov2 on sp.1 above was 15.5. To see how the model did in isolating this effect, we can get an approximation by converting the effect using the sd of the raw variable (this is because the model was fitted to normalised variables that were at unit variance)

- -
## [1] 15.23026
-

Not bad, but again this is an approximation. However for generating predictions, MRFcov will first generate the normalised predictions and then map these to each raw variable’s appropriate distribution (either negative binomial or poisson, depending on which was more appropriate). This is a better way to ensure that outcomes are converted to the correct range. For example, we can use our fitted model to predict the raw data and generate similar plots to those that we created above

- -

- -

-

These plots should look much better. We can also inspect the outputs of the cv functions for Poisson data. MRFcov can use cross-validation (fitting models to specific subsets of the data and using resulting equations to predict observations for the remaining subset) to describe model fit and compare models with and without covariates. For family = binomial models, as seen at the end of the MRFs and CRFs for Bird.parasites data vignette, the functions return information on classification accuracy. But for Poisson data, it instead returns information on model deviance and mean squared error. We can showcase this functionality here using the fake data we have generated. We would expect that a CRF would fit the data better in this case, as this model includes the two covariates that we know are influential

- -
## Generating node-optimised Conditional Random Fields model
-## Poisson variables will be transformed using a nonparanormal...
-## Fitting MRF models in sequence using 1 core ...
-## 
-## Generating Markov Random Fields model (no covariates)
-## Poisson variables will be transformed using a nonparanormal...
-## Fitting MRF models in sequence using 1 core ...
- -
## [1]  6.563763 27.320274
- -
## [1] 192.1156 301.5668
-

Lower deviance for the CRF model confirms the idea that inclusion of covariates is supported.

-

Please note that the type of normalisation used above only applies to Poisson models!! Gaussian variables are not standardised before fitting CRFs / MRFs, so users should inspect the ranges of their variables and consider using the scale function to standardise beforehand if need be. The reason Gaussian models do not do this by default is that there are many options for normalisation / standardisation for Gaussian data and so users may wish to have more control over how this is done before fitting models

- - - - - - - - From cb7d26c52c6aa82f03671f1e018b1358cd8a4816 Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Thu, 18 Mar 2021 16:03:23 +1000 Subject: [PATCH 6/7] update buildignore --- .Rbuildignore | 4 +++- cran-comments.md | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index c9d9bd8..ff85697 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,4 +2,6 @@ ^\.Rproj\.user$ ^doc$ ^Meta$ -^CRAN-RELEASE$ +^CRAN-RELEASE +^cran-comments.md +^docs diff --git a/cran-comments.md b/cran-comments.md index 0402dca..0953660 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -10,6 +10,8 @@ There were no ERRORs or WARNINGs. An ERROR appears in the win-builder check for r-devel-windows-ix86+x86_64, likely owing to failures on r-devel-windows-ix86+x86_64 due to missing dependencies that need compilation. A test using `rhub::check( platform="windows-x86_64-devel", env_vars=c(R_COMPILE_AND_INSTALL_PACKAGES = "always"))` passes the win-builder check +Previously three top level files were non-standard. These have been moved to .Rbuildignore and now are not flagged in R RMD checks + Previously the version of this update was not sufficient for CRAN requirements: Insufficient package version (submitted: 1.0.4, existing: 1.0.37). This has been changed (new version is 1.0.38) A URL in the one of the vignettes previously did not have https. The CRAN note was From 1be86fecd349c900d800ff7f81754370a4b536d8 Mon Sep 17 00:00:00 2001 From: nicholasjclark Date: Thu, 18 Mar 2021 16:18:15 +1000 Subject: [PATCH 7/7] update NEWS --- .Rbuildignore | 1 + CRAN-RELEASE | 4 ++-- NEWS.md | 6 +----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index ff85697..3ebfe93 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,4 @@ ^CRAN-RELEASE ^cran-comments.md ^docs +^CRAN-RELEASE$ diff --git a/CRAN-RELEASE b/CRAN-RELEASE index 1a152a0..655c554 100644 --- a/CRAN-RELEASE +++ b/CRAN-RELEASE @@ -1,2 +1,2 @@ -This package was submitted to CRAN on 2021-03-09. -Once it is accepted, delete this file and tag the release (commit 2be47b9). +This package was submitted to CRAN on 2021-03-18. +Once it is accepted, delete this file and tag the release (commit cb7d26c). diff --git a/NEWS.md b/NEWS.md index 6711afa..411f52c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,7 @@ -# MRFcov 1.0.40 +# MRFcov 1.0.38 * Improved nonparanormal mapping of predictions for discrete variables * Option to remove progress bar in multicore estimation, significantly speeding up analysis runs - -# MRFcov 1.0.39 * Improvement for the detection of infinite values in data and coordinates - -# MRFcov 1.0.38 * Option to prep spatial splines separately to allow efficient out-of-sample cross-validation # MRFcov 1.0.37