Squashed commit of the following:

commit 4142baf Author: choisant <[email protected]> Date: Thu Aug 15 12:43:01 2024 +0200 Update testing setup commit a1f0c5c Author: pglpm <[email protected]> Date: Tue Aug 13 06:16:19 2024 +0200 fix formatting commit f869b93 Author: pglpm <[email protected]> Date: Mon Aug 12 12:40:10 2024 +0200 update docs of some functions commit 93eb98f Author: pglpm <[email protected]> Date: Mon Aug 12 07:52:46 2024 +0200 report number of samples in inferpopulation commit 7b304df Author: pglpm <[email protected]> Date: Mon Aug 12 06:54:09 2024 +0200 correct reported number of dims in inferpopulation commit 11a7b7b Author: pglpm <[email protected]> Date: Mon Aug 12 06:37:09 2024 +0200 modular-rewrite calculation of probabilities III commit ad80dd9 Author: pglpm <[email protected]> Date: Mon Aug 12 06:29:27 2024 +0200 modular-rewrite calculation of probabilities commit 7693f57 Author: pglpm <[email protected]> Date: Sun Aug 11 23:05:20 2024 +0200 modular-rewrite calculation of probabilities commit 1889f69 Merge: e42be40 ee8ffad Author: pglpm <[email protected]> Date: Sat Aug 10 13:46:24 2024 +0200 general update of mutualinfo and consistency tests commit e42be40 Author: pglpm <[email protected]> Date: Sat Aug 10 13:43:10 2024 +0200 update custom tests commit 4edb765 Author: pglpm <[email protected]> Date: Sat Aug 10 13:27:14 2024 +0200 consistency-check mutualinfo() and samplesFDistribution() commit 9c5715d Author: pglpm <[email protected]> Date: Sat Aug 10 11:12:21 2024 +0200 rewrite mutualinfo(); create correlated dataset; move files commit cbe310d Author: pglpm <[email protected]> Date: Sat Aug 10 00:48:32 2024 +0200 fix bugs in samplesFdistribution() commit 5c8ba4a Author: pglpm <[email protected]> Date: Fri Aug 9 16:16:58 2024 +0200 fix bugs in plotFsamples() commit ee8ffad Author: Aurora <[email protected]> Date: Fri Aug 9 11:00:30 2024 +0200 Update apptainer.def Typo
pglpm · Aug 15, 2024 · 2ff5cb3 · 2ff5cb3
1 parent c8a035f
commit 2ff5cb3
Show file tree

Hide file tree

Showing 73 changed files with 2,863 additions and 2,044 deletions.
diff --git a/R/buildmetadata.R b/R/buildmetadata.R
@@ -1,16 +1,20 @@
-#' Build preliminary metadata flie
+#' Build a preliminary metadata file from a given dataset
 #'
-#' @param data data.frame object or filepath
-#' @param file string: name of output metadata file;
-#'   NULL: output metadata as data.frame
-#' @param includevrt string or NULL: name variates in data to be included
-#' @param excludevrt string or NULL: name variates in data to be excluded
-#' @param addsummary2metadata Bool: also output some diagnostic statistics
-#'    in the metadata?
-#' @param backupfiles Bool: rename previous metadata file if it exists?
-#' @param verbose Bool: output heuristics for each variate, default TRUE
+#' @param data A dataset, given as a \code{\link[base]{data.frame}}
+#' or as a file path to a csv file.
+#' @param file String: name of csv file where the metadata should be saved;
+#'   if `NULL`: output metadata as `VALUE`.
+#' @param includevrt Character or `NULL`: name of variates in dataset to be included.
+#' @param excludevrt Character or `NULL`: name of variates in dataset to be excluded.
+#' @param addsummary2metadata Logical: also output some diagnostic statistics
+#'    in the metadata? Default `FALSE`
+#' @param backupfiles Logical: rename previous metadata file if it exists?
+#' Default `TRUE`
+#' @param verbose Logical: output heuristics for each variate? Default `TRUE`.
 #'
-#' @return nothing or data.table object
+#' @return If `file = NULL`, a preliminary metadata file is created
+#'   and `VALUE` is `NULL`;
+#'   otherwise `VALUE` is a data.frame containing the metadata.
 #'
 #' @export
 buildmetadata <- function(
@@ -116,7 +120,7 @@ buildmetadata <- function(
         ## If the values are strings,
         ## check whether they can be reinterpreted as numeric
         if(!is.numeric(unx) &&
-           !any(is.na(suppressWarnings(as.numeric(unx))))) {
+               !any(is.na(suppressWarnings(as.numeric(unx))))) {
             x <- as.numeric(x)
             unx <- sort(as.numeric(unx))
         }
@@ -156,8 +160,8 @@ buildmetadata <- function(
                 ljumpquantum <- gcd(ljumpsx * lmulti) / lmulti
                 if (!(
                     ljumpquantum / lrangex < 1e-5 || # no visible gaps between values
-                    (ljumpquantum / lrangex < 1e-3 &&
-                     maxrep <= 100)  # visible gaps, but not many value repetitions
+                        (ljumpquantum / lrangex < 1e-3 &&
+                             maxrep <= 100)  # visible gaps, but not many value repetitions
                 )) {
                     warninglist <- c(warninglist,
                         paste0('\n* "', name, '" variate',
@@ -196,9 +200,9 @@ buildmetadata <- function(
             }
 
         } else if (!is.numeric(x) &&
-                   !any(sapply(ordinalkeywords, function(keyw){
-                       grepl(keyw, datavalues, fixed = TRUE)
-                   }))) {
+                       !any(sapply(ordinalkeywords, function(keyw){
+                           grepl(keyw, datavalues, fixed = TRUE)
+                       }))) {
             ## Nominal variate? (non-numeric values)
             type <- 'nominal'
             Nvalues <- uniquex
@@ -256,7 +260,7 @@ buildmetadata <- function(
 #### Now we know that the variate is numeric
 
         } else if (uniquex <= 10 &&
-                   (all(x == round(x)) || jumpquantum == round(jumpquantum))) {
+                       (all(x == round(x)) || jumpquantum == round(jumpquantum))) {
             ## Ordinal variate with few numeric values?
             type <- 'ordinal'
             Nvalues <- uniquex
@@ -362,8 +366,8 @@ buildmetadata <- function(
             ## are included in the domain or not
             if (!(
                 jumpquantum / rangex < 1e-5 || # no visible gaps between values
-                (jumpquantum / rangex < 1e-3 &&
-                 maxrep <= 100)  # visible gaps, but not many value repetitions
+                    (jumpquantum / rangex < 1e-3 &&
+                         maxrep <= 100)  # visible gaps, but not many value repetitions
             )) {
                 roundedflag <- TRUE
                 ## The variate seems to have quantized differences

diff --git a/R/inferpopulation.R b/R/inferpopulation.R
@@ -1,55 +1,61 @@
 #' Monte Carlo computation of posterior distribution of population frequencies
 #'
-#' @param data data.frame object or filepath: datapoints
-#' @param metadata Either the name of the csv file containing metadata
-#'   of the current dataset, or a data.frame with the metadata
-#' @param auxdata NULL or data.frame object or filepath: extra datapoints
-#'   which would be too many to use in the Monte Carlo sampling,
-#'   but can be used to calculate hyperparameters
-#' @param outputdir String, path to output file folder ## Rename to
-#'   outputPrefix, also addSuffix?
-#' @param nsamples Integer, nr of desired MC samples
-#' @param nchains Integer, nr of MC chains
-#' @param nsamplesperchain Integer, nr of MC samples per chain
-#' @param parallel, logical or numeric: whether to use pre-existing parallel
-#'   workers, or how many to create and use
-#' @param seed Integer: seed for random number generator. If left as default
-#'   NULL, a random seed based on the system clock is used in the
-#'   set.seed() function
-#' @param cleanup logical, default TRUE, removes files that can be used for
-#'   debugging
-#' @param appendtimestamp logical, default TRUE: add timestamp to name of
-#'   output directory
-#' @param appendinfo logical, default TRUE: add some simulation information
-#'   to name of output directory
-#' @param output if string 'directory', return the output directory name;
-#'   if string 'mcoutput', return the 'Fdistribution' object;
-#'   anything else, no output
-#' @param subsampledata Numeric: use only a subsample of the datapoints in 'data'
-#' @param startupMCiterations Number of initial (burn-in) MC iterations
-#' @param minMCiterations Minimum number of MC iterations to be done
-#' @param maxMCiterations Maximum number of MC iterations
-#' @param maxhours Maximum (approximately) time in hours for MC computation
-#' @param ncheckpoints NULL (default), positive integer, or Inf:
-#'   number of datapoints to use for stopping the sampling;
-#'   if NULL, equal to number of variates + 2; if Inf, use all datapoints
-#' @param relerror positive real: relative error of calculated probabilities
-#'   with respect to their variability with new data. It is only approximate
-#' @param prior logical: Calculate the prior distribution of F?
-#' @param thinning If NULL, let the diagnostics decide the MC thinning;
-#'   if positive, use this thinning value
-#' @param plottraces logical: plot MC traces of diagnostic values
-#' @param showKtraces logical, when true, it saves the K parameter during
-#'   sampling and plots its trace and histogram at the end. Keeping it to
-#'   FALSE (default) saves a little computation time.
-#' @param showAlphatraces logical, : when true, it saves the Alpha parameter
-#'   more frequently during sampling and plots its trace and histogram at
-#'   the end. Keeping it to FALSE (default) saves a little computation
-#'   time.
-#' @param hyperparams list of hyperparameters (used for debugging)
+#' @param data A dataset, given as a \code{\link[base]{data.frame}}
+#' or as a file path to a csv file.
+#' @param metadata A metadata object, given either as a data.frame object,
+#' or as a file pa to a csv file.
+#' @param auxdata A larger dataset, given as a data.frame
+#'   or as a file path to a csv file. Such a dataset
+#'   would be too many to use in the Monte Carlo sampling,
+#'   but can be used to calculate hyperparameters.
+#' @param outputdir Character: path to folder where the output should be saved.
+#' @param nsamples Integer: number of desired Monte Carlo samples. Default 3600.
+#' @param nchains Integer: number of Monte Carlo chains. Default 60.
+#' @param nsamplesperchain Integer: number of Monte Carlo samples per chain.
+#' @param parallel Logical: use pre-existing parallel workers from package
+#'   \code{\link[doParallel]{doParallel}}.
+#'   Or integer: create and use that many parallel workers.
+#' @param seed Integer: use this seed for the random number generator.
+#'   If missing or `NULL` (default), do not set the seed.
+#' @param cleanup Logical: remove diagnostic files at the end of the computation?
+#'   Default `TRUE`.
+#' @param appendtimestamp Logical: append a timestamp to the name of
+#'   the output directory `outputdir`? Default `TRUE`.
+#' @param appendinfo Logical: append information about dataset and Monte Carlo
+#'   parameters to the name of the output directory `outputdir`? Default `TRUE`.
+#' @param output Character: if `'directory'`, return the output directory name
+#'   as `VALUE`; if string `'mcoutput'`, return the `'Fdistribution'` object
+#'   containing the parameters obtained from the Monte Carlo computation.
+#'   Any other value: `VALUE` is `NULL`.
+#' @param subsampledata Integer: use only a subset of this many datapoints for
+#'   the Monte Carlo computation.
+#' @param startupMCiterations Integer: number of initial (burn-in)
+#'   Monte Carlo iterations. Default 3600.
+#' @param minMCiterations Integer: minimum number of Monte Carlo iterations
+#'   to be done. Default 0.
+#' @param maxMCiterations Integer: Do at most this many Monte Carlo iterations.
+#'   Default `Inf`.
+#' @param maxhours Numeric: approximate time limit, in hours, for the
+#'   Monte Carlo computation to last. Default `Inf`.
+#' @param ncheckpoints Integer: number of datapoints to use
+#'   for checking when the Monte Carlo computation should end.
+#'   If NULL (default), this is equal to number of variates + 2.
+#'   If Inf, use all datapoints.
+#' @param relerror Numeric: desired maximal relative error of calculated probabilities
+#'   with respect to their variability with new data.
+#' @param prior Logical: Calculate the prior distribution?
+#' @param thinning Integer: thin out the Monte Carlo samples by this value.
+#'   If NULL (default): let the diagnostics decide the thinning value.
+#' @param plottraces Logical: save plots of the Monte Carlo traces
+#'   of diagnostic values? Default `TRUE`.
+#' @param showKtraces Logical: save plots of the Monte Carlo traces
+#'   of the K parameter? Default `FALSE`.
+#' @param showAlphatraces Logical: save plots of the Monte Carlo traces
+#'   of the Alpha parameter? Default `FALSE`.
+#' @param hyperparams List: hyperparameters of the prior.
 #'
-#' @return name of directory containing output files, or Fdistribution object,
-#'   or empty
+#' @return Name of directory containing output files, or Fdistribution object,
+#'   or `NULL`, depending on argument `output`.
 #'
 #' @import parallel foreach doParallel doRNG nimble
 #'
@@ -109,7 +115,9 @@ inferpopulation <- function(
     cat('\n') # make sure possible error messages start on new line
 
     ## Set the RNG seed if given by user, or if no seed already exists
-    if (!missing(seed) || !exists('.Random.seed')) {set.seed(seed)}
+    if (!is.null(seed) || !missing(seed) || !exists('.Random.seed')) {
+        set.seed(seed)
+    }
     currentseed <- .Random.seed
 
 ##################################################
@@ -169,9 +177,9 @@ inferpopulation <- function(
     ## in an appropriate way
 
     if (!(missing(nsamples) && missing(nchains) &&
-          missing(nsamplesperchain)) &&
-        (!missing(nsamples) && !missing(nchains) &&
-         !missing(nsamplesperchain))) {
+              missing(nsamplesperchain)) &&
+            (!missing(nsamples) && !missing(nchains) &&
+                 !missing(nsamplesperchain))) {
         ## The user set all these three arguments or only one
         stop('Please specify exactly two among "nsamples", "nchains", "nsamplesperchain"')
     }
@@ -416,10 +424,10 @@ inferpopulation <- function(
             '-vrt', nrow(auxmetadata),
             '_dat',
             (if (npoints == 1 && all(is.na(data))) {
-                 0
-             } else {
-                 npoints
-             }),
+                0
+            } else {
+                npoints
+            }),
             ## '-K', ncomponents, # unimportant for user
             '_smp', nsamples)
     }
@@ -488,7 +496,7 @@ inferpopulation <- function(
                     lapply(seq_len(nrow(auxmetadata)),
                         function(ii){
                             if(auxmetadata[ii, 'mcmctype'] %in%
-                               c('R', 'C', 'D', 'L')) {
+                                   c('R', 'C', 'D', 'L')) {
                                 rnorm(n = ncheckpoints, mean = 0, sd = 2)
                             } else {
                                 sample(1:auxmetadata[ii, 'Nvalues'], ncheckpoints,
@@ -498,7 +506,7 @@ inferpopulation <- function(
                     ),
                     row.names = NULL,
                     col.names = auxmetadata$name)
-               )
+            )
             ## ## original alternative way. Delete soon
             ## testdata <- data.frame(1:3)[,-1]
             ## for(xx in seq_len(nrow(auxmetadata))) {
@@ -640,7 +648,7 @@ inferpopulation <- function(
     ## N: nominal
     ## B: binary
     ## number and names of variates of each type
-    vn <- vnames <- list(R=NULL, C=NULL, D=NULL, L=NULL, O=NULL, N=NULL, B=NULL)
+    vn <- vnames <- list(R=NULL, C=NULL, D=NULL, O=NULL, N=NULL, B=NULL)
 
     for (atype in names(vn)) {
         vnames[[atype]] <- auxmetadata[auxmetadata$mcmctype == atype, 'name']
@@ -847,21 +855,27 @@ inferpopulation <- function(
         'Starting Monte Carlo sampling of', nsamples, 'samples by',
         nchains, 'chains'
     )
-    cat('\nin a space of',
-        (sum(as.numeric(vn) * c(2, 2, 2, 2, 0, 0, 1)) +
-         sum(Nalpha0 > 2e-100) - nrow(Nalpha0) + 1 +
-         sum(Oalpha0 > 2e-100) - nrow(Oalpha0) + 1 ) * ncomponents - 1,
-        ## '(effectively',
-        ## paste0(
-        ## (sum(as.numeric(vn) * c(
-        ##     3 + npoints, 3 + npoints, 3 + npoints,
-        ##     3 + npoints, 0, 0, 1 + npoints
-        ## )) +
-        ## sum(Nalpha0 > 2e-100) +
-        ## nrow(Nalpha0) * (npoints - 1) + 1 +
-        ## sum(Oalpha0 > 2e-100) +
-        ## nrow(Oalpha0) * (npoints - 1) + 1 ) * ncomponents - 1 + nalpha - 1,
-        ## ')' ),
+
+    samplespacedims <- vn$R * 2 * ncomponents +
+        vn$C * 2 * ncomponents +
+        vn$D * 2 * ncomponents +
+        sum(apply(Oalpha0, 1, function(x) sum(x > 2e-17) - 1)) * ncomponents +
+        sum(apply(Nalpha0, 1, function(x) sum(x > 2e-17) - 1)) * ncomponents +
+        sum(apply(Nalpha0, 1, function(x) sum(x > 2e-17) - 1)) +
+        vn$B * ncomponents +
+        ncomponents - 1
+    samplespacexdims <- 1 + # Alpha
+        vn$R * ncomponents + # Rrate
+        vn$C * ncomponents + # Crate
+        vn$D * ncomponents + # Drate
+        1 + # K
+        vn$C * npoints * ncomponents + # latent C
+        vn$D * npoints * ncomponents + # latent D
+        sum(is.na(data)) # missing data
+
+
+    cat('\nin a space of', samplespacedims,
+        '(effectively', paste0(samplespacedims + samplespacexdims, ')'),
         'dimensions.\n')
 
     cat('Using', ncores, 'cores:',
@@ -974,14 +988,14 @@ inferpopulation <- function(
                 if (vn$C > 0) { # continuous closed domain
                     for (v in 1:Cn) {
                         Caux[d, v] ~ dconstraint(Clat[d, v] >= Cleft[d, v] &
-                                                 Clat[d, v] <= Cright[d, v])
+                                                     Clat[d, v] <= Cright[d, v])
                         Clat[d, v] ~ dnorm(mean = Cmean[v, K[d]], var = Cvar[v, K[d]])
                     }
                 }
                 if (vn$D > 0) { # continuous rounded
                     for (v in 1:Dn) {
                         Daux[d, v] ~ dconstraint(Dlat[d, v] >= Dleft[d, v] &
-                                                 Dlat[d, v] < Dright[d, v])
+                                                     Dlat[d, v] < Dright[d, v])
                         Dlat[d, v] ~ dnorm(mean = Dmean[v, K[d]], var = Dvar[v, K[d]])
                     }
                 }
@@ -1003,7 +1017,7 @@ inferpopulation <- function(
                     }
                 }
                 if (vn$B > 0) { # binary
-                    for (v in 1:Bn) {
+                    for (v in 1:Bn) { # Bprob is the probability that Bdata=1
                         Bdata[d, v] ~ dbern(prob = Bprob[v, K[d]])
                     }
                 }
@@ -1321,7 +1335,7 @@ inferpopulation <- function(
 
         ## replace Alpha's cat-sampler with slice
         if (Alphatoslice &&
-            !('Alpha' %in% targetslist[nameslist == 'posterior_predictive'])) {
+                !('Alpha' %in% targetslist[nameslist == 'posterior_predictive'])) {
             confnimble$replaceSampler(target='Alpha', type='slice')
             ## ## Old replacement method, didn't work in previous Nimble
             ## confnimble$removeSamplers('Alpha')
@@ -1425,7 +1439,7 @@ inferpopulation <- function(
         ## Inform user that compilation is done, if core 1:
         if (acore == 1) {
             print2user(paste0('\rCompiled core ', acore, '. ',
-                'Effective dimensions of sampled space: ',
+                'Number of samplers: ',
                 length(confnimble$samplerExecutionOrder), '.\n',
                 'Estimating remaining time, please be patient...'),
                 outcon)
@@ -1450,7 +1464,7 @@ inferpopulation <- function(
 
             ## calculate how the remaining time is allotted to remaining chains
             timeleft <- (maxhours -
-                         as.double(Sys.time() - timestart0, units = 'hours')) /
+                             as.double(Sys.time() - timestart0, units = 'hours')) /
                 (nchainspercore - achain + 1)
 
             showsamplertimes0 <- showsamplertimes && (achain == 1)
@@ -1503,10 +1517,10 @@ inferpopulation <- function(
                     niter = niter,
                     thin = 1,
                     thin2 = (if (showAlphatraces || showKtraces) {
-                                 max(1, round(niter / ncomponentsamples))
-                             } else {
-                                 max(2, floor(niter / 2))
-                             }),
+                        max(1, round(niter / ncomponentsamples))
+                    } else {
+                        max(2, floor(niter / 2))
+                    }),
                     nburnin = 0,
                     time = showsamplertimes0,
                     reset = reset,
@@ -1764,7 +1778,7 @@ inferpopulation <- function(
                 cat('\nWARNING: have to reduce thinning owing to time constraints\n')
                 tokeep <- round(seq(from = 1,
                     to = ncol(allmcsamples$W),
-                length.out = nsamplesperchain))
+                    length.out = nsamplesperchain))
             }
 
             saveRDS(mcsubset(allmcsamples, tokeep),