Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 4142baf
Author: choisant <[email protected]>
Date:   Thu Aug 15 12:43:01 2024 +0200

    Update testing setup

commit a1f0c5c
Author: pglpm <[email protected]>
Date:   Tue Aug 13 06:16:19 2024 +0200

    fix formatting

commit f869b93
Author: pglpm <[email protected]>
Date:   Mon Aug 12 12:40:10 2024 +0200

    update docs of some functions

commit 93eb98f
Author: pglpm <[email protected]>
Date:   Mon Aug 12 07:52:46 2024 +0200

    report number of samples in inferpopulation

commit 7b304df
Author: pglpm <[email protected]>
Date:   Mon Aug 12 06:54:09 2024 +0200

    correct reported number of dims in inferpopulation

commit 11a7b7b
Author: pglpm <[email protected]>
Date:   Mon Aug 12 06:37:09 2024 +0200

    modular-rewrite calculation of probabilities III

commit ad80dd9
Author: pglpm <[email protected]>
Date:   Mon Aug 12 06:29:27 2024 +0200

    modular-rewrite calculation of probabilities

commit 7693f57
Author: pglpm <[email protected]>
Date:   Sun Aug 11 23:05:20 2024 +0200

    modular-rewrite calculation of probabilities

commit 1889f69
Merge: e42be40 ee8ffad
Author: pglpm <[email protected]>
Date:   Sat Aug 10 13:46:24 2024 +0200

    general update of mutualinfo and consistency tests

commit e42be40
Author: pglpm <[email protected]>
Date:   Sat Aug 10 13:43:10 2024 +0200

    update custom tests

commit 4edb765
Author: pglpm <[email protected]>
Date:   Sat Aug 10 13:27:14 2024 +0200

    consistency-check mutualinfo() and samplesFDistribution()

commit 9c5715d
Author: pglpm <[email protected]>
Date:   Sat Aug 10 11:12:21 2024 +0200

    rewrite mutualinfo(); create correlated dataset; move files

commit cbe310d
Author: pglpm <[email protected]>
Date:   Sat Aug 10 00:48:32 2024 +0200

    fix bugs in samplesFdistribution()

commit 5c8ba4a
Author: pglpm <[email protected]>
Date:   Fri Aug 9 16:16:58 2024 +0200

    fix bugs in plotFsamples()

commit ee8ffad
Author: Aurora <[email protected]>
Date:   Fri Aug 9 11:00:30 2024 +0200

    Update apptainer.def

    Typo
  • Loading branch information
choisant committed Aug 15, 2024
1 parent c8a035f commit 2ff5cb3
Show file tree
Hide file tree
Showing 73 changed files with 2,863 additions and 2,044 deletions.
44 changes: 24 additions & 20 deletions R/buildmetadata.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
#' Build preliminary metadata flie
#' Build a preliminary metadata file from a given dataset
#'
#' @param data data.frame object or filepath
#' @param file string: name of output metadata file;
#' NULL: output metadata as data.frame
#' @param includevrt string or NULL: name variates in data to be included
#' @param excludevrt string or NULL: name variates in data to be excluded
#' @param addsummary2metadata Bool: also output some diagnostic statistics
#' in the metadata?
#' @param backupfiles Bool: rename previous metadata file if it exists?
#' @param verbose Bool: output heuristics for each variate, default TRUE
#' @param data A dataset, given as a \code{\link[base]{data.frame}}
#' or as a file path to a csv file.
#' @param file String: name of csv file where the metadata should be saved;
#' if `NULL`: output metadata as `VALUE`.
#' @param includevrt Character or `NULL`: name of variates in dataset to be included.
#' @param excludevrt Character or `NULL`: name of variates in dataset to be excluded.
#' @param addsummary2metadata Logical: also output some diagnostic statistics
#' in the metadata? Default `FALSE`
#' @param backupfiles Logical: rename previous metadata file if it exists?
#' Default `TRUE`
#' @param verbose Logical: output heuristics for each variate? Default `TRUE`.
#'
#' @return nothing or data.table object
#' @return If `file = NULL`, a preliminary metadata file is created
#' and `VALUE` is `NULL`;
#' otherwise `VALUE` is a data.frame containing the metadata.
#'
#' @export
buildmetadata <- function(
Expand Down Expand Up @@ -116,7 +120,7 @@ buildmetadata <- function(
## If the values are strings,
## check whether they can be reinterpreted as numeric
if(!is.numeric(unx) &&
!any(is.na(suppressWarnings(as.numeric(unx))))) {
!any(is.na(suppressWarnings(as.numeric(unx))))) {
x <- as.numeric(x)
unx <- sort(as.numeric(unx))
}
Expand Down Expand Up @@ -156,8 +160,8 @@ buildmetadata <- function(
ljumpquantum <- gcd(ljumpsx * lmulti) / lmulti
if (!(
ljumpquantum / lrangex < 1e-5 || # no visible gaps between values
(ljumpquantum / lrangex < 1e-3 &&
maxrep <= 100) # visible gaps, but not many value repetitions
(ljumpquantum / lrangex < 1e-3 &&
maxrep <= 100) # visible gaps, but not many value repetitions
)) {
warninglist <- c(warninglist,
paste0('\n* "', name, '" variate',
Expand Down Expand Up @@ -196,9 +200,9 @@ buildmetadata <- function(
}

} else if (!is.numeric(x) &&
!any(sapply(ordinalkeywords, function(keyw){
grepl(keyw, datavalues, fixed = TRUE)
}))) {
!any(sapply(ordinalkeywords, function(keyw){
grepl(keyw, datavalues, fixed = TRUE)
}))) {
## Nominal variate? (non-numeric values)
type <- 'nominal'
Nvalues <- uniquex
Expand Down Expand Up @@ -256,7 +260,7 @@ buildmetadata <- function(
#### Now we know that the variate is numeric

} else if (uniquex <= 10 &&
(all(x == round(x)) || jumpquantum == round(jumpquantum))) {
(all(x == round(x)) || jumpquantum == round(jumpquantum))) {
## Ordinal variate with few numeric values?
type <- 'ordinal'
Nvalues <- uniquex
Expand Down Expand Up @@ -362,8 +366,8 @@ buildmetadata <- function(
## are included in the domain or not
if (!(
jumpquantum / rangex < 1e-5 || # no visible gaps between values
(jumpquantum / rangex < 1e-3 &&
maxrep <= 100) # visible gaps, but not many value repetitions
(jumpquantum / rangex < 1e-3 &&
maxrep <= 100) # visible gaps, but not many value repetitions
)) {
roundedflag <- TRUE
## The variate seems to have quantized differences
Expand Down
186 changes: 100 additions & 86 deletions R/inferpopulation.R
Original file line number Diff line number Diff line change
@@ -1,55 +1,61 @@
#' Monte Carlo computation of posterior distribution of population frequencies
#'
#' @param data data.frame object or filepath: datapoints
#' @param metadata Either the name of the csv file containing metadata
#' of the current dataset, or a data.frame with the metadata
#' @param auxdata NULL or data.frame object or filepath: extra datapoints
#' which would be too many to use in the Monte Carlo sampling,
#' but can be used to calculate hyperparameters
#' @param outputdir String, path to output file folder ## Rename to
#' outputPrefix, also addSuffix?
#' @param nsamples Integer, nr of desired MC samples
#' @param nchains Integer, nr of MC chains
#' @param nsamplesperchain Integer, nr of MC samples per chain
#' @param parallel, logical or numeric: whether to use pre-existing parallel
#' workers, or how many to create and use
#' @param seed Integer: seed for random number generator. If left as default
#' NULL, a random seed based on the system clock is used in the
#' set.seed() function
#' @param cleanup logical, default TRUE, removes files that can be used for
#' debugging
#' @param appendtimestamp logical, default TRUE: add timestamp to name of
#' output directory
#' @param appendinfo logical, default TRUE: add some simulation information
#' to name of output directory
#' @param output if string 'directory', return the output directory name;
#' if string 'mcoutput', return the 'Fdistribution' object;
#' anything else, no output
#' @param subsampledata Numeric: use only a subsample of the datapoints in 'data'
#' @param startupMCiterations Number of initial (burn-in) MC iterations
#' @param minMCiterations Minimum number of MC iterations to be done
#' @param maxMCiterations Maximum number of MC iterations
#' @param maxhours Maximum (approximately) time in hours for MC computation
#' @param ncheckpoints NULL (default), positive integer, or Inf:
#' number of datapoints to use for stopping the sampling;
#' if NULL, equal to number of variates + 2; if Inf, use all datapoints
#' @param relerror positive real: relative error of calculated probabilities
#' with respect to their variability with new data. It is only approximate
#' @param prior logical: Calculate the prior distribution of F?
#' @param thinning If NULL, let the diagnostics decide the MC thinning;
#' if positive, use this thinning value
#' @param plottraces logical: plot MC traces of diagnostic values
#' @param showKtraces logical, when true, it saves the K parameter during
#' sampling and plots its trace and histogram at the end. Keeping it to
#' FALSE (default) saves a little computation time.
#' @param showAlphatraces logical, : when true, it saves the Alpha parameter
#' more frequently during sampling and plots its trace and histogram at
#' the end. Keeping it to FALSE (default) saves a little computation
#' time.
#' @param hyperparams list of hyperparameters (used for debugging)
#' @param data A dataset, given as a \code{\link[base]{data.frame}}
#' or as a file path to a csv file.
#' @param metadata A metadata object, given either as a data.frame object,
#' or as a file pa to a csv file.
#' @param auxdata A larger dataset, given as a data.frame
#' or as a file path to a csv file. Such a dataset
#' would be too many to use in the Monte Carlo sampling,
#' but can be used to calculate hyperparameters.
#' @param outputdir Character: path to folder where the output should be saved.
#' @param nsamples Integer: number of desired Monte Carlo samples. Default 3600.
#' @param nchains Integer: number of Monte Carlo chains. Default 60.
#' @param nsamplesperchain Integer: number of Monte Carlo samples per chain.
#' @param parallel Logical: use pre-existing parallel workers from package
#' \code{\link[doParallel]{doParallel}}.
#' Or integer: create and use that many parallel workers.
#' @param seed Integer: use this seed for the random number generator.
#' If missing or `NULL` (default), do not set the seed.
#' @param cleanup Logical: remove diagnostic files at the end of the computation?
#' Default `TRUE`.
#' @param appendtimestamp Logical: append a timestamp to the name of
#' the output directory `outputdir`? Default `TRUE`.
#' @param appendinfo Logical: append information about dataset and Monte Carlo
#' parameters to the name of the output directory `outputdir`? Default `TRUE`.
#' @param output Character: if `'directory'`, return the output directory name
#' as `VALUE`; if string `'mcoutput'`, return the `'Fdistribution'` object
#' containing the parameters obtained from the Monte Carlo computation.
#' Any other value: `VALUE` is `NULL`.
#' @param subsampledata Integer: use only a subset of this many datapoints for
#' the Monte Carlo computation.
#' @param startupMCiterations Integer: number of initial (burn-in)
#' Monte Carlo iterations. Default 3600.
#' @param minMCiterations Integer: minimum number of Monte Carlo iterations
#' to be done. Default 0.
#' @param maxMCiterations Integer: Do at most this many Monte Carlo iterations.
#' Default `Inf`.
#' @param maxhours Numeric: approximate time limit, in hours, for the
#' Monte Carlo computation to last. Default `Inf`.
#' @param ncheckpoints Integer: number of datapoints to use
#' for checking when the Monte Carlo computation should end.
#' If NULL (default), this is equal to number of variates + 2.
#' If Inf, use all datapoints.
#' @param relerror Numeric: desired maximal relative error of calculated probabilities
#' with respect to their variability with new data.
#' @param prior Logical: Calculate the prior distribution?
#' @param thinning Integer: thin out the Monte Carlo samples by this value.
#' If NULL (default): let the diagnostics decide the thinning value.
#' @param plottraces Logical: save plots of the Monte Carlo traces
#' of diagnostic values? Default `TRUE`.
#' @param showKtraces Logical: save plots of the Monte Carlo traces
#' of the K parameter? Default `FALSE`.
#' @param showAlphatraces Logical: save plots of the Monte Carlo traces
#' of the Alpha parameter? Default `FALSE`.
#' @param hyperparams List: hyperparameters of the prior.
#'
#' @return name of directory containing output files, or Fdistribution object,
#' or empty
#' @return Name of directory containing output files, or Fdistribution object,
#' or `NULL`, depending on argument `output`.
#'
#' @import parallel foreach doParallel doRNG nimble
#'
Expand Down Expand Up @@ -109,7 +115,9 @@ inferpopulation <- function(
cat('\n') # make sure possible error messages start on new line

## Set the RNG seed if given by user, or if no seed already exists
if (!missing(seed) || !exists('.Random.seed')) {set.seed(seed)}
if (!is.null(seed) || !missing(seed) || !exists('.Random.seed')) {
set.seed(seed)
}
currentseed <- .Random.seed

##################################################
Expand Down Expand Up @@ -169,9 +177,9 @@ inferpopulation <- function(
## in an appropriate way

if (!(missing(nsamples) && missing(nchains) &&
missing(nsamplesperchain)) &&
(!missing(nsamples) && !missing(nchains) &&
!missing(nsamplesperchain))) {
missing(nsamplesperchain)) &&
(!missing(nsamples) && !missing(nchains) &&
!missing(nsamplesperchain))) {
## The user set all these three arguments or only one
stop('Please specify exactly two among "nsamples", "nchains", "nsamplesperchain"')
}
Expand Down Expand Up @@ -416,10 +424,10 @@ inferpopulation <- function(
'-vrt', nrow(auxmetadata),
'_dat',
(if (npoints == 1 && all(is.na(data))) {
0
} else {
npoints
}),
0
} else {
npoints
}),
## '-K', ncomponents, # unimportant for user
'_smp', nsamples)
}
Expand Down Expand Up @@ -488,7 +496,7 @@ inferpopulation <- function(
lapply(seq_len(nrow(auxmetadata)),
function(ii){
if(auxmetadata[ii, 'mcmctype'] %in%
c('R', 'C', 'D', 'L')) {
c('R', 'C', 'D', 'L')) {
rnorm(n = ncheckpoints, mean = 0, sd = 2)
} else {
sample(1:auxmetadata[ii, 'Nvalues'], ncheckpoints,
Expand All @@ -498,7 +506,7 @@ inferpopulation <- function(
),
row.names = NULL,
col.names = auxmetadata$name)
)
)
## ## original alternative way. Delete soon
## testdata <- data.frame(1:3)[,-1]
## for(xx in seq_len(nrow(auxmetadata))) {
Expand Down Expand Up @@ -640,7 +648,7 @@ inferpopulation <- function(
## N: nominal
## B: binary
## number and names of variates of each type
vn <- vnames <- list(R=NULL, C=NULL, D=NULL, L=NULL, O=NULL, N=NULL, B=NULL)
vn <- vnames <- list(R=NULL, C=NULL, D=NULL, O=NULL, N=NULL, B=NULL)

for (atype in names(vn)) {
vnames[[atype]] <- auxmetadata[auxmetadata$mcmctype == atype, 'name']
Expand Down Expand Up @@ -847,21 +855,27 @@ inferpopulation <- function(
'Starting Monte Carlo sampling of', nsamples, 'samples by',
nchains, 'chains'
)
cat('\nin a space of',
(sum(as.numeric(vn) * c(2, 2, 2, 2, 0, 0, 1)) +
sum(Nalpha0 > 2e-100) - nrow(Nalpha0) + 1 +
sum(Oalpha0 > 2e-100) - nrow(Oalpha0) + 1 ) * ncomponents - 1,
## '(effectively',
## paste0(
## (sum(as.numeric(vn) * c(
## 3 + npoints, 3 + npoints, 3 + npoints,
## 3 + npoints, 0, 0, 1 + npoints
## )) +
## sum(Nalpha0 > 2e-100) +
## nrow(Nalpha0) * (npoints - 1) + 1 +
## sum(Oalpha0 > 2e-100) +
## nrow(Oalpha0) * (npoints - 1) + 1 ) * ncomponents - 1 + nalpha - 1,
## ')' ),

samplespacedims <- vn$R * 2 * ncomponents +
vn$C * 2 * ncomponents +
vn$D * 2 * ncomponents +
sum(apply(Oalpha0, 1, function(x) sum(x > 2e-17) - 1)) * ncomponents +
sum(apply(Nalpha0, 1, function(x) sum(x > 2e-17) - 1)) * ncomponents +
sum(apply(Nalpha0, 1, function(x) sum(x > 2e-17) - 1)) +
vn$B * ncomponents +
ncomponents - 1
samplespacexdims <- 1 + # Alpha
vn$R * ncomponents + # Rrate
vn$C * ncomponents + # Crate
vn$D * ncomponents + # Drate
1 + # K
vn$C * npoints * ncomponents + # latent C
vn$D * npoints * ncomponents + # latent D
sum(is.na(data)) # missing data


cat('\nin a space of', samplespacedims,
'(effectively', paste0(samplespacedims + samplespacexdims, ')'),
'dimensions.\n')

cat('Using', ncores, 'cores:',
Expand Down Expand Up @@ -974,14 +988,14 @@ inferpopulation <- function(
if (vn$C > 0) { # continuous closed domain
for (v in 1:Cn) {
Caux[d, v] ~ dconstraint(Clat[d, v] >= Cleft[d, v] &
Clat[d, v] <= Cright[d, v])
Clat[d, v] <= Cright[d, v])
Clat[d, v] ~ dnorm(mean = Cmean[v, K[d]], var = Cvar[v, K[d]])
}
}
if (vn$D > 0) { # continuous rounded
for (v in 1:Dn) {
Daux[d, v] ~ dconstraint(Dlat[d, v] >= Dleft[d, v] &
Dlat[d, v] < Dright[d, v])
Dlat[d, v] < Dright[d, v])
Dlat[d, v] ~ dnorm(mean = Dmean[v, K[d]], var = Dvar[v, K[d]])
}
}
Expand All @@ -1003,7 +1017,7 @@ inferpopulation <- function(
}
}
if (vn$B > 0) { # binary
for (v in 1:Bn) {
for (v in 1:Bn) { # Bprob is the probability that Bdata=1
Bdata[d, v] ~ dbern(prob = Bprob[v, K[d]])
}
}
Expand Down Expand Up @@ -1321,7 +1335,7 @@ inferpopulation <- function(

## replace Alpha's cat-sampler with slice
if (Alphatoslice &&
!('Alpha' %in% targetslist[nameslist == 'posterior_predictive'])) {
!('Alpha' %in% targetslist[nameslist == 'posterior_predictive'])) {
confnimble$replaceSampler(target='Alpha', type='slice')
## ## Old replacement method, didn't work in previous Nimble
## confnimble$removeSamplers('Alpha')
Expand Down Expand Up @@ -1425,7 +1439,7 @@ inferpopulation <- function(
## Inform user that compilation is done, if core 1:
if (acore == 1) {
print2user(paste0('\rCompiled core ', acore, '. ',
'Effective dimensions of sampled space: ',
'Number of samplers: ',
length(confnimble$samplerExecutionOrder), '.\n',
'Estimating remaining time, please be patient...'),
outcon)
Expand All @@ -1450,7 +1464,7 @@ inferpopulation <- function(

## calculate how the remaining time is allotted to remaining chains
timeleft <- (maxhours -
as.double(Sys.time() - timestart0, units = 'hours')) /
as.double(Sys.time() - timestart0, units = 'hours')) /
(nchainspercore - achain + 1)

showsamplertimes0 <- showsamplertimes && (achain == 1)
Expand Down Expand Up @@ -1503,10 +1517,10 @@ inferpopulation <- function(
niter = niter,
thin = 1,
thin2 = (if (showAlphatraces || showKtraces) {
max(1, round(niter / ncomponentsamples))
} else {
max(2, floor(niter / 2))
}),
max(1, round(niter / ncomponentsamples))
} else {
max(2, floor(niter / 2))
}),
nburnin = 0,
time = showsamplertimes0,
reset = reset,
Expand Down Expand Up @@ -1764,7 +1778,7 @@ inferpopulation <- function(
cat('\nWARNING: have to reduce thinning owing to time constraints\n')
tokeep <- round(seq(from = 1,
to = ncol(allmcsamples$W),
length.out = nsamplesperchain))
length.out = nsamplesperchain))
}

saveRDS(mcsubset(allmcsamples, tokeep),
Expand Down
Loading

0 comments on commit 2ff5cb3

Please sign in to comment.