geco-bern · fabern · Sep 23, 2024 · Sep 16, 2024 · Sep 17, 2024 · Sep 18, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,7 @@
 
 export(calib_sofun)
 export(cost_likelihood_biomee)
-export(cost_likelihood_pmodel)
+export(cost_likelihood_phydromodel)
 export(cost_rmse_biomee)
 export(cost_rmse_pmodel)
 export(init_dates_dataframe)

diff --git a/R/calib_sofun.R b/R/calib_sofun.R
@@ -16,7 +16,7 @@
 #'   \item{\code{par}}{A list of model parameters. For each parameter, an initial value 
 #'   and lower and upper bounds should be provided. The calibratable parameters
 #'   include model parameters 'kphio', 'kphio_par_a', 'kphio_par_b', 'soilm_thetastar',
-#'   'soilm_betao', 'beta_costunitratio', 'rd_to_vcmax', 'tau_acclim', 'kc_jmax'
+#'   'beta_costunitratio', 'rd_to_vcmax', 'tau_acclim', 'kc_jmax'
 #'   and 'rootzone_whc' , and (if
 #'   doing Bayesian calibration) error parameters
 #'   for each target variable, named for example 'err_gpp'. This list must match
@@ -49,7 +49,6 @@
 #'   kphio_par_a        = 0,
 #'   kphio_par_b        = 1.0,
 #'   soilm_thetastar    = 0.6*240,
-#'   soilm_betao        = 0.01,
 #'   beta_unitcostratio = 146,
 #'   rd_to_vcmax        = 0.014,
 #'   tau_acclim         = 30,
@@ -120,7 +119,7 @@ calib_sofun <- function(
     # create bounds
     lower <- unlist(lapply(settings$par, function(x) x$lower))
     upper <- unlist(lapply(settings$par, function(x) x$upper))
-    pars <- unlist(lapply( settings$par, function(x) x$init))
+    pars  <- unlist(lapply(settings$par, function(x) x$init))
 
     out <- GenSA::GenSA(
       par   = pars,
@@ -149,15 +148,20 @@ calib_sofun <- function(
       eval(settings$metric)(par = par,
                             obs = obs,
                             drivers = drivers,
-                            ...)
+                            ...) # the dots contain: par_fixed, targets, parallel, ncores
     }
 
     # reformat parameters
-    pars <- as.data.frame(do.call("rbind", settings$par), row.names = FALSE)
-
-    priors  <- BayesianTools::createTruncatedNormalPrior(
-      unlist(pars$mean),
-      unlist(pars$sd),
+    pars <- as.data.frame(do.call("rbind", settings$par)) # use rownames later on
+
+    # priors  <- BayesianTools::createTruncatedNormalPrior(
+    #   unlist(pars$mean),   # NOTE: This needs a value otherwise: Error in `parallelSampler(1000)`: sampler provided doesn't work
+    #   unlist(pars$sd),     # NOTE: This needs a value otherwise: Error in `parallelSampler(1000)`: sampler provided doesn't work
+    #   unlist(pars$lower),          # As a workaround BayesianTools::createUniformPrior could be used
+    #   unlist(pars$upper)
+    #   # unlist(pars$init)
+    # )
+    priors  <- BayesianTools::createUniformPrior( # workaround for TruncatedNormalPrior, this does not require mean and sd
       unlist(pars$lower),
       unlist(pars$upper)
       # unlist(pars$init)
@@ -166,24 +170,16 @@ calib_sofun <- function(
     # setup the bayes run, no message forwarding is provided
     # so wrap the function in a do.call
     setup <- BayesianTools::createBayesianSetup(
-      likelihood = function(
-    random_par) {
-        # cost(
-        #   par = random_par,
-        #   obs = obs,
-        #   drivers = drivers,
-        #   ...
-        # )
+      likelihood = function(random_par) {
         do.call("cost",
                 list(
-                  par = random_par,
+                  par = setNames(random_par, rownames(pars)),
                   obs = obs,
                   drivers = drivers
-                ))
-      },
-    prior = priors,
-    names = names(settings$par)
-    )    
+                ))},
+      prior = priors,
+      names = rownames(pars)
+    )
 
     # set bt control parameters
     bt_settings <- settings$control$settings
@@ -194,7 +190,7 @@ calib_sofun <- function(
       sampler = settings$control$sampler,
       settings = bt_settings
     )
-    
+
     # drop last value
     bt_par <- BayesianTools::MAP(out)$parametersMAP
     bt_par <- bt_par[1:(length(bt_par))]

diff --git a/R/cost_likelihood_phydro.R b/R/cost_likelihood_phydro.R
@@ -1,4 +1,75 @@
-cost_likelihood_phydro <- function(
+#' Cost function computing a log-likelihood for calibration of Phydro-model
+#' parameters
+#' 
+#' The cost function performs a Phydro-model run for the input drivers and model parameter
+#' values, and computes the outcome's normal log-likelihood centered at the input
+#' observed values and with standard deviation given as an input parameter 
+#' (calibratable).
+#' 
+#' @param par A vector of values for the parameters to be calibrated, including
+#' a subset of model parameters (described in \code{\link{runread_pmodel_f}}),
+#' in order, and error terms
+#' for each target variable (for example \code{'gpp_err'}), in the same order as
+#' the targets appear in \code{targets}.
+#' @param obs A nested data.frame of observations, with columns \code{'sitename'}
+#' and \code{'data'} (see \code{\link{p_model_validation}} or \code{\link{p_model_validation_vcmax25}}
+#' to check their structure). 
+#' @param drivers A nested data.frame of driver data. See \code{\link{p_model_drivers}}
+#' for a description of the data structure.
+#' @param targets A character vector indicating the target variables for which the
+#' optimization will be done and the RMSE computed. This string must be a column 
+#' name of the \code{data} data.frame belonging to the validation nested data.frame 
+#' (for example 'gpp').
+#' @param par_fixed A named list of model parameter values to keep fixed during the
+#' calibration. These should complement the input \code{par} such that all model
+#' parameters are passed on to \code{\link{runread_pmodel_f}}.
+#' @param parallel A logical specifying whether simulations are to be parallelised
+#' (sending data from a certain number of sites to each core). Defaults to
+#' \code{FALSE}.
+#' @param ncores An integer specifying the number of cores used for parallel
+#' computing. Defaults to 2.
+#' 
+#' @return The log-likelihood of the observed target values, assuming that they
+#' are independent, normally distributed and centered on the predictions
+#' made by the P-model run with standard deviation given as input (via `par` because
+#' the error terms are estimated through the calibration with `BayesianTools`,
+#' as shown in the "Parameter calibration and cost functions" vignette).
+#' 
+#' @details To run the P-model, all model parameters must be given. The cost
+#' function uses arguments \code{par} and \code{par_fixed} such that, in the
+#' calibration routine, \code{par} can be updated by the optimizer and 
+#' \code{par_fixed} are kept unchanged throughout calibration.
+#' 
+#' If the validation data contains a "date" column (fluxes), the simulated target time series
+#' is compared to the observed values on those same dates (e.g. for GPP). Otherwise, 
+#' there should only be one observed value per site (leaf traits), and the outputs 
+#' (averaged over the growing season, weighted by predicted GPP) will be 
+#' compared to this single value representative of the site (e.g. Vcmax25). As an exception,
+#' when the date of a trait measurement is available, it will be compared to the 
+#' trait value predicted on that date.
+#' 
+#' @export
+#' 
+#' @examples
+#' # Compute the likelihood for a set of 
+#' # model parameter values involved in the
+#' # temperature dependence of kphio 
+#' # and example data
+#' cost_likelihood_phydromodel(
+#'  par = c(0.05, -0.01, 1,     # model parameters
+#'          2),                # err_gpp
+#'  obs = p_model_validation,
+#'  drivers = p_model_drivers,
+#'  targets = c('gpp'),
+#'  par_fixed = list(
+#'   soilm_thetastar    = 0.6 * 240,  # old setup with soil moisture stress
+#'   beta_unitcostratio = 146.0,
+#'   rd_to_vcmax        = 0.014,      # from Atkin et al. 2015 for C3 herbaceous
+#'   tau_acclim         = 30.0,
+#'   kc_jmax            = 0.41
+#'  )
+#' )
+cost_likelihood_phydromodel <- function(
     par,   # model parameters & error terms for each target
     obs,
     drivers,
@@ -7,53 +78,46 @@ cost_likelihood_phydro <- function(
     parallel = FALSE,
     ncores = 2
 ){
+  # NOTE(fabian): These different cost functions share a LOT of code in common. Consider consolidation for maintainability?
+
   # predefine variables for CRAN check compliance
   sitename <- data <- gpp_mod <- NULL
 
-  using_phydro = drivers$params_siml[[1]]$use_phydro
-
-  # FIXME Jaideep: Instead of checking the number of params, 
-  #    it might be better to check for presence of each param in par and par_fixed
-  ## check input parameters
-  expected_params = ifelse(using_phydro, yes=14, no=10)
-  if( (length(par) + length(par_fixed)) != (expected_params + length(targets)) ){
-    stop(paste0('Error: Input calibratable and fixed parameters (par = ',length(par),' and par_fixed = ',length(par_fixed),')
-    do not match length of the required P-model parameters (',expected_params + length(targets),').'))
+  if (!("use_phydro" %in% colnames(drivers$params_siml[[1]]))){
+    warning("Parameter use_phydro not set. Assuming FALSE")
+    using_phydro = FALSE
+  } else {
+    using_phydro = drivers$params_siml[[1]]$use_phydro
   }
 
-
-  ## define parameter set based on calibrated parameters
-  if (using_phydro){
-    calib_param_names <- c('kphio', 'kphio_par_a', 'kphio_par_b',
-                           'rd_to_vcmax', 'tau_acclim', 'kc_jmax',
-                           'phydro_K_plant', 'phydro_p50_plant', 'phydro_b_plant',
-                           'phydro_alpha', 'phydro_gamma',
-                           'bsoil', 'Ssoil', 'whc')
+  ## define required parameter set based on calibrated parameters
+  if (!using_phydro){
+    required_param_names <- rsofun:::required_param_names$p_model
   } else {
-    calib_param_names <- c('kphio', 'kphio_par_a', 'kphio_par_b',
-                           'soilm_thetastar', 'soilm_betao',
-                           'beta_unitcostratio', 'rd_to_vcmax', 
-                           'tau_acclim', 'kc_jmax', 'whc')
+    required_param_names <- rsofun:::required_param_names$phydro_model
   }
 
-  # FIXME Jaideep: Here it is assumed that the params in par will appear in exactly the same order in settings as in the list above. Better to do this in an order-independent way.
-  if(!is.null(par_fixed)){
-    params_modl <- list()
-    # complete with calibrated values
-    i <- 1 # start counter
-    for(par_name in calib_param_names){
-      if(is.null(par_fixed[[par_name]])){
-        params_modl[[par_name]] <- par[i]   # use calibrated par value
-        i <- i + 1                          # counter of calibrated params
-      }else{
-        params_modl[[par_name]] <- par_fixed[[par_name]]  # use fixed par value
-      }
-    }
-  }else{
-    params_modl <- as.list(par[1:expected_params])       # all parameters calibrated
-    names(params_modl) <- calib_param_names
+  ## split calibrated parameters into model and error parameters
+  par_calibrated_model      <- par[ ! names(par) %in% c("err_gpp") ] # consider only model parameters for the check
+  # par_calibrated_errormodel <- par[   names(par) %in% c("err_gpp") ]
+  # par_fixed
+
+  ## check parameters
+  if (!identical(sort(c(names(par_calibrated_model), names(par_fixed))), required_param_names)){
+    stop(sprintf(paste0("Error: Input calibratable and fixed parameters do not ",
+                        "match required model parameters:",
+                        "\n         par:       c(%s)",
+                        "\n         par_fixed: c(%s)",
+                        "\n         required:  c(%s)"),
+                 paste0(sort(names(par_calibrated_model)), collapse = ", "),
+                 paste0(sort(names(par_fixed)), collapse = ", "),
+                 paste0(sort(required_param_names), collapse = ", ")))
   }
 
+  # Combine fixed and estimated params to result in all the params required to run the model
+  # This basically uses all params except those of the error model of the observations
+  params_modl <- c(par, par_fixed)[required_param_names]
+
   ## run the model
   df <- runread_pmodel_f(
     drivers,

diff --git a/R/cost_likelihood_pmodel.R b/R/cost_likelihood_pmodel.R
@@ -63,7 +63,6 @@
 #'  targets = c('gpp'),
 #'  par_fixed = list(
 #'   soilm_thetastar    = 0.6 * 240,  # old setup with soil moisture stress
-#'   soilm_betao        = 0.0,
 #'   beta_unitcostratio = 146.0,
 #'   rd_to_vcmax        = 0.014,      # from Atkin et al. 2015 for C3 herbaceous
 #'   tau_acclim         = 30.0,
@@ -80,6 +79,8 @@ cost_likelihood_pmodel <- function(
     parallel = FALSE,
     ncores = 2
 ){
+  # NOTE(fabian): These different cost functions share a LOT of code in common. Consider consolidation for maintainability?
+
   # predefine variables for CRAN check compliance
   sitename <- data <- gpp_mod <- NULL
 
@@ -90,48 +91,34 @@ cost_likelihood_pmodel <- function(
     using_phydro = drivers$params_siml[[1]]$use_phydro
   }
 
-  # FIXME Jaideep: Instead of checking the number of params, 
-  #    it might be better to check for presence of each param in par and par_fixed
-  ## check input parameters
-  expected_params = ifelse(using_phydro, yes=14, no=10)
-  if( (length(par) + length(par_fixed)) != (expected_params + length(targets)) ){
-    stop(paste0('Error: Input calibratable and fixed parameters (par = ',length(par),' and par_fixed = ',length(par_fixed),')
-    do not match length of the required P-model parameters (',expected_params + length(targets),').'))
-  }
-
-
-  ## define parameter set based on calibrated parameters
-  if (using_phydro){
-    calib_param_names <- c('kphio', 'kphio_par_a', 'kphio_par_b',
-                           'rd_to_vcmax', 'tau_acclim', 'kc_jmax',
-                           'phydro_K_plant', 'phydro_p50_plant', 'phydro_b_plant',
-                           'phydro_alpha', 'phydro_gamma',
-                           'bsoil', 'Ssoil', 'whc')
+  ## define required parameter set based on model parameters
+  if (!using_phydro){
+    required_param_names <- rsofun:::required_param_names$p_model
   } else {
-    calib_param_names <- c('kphio', 'kphio_par_a', 'kphio_par_b',
-                           'soilm_thetastar', 'soilm_betao',
-                           'beta_unitcostratio', 'rd_to_vcmax', 
-                           'tau_acclim', 'kc_jmax', 'whc')
+    required_param_names <- rsofun:::required_param_names$phydro_model
   }
 
-  # FIXME Jaideep: Here it is assumed that the params in par will appear in exactly the same order in settings as in the list above. Better to do this in an order-independent way.
-  if(!is.null(par_fixed)){
-    params_modl <- list()
-    # complete with calibrated values
-    i <- 1 # start counter
-    for(par_name in calib_param_names){
-      if(is.null(par_fixed[[par_name]])){
-        params_modl[[par_name]] <- par[i]   # use calibrated par value
-        i <- i + 1                          # counter of calibrated params
-      }else{
-        params_modl[[par_name]] <- par_fixed[[par_name]]  # use fixed par value
-      }
-    }
-  }else{
-    params_modl <- as.list(par[1:expected_params])       # all parameters calibrated
-    names(params_modl) <- calib_param_names
+  ## split calibrated parameters into model and error parameters
+  par_calibrated_model      <- par[ ! names(par) %in% c("err_gpp", "err_vcmax25") ] # consider only model parameters for the check
+  # par_calibrated_errormodel <- par[   names(par) %in% c("err_gpp", "err_vcmax25") ]
+  # par_fixed
+
+  ## check parameters
+  if (!identical(sort(c(names(par_calibrated_model), names(par_fixed))), required_param_names)){
+    stop(sprintf(paste0("Error: Input calibratable and fixed parameters do not ",
+                        "match required model parameters:",
+                        "\n         par:       c(%s)",
+                        "\n         par_fixed: c(%s)",
+                        "\n         required:  c(%s)"),
+                 paste0(sort(names(par_calibrated_model)), collapse = ", "),
+                 paste0(sort(names(par_fixed)), collapse = ", "),
+                 paste0(sort(required_param_names), collapse = ", ")))
   }
 
+  # Combine fixed and estimated params to result in all the params required to run the model
+  # This basically uses all params except those of the error model of the observations
+  params_modl <- c(par, par_fixed)[required_param_names]
+
   ## run the model
   df <- runread_pmodel_f(
     drivers,
@@ -232,6 +219,8 @@ cost_likelihood_pmodel <- function(
   }) |>
     unlist() |>
     sum()
+  # TODO(fabian): make above ll more robust by using advantages of named vector `par`
+  #               instead of relying on an expected order
 
   # trap boundary conditions
   if(is.nan(ll) | is.na(ll) | ll == 0){ll <- -Inf}