diff --git a/R/ExperimentSetup.R b/R/ExperimentSetup.R
index 5721cfc0..ec43dce5 100644
--- a/R/ExperimentSetup.R
+++ b/R/ExperimentSetup.R
@@ -189,6 +189,12 @@ extract_experimental_setup <- function(
   # Suppress NOTES due to non-standard evaluation in data.table
   main_data_id <- NULL
 
+  ...get_n_samples <- function(x, type) {
+    if (is_empty(x[[type]])) return(0L)
+
+    return(nrow(x[[type]]))
+  }
+
   # Add perturbation level.
   section_table[, "perturbation_level" := 1L]
   for (data_id in section_table$main_data_id) {
@@ -201,16 +207,18 @@ extract_experimental_setup <- function(
     section_table[main_data_id == data_id, "n_runs" := length(iteration_list[[as.character(data_id)]]$run)]
   }
 
-  # Set the (max) number of available validation instances.
+  # Determine the number of instances available for development and validation.
   for (data_id in section_table$main_data_id) {
-    section_table[main_data_id == data_id, "max_validation_instances" := max(sapply(
-      iteration_list[[as.character(data_id)]]$run,
-      function(x) {
-        if (is_empty(x$valid_samples)) return(0L)
-
-        return(nrow(x$valid_samples))
-      }
-    ))]
+
+    n_run_training_samples <- sapply(iteration_list[[as.character(data_id)]]$run, ...get_n_samples, type = "train_samples")
+    n_run_validation_samples <- sapply(iteration_list[[as.character(data_id)]]$run, ...get_n_samples, type = "valid_samples")
+
+    section_table[main_data_id == data_id, ":="(
+      "min_training_instances" = min(n_run_training_samples),
+      "max_training_instances" = max(n_run_training_samples),
+      "min_validation_instances" = min(n_run_validation_samples),
+      "max_validation_instances" = max(n_run_validation_samples)
+    )]
   }
 
   return(section_table)
diff --git a/R/Familiar.R b/R/Familiar.R
index 2f724fea..ca615923 100644
--- a/R/Familiar.R
+++ b/R/Familiar.R
@@ -414,7 +414,7 @@ summon_familiar <- function(
 
   # Select and sort unique tasks.
   tasks <- .sort_tasks(tasks)
-browser()
+
   # Pre-processing -------------------------------------------------------------
   .run_preprocessing(
     cl = cl,
diff --git a/R/FamiliarS4Classes.R b/R/FamiliarS4Classes.R
index e26460cb..a744c6d7 100644
--- a/R/FamiliarS4Classes.R
+++ b/R/FamiliarS4Classes.R
@@ -642,20 +642,16 @@ setClass("familiarCollection",
 #' @slot outcome_info Outcome information object, which contains additional
 #'   information concerning the outcome, such as class levels.
 #' @slot data_column_info Object containing column information.
-#' @slot delay_loading logical. Allows delayed loading data, which enables data
-#'   parsing downstream without additional workflow complexity or memory
-#'   utilisation.
-#' @slot perturb_level numeric. This is the perturbation level for data which
-#'   has not been loaded. Used for data retrieval by interacting with the run
-#'   table of the accompanying model.
-#' @slot load_validation logical. This determines which internal data set will
-#'   be loaded. If TRUE, the validation data will be loaded, whereas FALSE loads
-#'   the development data.
-#' @slot aggregate_on_load logical. Determines whether data is aggregated after
-#'   loading.
-#' @slot sample_set_on_load NULL or vector of sample identifiers to be loaded.
-#'
-setClass("dataObject",
+#' @slot data_id Data identifier for dataset. Set using internal routines if the
+#'   `dataObject` was created from a `delayedDataObject`.
+#' @slot run_id Run identifier for dataset. Set using internal routines if the
+#'   `dataObject` was created from a `delayedDataObject`.
+#' @slot validation Identifies if validation or development samples were loaded.
+#'   Set using internal routines if the `dataObject` was created from a
+#'   `delayedDataObject`.
+#' @slot sample_seed Seed used for creating a bootstrap of the data.
+setClass(
+  "dataObject",
   slots = list(
     # Data
     data = "ANY",
@@ -667,29 +663,81 @@ setClass("dataObject",
     outcome_info = "ANY",
     # Info related to the columns in the dataset.
     data_column_info = "ANY",
-    # Flag for delayed loading. This can only be meaningfully set using internal
-    # data.
-    delay_loading = "logical",
-    # Perturbation level for data which has not been loaded. Used for data
-    # retrieval in combination with the run table of the accompanying model.
-    perturb_level = "numeric",
-    # Determines which data should be loaded.
-    load_validation = "logical",
-    # Flag for aggregation after loading and pre-processing
-    aggregate_on_load = "logical",
-    # Samples to be loaded
-    sample_set_on_load = "ANY"
+    # Data id
+    data_id = "integer",
+    # Run id
+    run_id = "integer",
+    # Validation marker.
+    validation = "logical",
+    # Sample seed
+    sample_seed = "integer"
   ),
   prototype = list(
     data = NULL,
     preprocessing_level = "none",
     outcome_type = NA_character_,
     outcome_info = NULL,
-    delay_loading = FALSE,
-    perturb_level = NA_integer_,
-    load_validation = TRUE,
-    aggregate_on_load = FALSE,
-    sample_set_on_load = NULL
+    data_column_info = NULL,
+    data_id = NA_integer_,
+    run_id = NA_integer_,
+    validation = NA,
+    sample_seed = NA_integer_
   )
 )
+
+
+
+# delayedDataObject object -----------------------------------------------------
+
+#' Data object with delayed loading
+#'
+#' The delayed loading object provides an interface to the backend data. This
+#' data object is typically used within the evaluation pipeline to load data
+#' when needed.
+#'
+#' @slot data NULL or data table containing the data. If present (not `NULL`),
+#'   data is considered loaded.
+#' @slot preprocessing_level character indicating the level of pre-processing
+#'   already conducted. `"none"` by default.
+#' @slot outcome_type character, determines the outcome type.
+#' @slot outcome_info Outcome information object, which contains additional
+#'   information concerning the outcome, such as class levels.
+#' @slot data_column_info Object containing column information.
+#' @slot data_id integer. Defines the data_id of the dataset that should be
+#'   loaded.
+#' @slot run_id integer. Defines the run_id of the dataset that should be loaded.
+#'   Together with data_id and validation, run_id allows for looking up the
+#'   sample set.
+#' @slot validation logical. This determines which internal data set will be
+#'   loaded. If TRUE, the validation data will be loaded, whereas FALSE loads
+#'   the development data.
+#' @slot aggregate_on_load logical. Determines whether data is aggregated after
+#'   loading.
+#' @slot sample_set_on_load NULL or vector of sample identifiers to be loaded.
+#'   Overrides any `sample_seed` that may have been provided.
+#' @slot defer_to_model_data_and_run_id logical. Determines whether the provided
+#'   data_id and run_id should be used (`FALSE`), or data_id and run_id of a
+#'   model (`TRUE`).
+setClass(
+  "delayedDataObject",
+  contains = "dataObject",
+  slots = list(
+    # Determines if validation or development data should be loaded.
+    validation = "logical",
+    # Flag for aggregation after loading and pre-processing
+    aggregate_on_load = "logical",
+    # Samples to be loaded.
+    sample_set_on_load = "ANY",
+    # Flag for deferring loading of data depending on data_id and run_id of
+    # models. Used to ensure that development data and internal validation data
+    # are correctly handled. Overrides any data_id and run_id that may have been
+    # provided.
+    defer_to_model_data_and_run_id = "logical"
+  ),
+  prototype = list(
+    aggregate_on_load = NA,
+    sample_set_on_load = NULL,
+    defer_to_model_data_and_run_id = NA
+  )
+)
 
diff --git a/R/TaskEvaluate.R b/R/TaskEvaluate.R
index b0c13f48..b03b231a 100644
--- a/R/TaskEvaluate.R
+++ b/R/TaskEvaluate.R
@@ -132,6 +132,8 @@ setClass(
     "validation" = "logical",
     "ensemble_data_id" = "integer",
     "ensemble_run_id" = "integer",
+    "get_predictions_at_model_level" = "logical",
+    "force_ensemble_detail_level" = "logical",
     "vimp_method" = "character",
     "learner" = "character",
     "data_set_name" = "character",
@@ -139,10 +141,23 @@
   ),
   prototype = methods::prototype(
     validation = NA,
-    # Whereas data_id describes where the data comes from, the ensemble_data_id
-    # describes where ensembles are formed.
+    # Whereas data_id describes where the overall data comes from, the
+    # ensemble_data_id describes where ensembles are formed.
     ensemble_data_id = NA_integer_,
     ensemble_run_id = NA_integer_,
+    # This parameter determines which data is used for generating predictions.
+    # If FALSE, data will be obtained using data_id and run_id of the task. If
+    # TRUE, data will be obtained using the data_id and run_id associated with
+    # each model. In practice, for internal runs this will be set to TRUE except
+    # for external validation data, whereas for external tasks (e.g. on existing
+    # models) this will be FALSE, and the provided data will be used directly.
+    get_predictions_at_model_level = FALSE,
+    # If individual models do not have sufficient data to perform hybrid
+    # analysis (each model is used to compute part of the bootstraps when
+    # computing confidence intervals) for an evaluation step, that evaluation
+    # step may fail, even though over all models, sufficient data are present.
+    # In that case, we need to force an ensemble detail level.
+    force_ensemble_detail_level = FALSE,
     vimp_method = NA_character_,
     learner = NA_character_,
     data_set_name = NA_character_,
@@ -200,6 +215,54 @@ setMethod(
 
 
 
+# .perform_task (evaluation task, NULL) --------------------------------------------
+setMethod(
+  ".perform_task",
+  signature(
+    object = "familiarTaskEvaluate",
+    data = "NULL"
+  ),
+  function(
+    object,
+    data,
+    experiment_data = NULL,
+    outcome_info = NULL,
+    ...
+  ) {
+    # This method is called when "data" is expected to be available somewhere in
+    # the backend.
+
+    if (is.null(experiment_data)) {
+      ..error_reached_unreachable_code("experiment_data is required for retrieving data from the backend.")
+    }
+    if (is.null(outcome_info)) {
+      ..error_reached_unreachable_code("outcome_info is required.")
+    }
+
+    data <- methods::new(
+      "delayedDataObject",
+      data = NULL,
+      preprocessing_level = "none",
+      outcome_type = outcome_info@outcome_type,
+      outcome_info = outcome_info,
+      validation = object@validation,
+      aggregate_on_load = FALSE,
+      sample_set_on_load = NULL
+    )
+
+    # Pass to method that dispatches with dataObject for further processing.
+    return(.perform_task(
+      object = object,
+      data = data,
+      experiment_data = experiment_data,
+      ...
+    ))
+  }
+)
+
+
+
+
 # .perform_task (evaluation task, dataObject) ----------------------------------
 setMethod(
   ".perform_task",
@@ -237,7 +300,41 @@ setMethod(
 
     # Check which detail level should be provided based on the number of
     # available instances for each model.
-    browser()
+    fam_ensemble <- methods::new(
+      "familiarEnsemble",
+      model_list = as.list(object@model_files),
+      learner = object@learner,
+      vimp_method = object@vimp_method
+    )
+
+    # Add package version.
+    fam_ensemble <- add_package_version(object = fam_ensemble)
+
+    # Load models and prevent auto-detaching.
+    fam_ensemble <- load_models(
+      object = fam_ensemble,
+      suppress_auto_detach = TRUE
+    )
+
+    # Create a run table
+    fam_ensemble@run_table <- list(
+      "run_table" = lapply(
+        fam_ensemble@model_list,
+        function(fam_model) fam_model@run_table
+      ),
+      "ensemble_data_id" = object@ensemble_data_id,
+      "ensemble_run_id" = object@ensemble_run_id
+    )
+
+    # Complete the ensemble using information provided by the model
+    fam_ensemble <- complete_familiar_ensemble(object = fam_ensemble)
+
+    # Set evaluation level.
+    if (object@force_ensemble_detail_level) {
+      detail_level <- "ensemble"
+    } else {
+      detail_level <- settings$eval$detail_level
+    }
 
     # Compute evaluation data.
     evaluation_data <- extract_data(
@@ -344,20 +441,56 @@ setMethod(
 
 
     # evaluation tasks -----------------------------------------------------------
+    data_file_names <- NULL
+
+    n_min_model_instances <- Inf
+    evaluate_external_validation <- evaluate_internal_validation <- evaluate_development <- FALSE
+
+    # External validation: the top level has associated validation data.
+    if (experiment_data@experiment_setup[main_data_id == 1L]$max_validation_instances > 0L) {
+      evaluate_external_validation <- TRUE
+      n_min_model_instances <- min(
+        c(experiment_data@experiment_setup[main_data_id == 1L]$max_validation_instances),
+        n_min_model_instances
+      )
+    }
+
+    # Internal validation: the lowest model ensembling level has associated
+    # validation data.
+    if (experiment_data@experiment_setup[main_data_id == internal_validation_data_id]$max_validation_instances > 0L) {
+      evaluate_internal_validation <- TRUE
+      n_min_model_instances <- min(
+        c(experiment_data@experiment_setup[main_data_id == internal_validation_data_id]$max_validation_instances),
+        n_min_model_instances
+      )
+    }
+
+    # Development data: the lowest model ensembling level has associated
+    # development data. NOTE: this should always be the case.
+    if (experiment_data@experiment_setup[main_data_id == internal_validation_data_id]$max_training_instances > 0L) {
+      evaluate_development <- TRUE
+      n_min_model_instances <- min(
+        c(experiment_data@experiment_setup[main_data_id == internal_validation_data_id]$max_training_instances),
+        n_min_model_instances
+      )
+    }
+
+    # Check model instances and determine if we need to force ensemble detail
+    # level for evaluation.
+    force_ensemble_detail_level <- n_min_model_instances < 10L
+
     # Use collection tasks to set up the evaluation tasks, including for internal
     # validation.
     evaluate_task_list <- list()
     ii <- 1L
+
     for (jj in seq_along(collect_task_list)) {
-      data_file_names <- NULL
-
       for (learner in learners) {
         for (vimp_method in vimp_methods) {
 
           ## external validation -------------------------------------------------
 
-          # External validation: the top level has associated validation data.
-          if (experiment_data@experiment_setup[main_data_id == 1L]$max_validation_instances > 0L) {
+          if (evaluate_external_validation) {
             # Initialise task.
             evaluate_task <- methods::new(
               "familiarTaskEvaluate",
@@ -366,6 +499,8 @@ setMethod(
               validation = TRUE,
               ensemble_data_id = collect_task_list[[jj]]@data_id,
               ensemble_run_id = collect_task_list[[jj]]@run_id,
+              get_predictions_at_model_level = FALSE,
+              force_ensemble_detail_level = force_ensemble_detail_level,
               learner = learner,
               vimp_method = vimp_method,
               data_set_name = "external_validation",
@@ -387,8 +522,6 @@ setMethod(
 
 
           # internal validation --------------------------------------------------
-          # Internal validation: the ensembling level has associated validation
-          # data.
           if (experiment_data@experiment_setup[main_data_id == internal_validation_data_id]$max_validation_instances > 0L) {
             # Initialise task.
             evaluate_task <- methods::new(
@@ -398,6 +531,8 @@ setMethod(
               validation = TRUE,
               ensemble_data_id = collect_task_list[[jj]]@data_id,
               ensemble_run_id = collect_task_list[[jj]]@run_id,
+              get_predictions_at_model_level = TRUE,
+              force_ensemble_detail_level = force_ensemble_detail_level,
               learner = learner,
               vimp_method = vimp_method,
               data_set_name = "internal_validation",
@@ -418,31 +553,39 @@ setMethod(
           }
 
           # development ----------------------------------------------------------
-          # Development at the ensembling level.
-          evaluate_task <- methods::new(
-            "familiarTaskEvaluate",
-            data_id = collect_task_list[[jj]]@data_id,
-            run_id = collect_task_list[[jj]]@run_id,
-            validation = FALSE,
-            ensemble_data_id = collect_task_list[[jj]]@data_id,
-            ensemble_run_id = collect_task_list[[jj]]@run_id,
-            learner = learner,
-            vimp_method = vimp_method,
-            data_set_name = "development",
-            project_id = experiment_data@project_id
-          )
-
-          # Set file name.
-          evaluate_task <-.set_file_name(
-            object = evaluate_task,
-            file_paths = file_paths
-          )
-          # Make task and associated file names available.
-          data_file_names <- c(data_file_names, evaluate_task@file)
-          evaluate_task_list[[ii]] <- evaluate_task
-
-          ii <- ii + 1L
+          if (evaluate_development) {
+            # Initialise task.
+            evaluate_task <- methods::new(
+              "familiarTaskEvaluate",
+              data_id = collect_task_list[[jj]]@data_id,
+              run_id = collect_task_list[[jj]]@run_id,
+              validation = FALSE,
+              ensemble_data_id = collect_task_list[[jj]]@data_id,
+              ensemble_run_id = collect_task_list[[jj]]@run_id,
+              get_predictions_at_model_level = TRUE,
+              force_ensemble_detail_level = force_ensemble_detail_level,
+              learner = learner,
+              vimp_method = vimp_method,
+              data_set_name = "development",
+              project_id = experiment_data@project_id
+            )
+
+            # Set file name.
+            evaluate_task <- .set_file_name(
+              object = evaluate_task,
+              file_paths = file_paths
+            )
+
+            # Make task and associated file names available.
+            data_file_names <- c(data_file_names, evaluate_task@file)
+            evaluate_task_list[[ii]] <- evaluate_task
+
+            ii <- ii + 1L
+
+          } else {
+            ..error_reached_unreachable_code("development data are always present.")
+          }
         }
       }
 
 
@@ -475,7 +618,6 @@ setMethod(
     # Iterate over evaluation tasks and add corresponding models based on ensemble
     # data id and run id.
     for (ii in seq_along(evaluate_task_list)) {
-
       # Select run tables where the ensemble data and run identifiers appear.
       selected_run_tables <- run_tables[sapply(
         run_tables,
diff --git a/man/dataObject-class.Rd b/man/dataObject-class.Rd
index be446f2f..b80e63c1 100644
--- a/man/dataObject-class.Rd
+++ b/man/dataObject-class.Rd
@@ -25,21 +25,16 @@ information concerning the outcome, such as class levels.}
 
 \item{\code{data_column_info}}{Object containing column information.}
 
-\item{\code{delay_loading}}{logical. Allows delayed loading data, which enables data
-parsing downstream without additional workflow complexity or memory
-utilisation.}
+\item{\code{data_id}}{Data identifier for dataset. Set using internal routines if the
+\code{dataObject} was created from a \code{delayedDataObject}.}
 
-\item{\code{perturb_level}}{numeric. This is the perturbation level for data which
-has not been loaded. Used for data retrieval by interacting with the run
-table of the accompanying model.}
+\item{\code{run_id}}{Run identifier for dataset. Set using internal routines if the
+\code{dataObject} was created from a \code{delayedDataObject}.}
 
-\item{\code{load_validation}}{logical. This determines which internal data set will
-be loaded. If TRUE, the validation data will be loaded, whereas FALSE loads
-the development data.}
+\item{\code{validation}}{Identifies if validation or development samples were loaded.
+Set using internal routines if the \code{dataObject} was created from a
+\code{delayedDataObject}.}
 
-\item{\code{aggregate_on_load}}{logical. Determines whether data is aggregated after
-loading.}
-
-\item{\code{sample_set_on_load}}{NULL or vector of sample identifiers to be loaded.}
+\item{\code{sample_seed}}{Seed used for creating a bootstrap of the data.}
 }}
 
diff --git a/man/delayedDataObject-class.Rd b/man/delayedDataObject-class.Rd
new file mode 100644
index 00000000..988685f7
--- /dev/null
+++ b/man/delayedDataObject-class.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{delayedDataObject-class}
+\alias{delayedDataObject-class}
+\title{Data object with delayed loading}
+\description{
+The delayed loading object provides an interface to the backend data. This
+data object is typically used within the evaluation pipeline to load data
+when needed.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{data}}{NULL or data table containing the data. If present (not \code{NULL}),
+data is considered loaded.}
+
+\item{\code{preprocessing_level}}{character indicating the level of pre-processing
+already conducted. \code{"none"} by default.}
+
+\item{\code{outcome_type}}{character, determines the outcome type.}
+
+\item{\code{outcome_info}}{Outcome information object, which contains additional
+information concerning the outcome, such as class levels.}
+
+\item{\code{data_column_info}}{Object containing column information.}
+
+\item{\code{data_id}}{integer. Defines the data_id of the dataset that should be
+loaded.}
+
+\item{\code{run_id}}{integer. Defines the run_id of the dataset that should be loaded.
+Together with data_id and validation, run_id allows for looking up the
+sample set.}
+
+\item{\code{validation}}{logical. This determines which internal data set will be
+loaded. If TRUE, the validation data will be loaded, whereas FALSE loads
+the development data.}
+
+\item{\code{aggregate_on_load}}{logical. Determines whether data is aggregated after
+loading.}
+
+\item{\code{sample_set_on_load}}{NULL or vector of sample identifiers to be loaded.
+Overrides any \code{sample_seed} that may have been provided.}
+
+\item{\code{defer_to_model_data_and_run_id}}{logical. Determines whether the provided
+data_id and run_id should be used (\code{FALSE}), or data_id and run_id of a
+model (\code{TRUE}).}
+}}
+
diff --git a/tests/testthat/test-task_based_workflow.R b/tests/testthat/test-task_based_workflow.R
index ec05bfd5..b75dfbeb 100644
--- a/tests/testthat/test-task_based_workflow.R
+++ b/tests/testthat/test-task_based_workflow.R
@@ -173,9 +173,11 @@ testthat::test_that("all models are present", {
 
 # Including evaluation ---------------------------------------------------------
 
+data <- familiar:::test_create_small_good_data("binomial")
+
 results <- familiar::summon_familiar(
   data = data,
-  experimental_design = "bs(fs,3)+bs(mb,3)",
+  experimental_design = "bs(fs,3)+bs(mb, 3)",
   vimp_method = "mim",
   learner = "glm_logistic",
   evaluate_top_level_only = FALSE,