diff --git a/DESCRIPTION b/DESCRIPTION
index 7d7bfe60..4c8608c5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -199,6 +199,7 @@ Collate:
     'RankStabilityAggregation.R'
     'SocketServer.R'
     'StringUtilities.R'
+    'TaskEvaluate.R'
     'TaskFeatureInfo.R'
     'TaskLearn.R'
     'TaskLearnerHyperparameters.R'
diff --git a/NEWS.md b/NEWS.md
index 123128a5..359eb367 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,8 +2,12 @@
 
 ## Breaking changes
 
-- Naming and on-disk location of variable importance tables and models has
-  changed. These are no longer nested to limit path lengths.
+- Naming and on-disk location of variable importance tables, models, evaluated
+  datasets, and collections have changed. These are no longer nested, which
+  reduces path lengths and avoids issues caused by overly long paths,
+  particularly on Windows.
+
+- Ensembles are no longer explicitly stored, but are formed at run time.
 
 ## Major changes
 
@@ -37,6 +41,13 @@
 
 - `parallel_feature_selection` was renamed to `parallel_vimp`.
 
+- It is now possible to build models without explicitly defining a variable
+  importance (feature selection) step. For example,
+  `experimental_design = mb + ev` is now valid and results in training a
+  single model on the development dataset, with subsequent evaluation on an
+  external dataset. This is realised using variable importance data obtained
+  during hyperparameter optimisation.
+
 ## Minor changes
 
 - The `iteration_seed` configuration parameter was added to provide a fixed seed
@@ -73,7 +84,7 @@
   lead to too few samples to allow for assessment. This affected
   Leave-One-Out-Cross-Validation (LOOCV) schemes in particular.
 
-## Bug fixes
+## Fixes
 
 - Fixed errors when creating feature or similarity plots caused by sample or
   feature names matching internal column names.
@@ -83,6 +94,9 @@
 - Variable importance methods and outcome information objects were missing a
   familiar version attribute, which has now been added to ensure future
   compatibility.
+
+- Some vignettes referred to `experiment_design` where `experimental_design` was
+  intended.
 
 # Version 1.5.0 (Whole Whale)
 
diff --git a/R/ExperimentSetup.R b/R/ExperimentSetup.R
index abff3a07..5721cfc0 100644
--- a/R/ExperimentSetup.R
+++ b/R/ExperimentSetup.R
@@ -201,6 +201,18 @@ extract_experimental_setup <- function(
     section_table[main_data_id == data_id, "n_runs" := length(iteration_list[[as.character(data_id)]]$run)]
   }
 
+  # Set the (max) number of available validation instances.
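+  # A value of zero means that no validation samples exist at that level; this
+  # is used downstream to decide whether validation-based evaluation tasks can
+  # be created.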
+  for (data_id in section_table$main_data_id) {
+    section_table[main_data_id == data_id, "max_validation_instances" := max(sapply(
+      iteration_list[[as.character(data_id)]]$run,
+      function(x) {
+        if (is_empty(x$valid_samples)) return(0L)
+
+        return(nrow(x$valid_samples))
+      }
+    ))]
+  }
+
   return(section_table)
 }
 
diff --git a/R/Familiar.R b/R/Familiar.R
index 0835dbed..e338d23d 100644
--- a/R/Familiar.R
+++ b/R/Familiar.R
@@ -407,6 +407,7 @@ summon_familiar <- function(
       optimisation_determine_vimp = settings$hpo$hpo_determine_vimp,
       vimp_methods = settings$vimp$vimp_methods,
       learners = settings$mb$learners,
+      pool_only = settings$eval$pool_only,
       file_paths = file_paths
     )
   }
diff --git a/R/TaskEvaluate.R b/R/TaskEvaluate.R
index e6a43d60..50107d92 100644
--- a/R/TaskEvaluate.R
+++ b/R/TaskEvaluate.R
@@ -16,6 +16,124 @@ setClass(
 )
 
 
+# .set_file_name (collection task) ---------------------------------------------
+setMethod(
+  ".set_file_name",
+  signature(object = "familiarTaskCollect"),
+  function(object, file_paths = NULL) {
+    if (is.null(file_paths)) return(object)
+
+    name <- NULL
+    if (object@data_id == 1L && object@run_id == 1L) {
+      name <- "pooled"
+    }
+
+    # Generate the file name of the collection.
+    object@file <- get_object_file_name(
+      object_type = "familiarCollection",
+      data_id = object@data_id,
+      run_id = object@run_id,
+      name = name,
+      project_id = object@project_id,
+      dir_path = file_paths$fam_coll_dir
+    )
+
+    return(object)
+  }
+)
+
+
+
+# .get_task_descriptor (collection task) ---------------------------------------
+setMethod(
+  ".get_task_descriptor",
+  signature(object = "familiarTaskCollect"),
+  function(object, ...) {
+    return(paste0(
+      object@task_name, "_",
+      object@data_id, "_",
+      object@run_id
+    ))
+  }
+)
+
+
+
+
+
+# familiarTaskEvaluate ---------------------------------------------------------
+setClass(
+  "familiarTaskEvaluate",
+  contains = "familiarTask",
+  slots = list(
+    "validation" = "logical",
+    "ensemble_data_id" = "integer",
+    "ensemble_run_id" = "integer",
+    "vimp_method" = "character",
+    "learner" = "character",
+    "data_set_name" = "character"
+  ),
+  prototype = methods::prototype(
+    validation = NA,
+    # Whereas data_id describes where the data comes from, ensemble_data_id
+    # describes where ensembles are formed.
+    ensemble_data_id = NA_integer_,
+    ensemble_run_id = NA_integer_,
+    vimp_method = NA_character_,
+    learner = NA_character_,
+    data_set_name = NA_character_,
+    task_name = "evaluate"
+  )
+)
+
+
+
+# .set_file_name (evaluation task) ----------------------------------------------
+setMethod(
+  ".set_file_name",
+  signature(object = "familiarTaskEvaluate"),
+  function(object, file_paths = NULL) {
+    if (is.null(file_paths)) return(object)
+
+    # Generate the file name of the evaluated data set.
+    object@file <- get_object_file_name(
+      object_type = "familiarData",
+      data_id = object@data_id,
+      run_id = object@run_id,
+      learner = object@learner,
+      vimp_method = object@vimp_method,
+      ensemble_data_id = object@ensemble_data_id,
+      ensemble_run_id = object@ensemble_run_id,
+      name = object@data_set_name,
+      project_id = object@project_id,
+      dir_path = file_paths$fam_data_dir
+    )
+
+    return(object)
+  }
+)
+
+
+
+# .get_task_descriptor (evaluation task) ---------------------------------------
+setMethod(
+  ".get_task_descriptor",
+  signature(object = "familiarTaskEvaluate"),
+  function(object, ...) {
+    return(paste0(
+      object@task_name, "_",
+      object@data_id, "_",
+      object@run_id, "_",
+      object@vimp_method, "_",
+      object@learner, "_",
+      object@ensemble_data_id, "_",
+      object@ensemble_run_id, "_",
+      object@data_set_name
+    ))
+  }
+)
 
 
 .generate_evaluation_tasks <- function(
   experiment_data,
@@ -42,50 +160,182 @@ setClass(
     )
   )
 
+  # Find the data_id related to ensembling of models.
+  train_data_id <- experiment_data@experiment_setup[train == TRUE, ]$main_data_id[1L]
+  if (is_empty(train_data_id)) return(NULL)
+
+  # Determine which parts of the experiment can be used for internal validation.
+  run_table <- .get_run_table_from_experiment_setup(
+    data_id = train_data_id,
+    experiment_setup = experiment_data@experiment_setup
+  )
+
+  # Internal validation can exist at the last level where pre-processing for
+  # training is allowed.
+  internal_validation_data_id <- tail(run_table[main_data_id <= train_data_id & can_pre_process == TRUE], n = 1L)$main_data_id[1L]
+
   if (!pool_only) {
     # Determine the collections at the last experimental level that can
     # pre-process and is part of the model-building branch.
+    collection_run_ids <- seq_len(run_table[main_data_id == internal_validation_data_id]$n_runs[1L])
 
-    # Find the data_id related to training.
-    train_data_id <- experiment_data@experiment_setup[train == TRUE, ]$main_data_id[1L]
-    if (is_empty(train_data_id)) return(NULL)
-
-    # Determine which parts of the experimental setup are used by training.
-    run_table <- .get_run_table_from_experiment_setup(
-      data_id = train_data_id,
-      experiment_setup = experiment_data@experiment_setup
-    )
-
-    # Find the data_id and run_ids for preprocessing.
-    collection_data_id <- tail(run_table[main_data_id <= train_data_id & can_pre_process == TRUE], n = 1L)$main_data_id[1L]
-    collection_run_ids <- seq_len(run_table[main_data_id == collection_data_id]$n_runs[1L])
-
+    ii <- 2L
     for (run_id in collection_run_ids) {
-      collect_task_list <- c(
-        collect_task_list,
-        methods::new(
-          "familiarTaskCollect",
-          data_id = collection_data_id,
-          run_id = run_id,
-          project_id = experiment_data@project_id
-        )
+      collect_task_list[[ii]] <- methods::new(
+        "familiarTaskCollect",
+        data_id = internal_validation_data_id,
+        run_id = run_id,
+        project_id = experiment_data@project_id
       )
+
+      ii <- ii + 1L
     }
   }
 
   # evaluation tasks -----------------------------------------------------------
 
-  # Use collection tasks to set up the evaluation tasks, including for internal validation.
+  # Use collection tasks to set up the evaluation tasks, including for internal
+  # validation.
+  evaluate_task_list <- list()
+  ii <- 1L
+  for (jj in seq_along(collect_task_list)) {
+    data_file_names <- NULL
+
+    for (learner in learners) {
+      for (vimp_method in vimp_methods) {
+
+        ## external validation -------------------------------------------------
+
+        # External validation: the top level has associated validation data.
+        if (experiment_data@experiment_setup[main_data_id == 1L]$max_validation_instances > 0L) {
+          # Initialise the task.
+          evaluate_task <- methods::new(
+            "familiarTaskEvaluate",
+            data_id = 1L,
+            run_id = 1L,
+            validation = TRUE,
+            ensemble_data_id = collect_task_list[[jj]]@data_id,
+            ensemble_run_id = collect_task_list[[jj]]@run_id,
+            learner = learner,
+            vimp_method = vimp_method,
+            data_set_name = "external_validation",
+            project_id = experiment_data@project_id
+          )
+
+          # Set the file name.
+          evaluate_task <- .set_file_name(
+            object = evaluate_task,
+            file_paths = file_paths
+          )
+
+          # Make the task and associated file names available.
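+          # These file names are attached to the parent collection task below.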
+          data_file_names <- c(data_file_names, evaluate_task@file)
+          evaluate_task_list[[ii]] <- evaluate_task
+
+          ii <- ii + 1L
+        }
+
+        ## internal validation -------------------------------------------------
+
+        # Internal validation: the ensembling level has associated validation
+        # data.
+        if (experiment_data@experiment_setup[main_data_id == internal_validation_data_id]$max_validation_instances > 0L) {
+          # Initialise the task.
+          evaluate_task <- methods::new(
+            "familiarTaskEvaluate",
+            data_id = collect_task_list[[jj]]@data_id,
+            run_id = collect_task_list[[jj]]@run_id,
+            validation = TRUE,
+            ensemble_data_id = collect_task_list[[jj]]@data_id,
+            ensemble_run_id = collect_task_list[[jj]]@run_id,
+            learner = learner,
+            vimp_method = vimp_method,
+            data_set_name = "internal_validation",
+            project_id = experiment_data@project_id
+          )
+
+          # Set the file name.
+          evaluate_task <- .set_file_name(
+            object = evaluate_task,
+            file_paths = file_paths
+          )
+
+          # Make the task and associated file names available.
+          data_file_names <- c(data_file_names, evaluate_task@file)
+          evaluate_task_list[[ii]] <- evaluate_task
+
+          ii <- ii + 1L
+        }
+
+        ## development ----------------------------------------------------------
+
+        # Development data at the ensembling level.
+        evaluate_task <- methods::new(
+          "familiarTaskEvaluate",
+          data_id = collect_task_list[[jj]]@data_id,
+          run_id = collect_task_list[[jj]]@run_id,
+          validation = FALSE,
+          ensemble_data_id = collect_task_list[[jj]]@data_id,
+          ensemble_run_id = collect_task_list[[jj]]@run_id,
+          learner = learner,
+          vimp_method = vimp_method,
+          data_set_name = "development",
+          project_id = experiment_data@project_id
+        )
+
+        # Set the file name.
+        evaluate_task <- .set_file_name(
+          object = evaluate_task,
+          file_paths = file_paths
+        )
+
+        # Make the task and associated file names available.
+        data_file_names <- c(data_file_names, evaluate_task@file)
+        evaluate_task_list[[ii]] <- evaluate_task
+
+        ii <- ii + 1L
+      }
+    }
+
+    # Update the collection task: attach the file paths of the underlying data
+    # files and set the collection's own file name.
+    collect_task_list[[jj]]@data_file <- data_file_names
+    collect_task_list[[jj]] <- .set_file_name(
+      object = collect_task_list[[jj]],
+      file_paths = file_paths
+    )
+  }
+
   # ensembles ------------------------------------------------------------------
+
+  # Obtain run tables related to models.
+  run_tables <- .collect_run_tables(iteration_list = experiment_data@iteration_list)
+  run_tables <- run_tables[sapply(
+    run_tables,
+    function(x, data_id) {
+      return(tail(x, n = 1L)$data_id == data_id)
+    },
+    data_id = train_data_id
+  )]
+
+  # Iterate over evaluation tasks and add corresponding models based on the
+  # ensemble data id and run id.
+  for (ii in seq_along(evaluate_task_list)) {
+    ensemble_data_id <- evaluate_task_list[[ii]]@ensemble_data_id
+    ensemble_run_id <- evaluate_task_list[[ii]]@ensemble_run_id
+  }
+
   # train and variable importance tasks ----------------------------------------
 
-  task_list <- c(
-    task_list,
-    .generate_learner_tasks(
-      experiment_data = experiment_data,
-      vimp_methods = vimp_methods,
-      file_paths = file_paths,
-      skip_existing = skip_existing,
-      ...
-    )
+  task_list <- .generate_trainer_tasks(
+    experiment_data = experiment_data,
+    vimp_methods = vimp_methods,
+    file_paths = file_paths,
+    skip_existing = skip_existing,
+    ...
   )
 }
 
diff --git a/R/TaskLearn.R b/R/TaskLearn.R
index 06aafc2b..970e1880 100644
--- a/R/TaskLearn.R
+++ b/R/TaskLearn.R
@@ -35,7 +35,7 @@ setMethod(
   function(object, file_paths = NULL) {
     if (is.null(file_paths)) return(object)
 
-    # Generate file name of variable importance table
+    # Generate file name of the model.
     object@file <- get_object_file_name(
       object_type = "familiarModel",
       data_id = object@data_id,
diff --git a/R/TaskMain.R b/R/TaskMain.R
index 4b6f4197..5755faad 100644
--- a/R/TaskMain.R
+++ b/R/TaskMain.R
@@ -229,7 +229,6 @@ setMethod(
 
 
 
-
 .generate_learner_data_preprocessing_tasks <- function(
   experiment_data,
   file_paths
@@ -300,15 +299,6 @@ setMethod(
 
 
 
-.generate_evaluation_tasks <- function(
-  file_paths,
-  project_id
-) {
-
-}
-
-
-
 .sort_tasks <- function(task_list) {
   # Select unique tasks.
   duplicate_tasks <- duplicated(sapply(task_list, FUN = .get_task_descriptor))
@@ -324,7 +314,6 @@ setMethod(
     "vimp" = task_list[task_class == "familiarTaskVimp"],
     "hyperparameters_learner" = task_list[task_class == "familiarTaskLearnerHyperparameters"],
     "train" = task_list[task_class == "familiarTaskTrain"],
-    "ensemble" = task_list[task_class == "familiarTaskEnsemble"],
     "evaluate" = task_list[task_class == "familiarTaskEvaluate"],
     "collect" = task_list[task_class == "familiarTaskCollect"]
   )
diff --git a/R/Utilities.R b/R/Utilities.R
index 18ddf733..5f52c06f 100644
--- a/R/Utilities.R
+++ b/R/Utilities.R
@@ -757,11 +757,10 @@ get_object_file_name <- function(
     run_id,
     learner = NULL,
     vimp_method = NULL,
-    pool_data_id = NULL,
-    pool_run_id = NULL,
+    ensemble_data_id = NULL,
+    ensemble_run_id = NULL,
+    name = NULL,
     object_type,
-    is_ensemble = NULL,
-    is_validation = NULL,
     with_extension = TRUE,
     dir_path = NULL
 ) {
@@ -782,29 +781,13 @@ get_object_file_name <- function(
       "_model"
     )
 
-  } else if (object_type == "familiarEnsemble") {
-    # For familiarEnsemble objects
-
-    if (is.null(learner) || is.null(vimp_method) || is.null(project_id) || is.null(is_ensemble)) {
-      ..error_reached_unreachable_code("missing arguments")
-    }
-
-    output_str <- paste0(
-      project_id, "_",
-      learner, "_",
-      vimp_method, "_",
-      data_id, "_",
-      run_id, "_",
-      ifelse(is_ensemble, "ensemble", "pool")
-    )
-
   } else if (object_type == "familiarData") {
     # For familiarData objects
 
     if (
       is.null(learner) || is.null(vimp_method) || is.null(project_id) ||
-      is.null(is_ensemble) || is.null(is_validation) || is.null(pool_data_id) ||
-      is.null(pool_run_id)
+      is.null(data_id) || is.null(run_id) ||
+      is.null(ensemble_data_id) || is.null(ensemble_run_id) || is.null(name)
     ) {
       ..error_reached_unreachable_code("missing arguments")
     }
@@ -814,13 +797,36 @@ get_object_file_name <- function(
       learner, "_",
       vimp_method, "_",
       data_id, "_",
-      run_id, "_",
-      ifelse(is_ensemble, "ensemble", "pool"), "_",
-      pool_data_id, "_",
-      pool_run_id, "_",
-      ifelse(is_validation, "validation", "development"), "_data"
+      run_id, "_ensemble_",
+      ensemble_data_id, "_",
+      ensemble_run_id, "_",
+      name, "_data"
     )
 
+  } else if (object_type == "familiarCollection") {
+    # For familiarCollection objects
+
+    if (
+      is.null(project_id) || ((is.null(data_id) || is.null(run_id)) && is.null(name))
+    ) {
+      ..error_reached_unreachable_code("missing arguments")
+    }
+
+    if (!is.null(name)) {
+      output_str <- paste0(
+        project_id, "_",
+        name,
+        "_collection"
+      )
+
+    } else {
+      output_str <- paste0(
+        project_id, "_",
+        data_id, "_",
+        run_id,
+        "_collection"
+      )
+    }
+
   } else if (object_type == "vimpTable") {
     if (is.null(vimp_method) || is.null(project_id)) {
diff --git a/tests/testthat/test-task_based_workflow.R b/tests/testthat/test-task_based_workflow.R
index 5cd118f4..ec05bfd5 100644
--- a/tests/testthat/test-task_based_workflow.R
+++ b/tests/testthat/test-task_based_workflow.R
@@ -169,3 +169,22 @@ testthat::test_that("all models are present", {
   testthat::expect_true(all(sapply(model, familiar:::model_is_trained)))
   testthat::expect_false(any(sapply(model, function(x) (is.null(x@vimp_table)))))
 })
+
+
+# Including evaluation ---------------------------------------------------------
+
+results <- familiar::summon_familiar(
+  data = data,
+  experimental_design = "bs(fs,3)+bs(mb,3)",
+  vimp_method = "mim",
+  learner = "glm_logistic",
+  evaluate_top_level_only = FALSE,
+  outcome_type = "binomial",
+  outcome_column = "outcome",
+  batch_id_column = "batch_id",
+  sample_id_column = "sample_id",
+  series_id_column = "series_id",
+  class_levels = c("red", "green"),
+  verbose = verbose,
+  parallel = FALSE
+)
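As a usage sketch of the change described in NEWS.md: with `experimental_design = "mb+ev"`, a single model is trained on the development dataset and evaluated on an external dataset, and no explicit `vimp_method` needs to be set, since variable importance is derived from data gathered during hyperparameter optimisation. The call below mirrors the test above; how the external validation cohort is designated (for example, through batch identifiers) is not shown in this diff and is assumed to follow the standard configuration.

# Sketch: model building with external validation, without an explicit
# variable importance (feature selection) step.
results <- familiar::summon_familiar(
  data = data,
  experimental_design = "mb+ev",
  learner = "glm_logistic",
  outcome_type = "binomial",
  outcome_column = "outcome",
  batch_id_column = "batch_id",
  sample_id_column = "sample_id",
  series_id_column = "series_id",
  class_levels = c("red", "green"),
  parallel = FALSE
)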