From 97ebba48b78454e09f3f14d152df10cc5e2643d7 Mon Sep 17 00:00:00 2001 From: Alex Zwanenburg Date: Thu, 5 Dec 2024 17:46:43 +0100 Subject: [PATCH] WIP perform_task for evaluations. --- R/Familiar.R | 18 ++- R/TaskEvaluate.R | 354 +++++++++++++++++++++++++++++++++++++++++++++-- R/TaskLearn.R | 1 - 3 files changed, 360 insertions(+), 13 deletions(-) diff --git a/R/Familiar.R b/R/Familiar.R index e338d23d..2f724fea 100644 --- a/R/Familiar.R +++ b/R/Familiar.R @@ -414,7 +414,7 @@ summon_familiar <- function( # Select and sort unique tasks. tasks <- .sort_tasks(tasks) - +browser() # Pre-processing ------------------------------------------------------------- .run_preprocessing( cl = cl, @@ -503,15 +503,25 @@ summon_familiar <- function( # Explanation and evaluation ------------------------------------------------- - # Start evaluation - run_evaluation( + .run_evaluation( cl = cl, - project_list = project_info, + tasks = tasks, + experiment_data = experiment_data, settings = settings, + outcome_info = outcome_info, file_paths = file_paths, verbose = verbose ) + browser() +# run_evaluation( +# cl = cl, +# project_list = project_info, +# settings = settings, +# file_paths = file_paths, +# verbose = verbose +# ) + if (file_paths$is_temporary) { # Collect all familiarModels, familiarEnsemble, familiarData and # familiarCollection objects. diff --git a/R/TaskEvaluate.R b/R/TaskEvaluate.R index 50107d92..b0c13f48 100644 --- a/R/TaskEvaluate.R +++ b/R/TaskEvaluate.R @@ -16,6 +16,7 @@ setClass( ) + # .set_file_name (collection task) --------------------------------------------- setMethod( ".set_file_name", @@ -59,6 +60,68 @@ setMethod( +# .perform_task (collection task, NULL) ---------------------------------------- +setMethod( + ".perform_task", + signature( + object = "familiarTaskCollect", + data = "NULL" + ), + function( + object, + data, + return_results = TRUE, + ... + ) { + # Process collection. + collection_object <- suppressWarnings( + as_familiar_collection( + object = object@data_file + ) + ) + + if (!is.na(object@file)) { + saveRDS(collection_object, file = object@file) + } + + if (return_results) { + return(collection_object) + } + } +) + + + +# .perform_task (collection task, ANY) ----------------------------------------- +setMethod( + ".perform_task", + signature( + object = "familiarTaskCollect", + data = "ANY" + ), + function( + object, + data, + return_results = TRUE, + ... + ) { + # Process collection. + collection_object <- suppressWarnings( + as_familiar_collection( + object = data + ) + ) + + if (!is.na(object@file)) { + saveRDS(collection_object, file = object@file) + } + + if (return_results) { + return(collection_object) + } + } +) + # familiarTaskEvaluate --------------------------------------------------------- @@ -71,7 +134,8 @@ setClass( "ensemble_run_id" = "integer", "vimp_method" = "character", "learner" = "character", - "data_set_name" = "character" + "data_set_name" = "character", + "model_files" = "character" ), prototype = methods::prototype( validation = NA, @@ -82,6 +146,7 @@ setClass( vimp_method = NA_character_, learner = NA_character_, data_set_name = NA_character_, + model_files = NA_character_, task_name = "evaluate" ) ) @@ -126,7 +191,7 @@ setMethod( object@run_id, "_", object@vimp_method, "_", object@learner, "_", - object@ensemble_data, "_", + object@ensemble_data_id, "_", object@ensemble_run_id, "_", object@data_set_name )) @@ -135,6 +200,90 @@ setMethod( +# .perform_task (evaluation task, dataObject) ---------------------------------- +setMethod( + ".perform_task", + signature( + object = "familiarTaskEvaluate", + data = "dataObject" + ), + function( + object, + data, + settings, + cl = NULL, + message_indent = 0L, + return_results = TRUE, + ... + ) { + + # Signal evaluation start for the current task. + logger_message( + paste0( + "Evaluation: Starting evaluation for the \"", object@learner, + "\" learner and the \"", object@vimp_method, + "\" variable importance method for ", + object@data_set_name, " data (data_id: ", + object@data_id, "; run_id: ", object@run_id, + "). This is task ", + object@task_id, " of ", + object@n_tasks, "." + ), + indent = message_indent, + verbose = verbose + ) + + # Form an ensemble using the associated or provided models. + + # Check which detail level should be provided based on the number of + # available instances for each model. + browser() + + # Compute evaluation data. + evaluation_data <- extract_data( + object = fam_ensemble, + data = data, + cl = cl, + data_element = settings$eval$evaluation_data_elements, + time_max = settings$eval$time_max, + evaluation_times = settings$eval$eval_times, + sample_limit = settings$eval$sample_limit, + detail_level = detail_level, + estimation_type = settings$eval$estimation_type, + aggregate_results = settings$eval$aggregate_results, + aggregation_method = settings$eval$aggregation, + rank_threshold = settings$eval$aggr_rank_threshold, + ensemble_method = settings$eval$ensemble_method, + stratification_method = settings$eval$strat_method, + metric = settings$eval$metric, + feature_cluster_method = settings$eval$feature_cluster_method, + feature_cluster_cut_method = settings$eval$feature_cluster_cut_method, + feature_linkage_method = settings$eval$feature_linkage_method, + feature_similarity_metric = settings$eval$feature_similarity_metric, + feature_similarity_threshold = settings$eval$feature_similarity_threshold, + sample_cluster_method = settings$eval$sample_cluster_method, + sample_linkage_method = settings$eval$sample_linkage_method, + sample_similarity_metric = settings$eval$sample_similarity_metric, + confidence_level = settings$eval$confidence_level, + bootstrap_ci_method = settings$eval$bootstrap_ci_method, + dynamic_model_loading = settings$eval$auto_detach, + icc_type = settings$eval$icc_type, + message_indent = message_indent + 1L, + verbose = verbose + ) + + if (!is.na(object@file)) { + saveRDS(evaluation_data, file = object@file) + } + + if (return_results) { + return(evaluation_data) + } + } +) + + + .generate_evaluation_tasks <- function( experiment_data, vimp_methods, @@ -147,6 +296,7 @@ setMethod( # Suppress NOTES due to non-standard evaluation in data.table train <- can_pre_process <- perturbation_level <- main_data_id <- NULL + data_id <- run_id <- NULL # collection tasks ----------------------------------------------------------- @@ -306,10 +456,13 @@ setMethod( # ensembles ------------------------------------------------------------------ - # Obtain run tables related to models. + # Obtain run tables that directly refer to data on which models were trained. + # First select the data_id related to training. train_data_id <- experiment_data@experiment_setup[train == TRUE, ]$main_data_id[1L] if (is_empty(train_data_id)) return(NULL) + # Then select the run-tables which contain this data_id as their final + # (bottom) level. run_tables <- .collect_run_tables(iteration_list = experiment_data@iteration_list) run_tables <- run_tables[sapply( run_tables, @@ -318,24 +471,209 @@ setMethod( }, data_id = train_data_id )] - browser() - + # Iterate over evaluation tasks and add corresponding models based on ensemble # data id and run id. for (ii in seq_along(evaluate_task_list)) { - ensemble_data_id <- evaluate_task_list[[ii]]@ensemble_data_id - ensemble_run_id <- evaluate_task_list[[ii]]@ensemble_run_id + # Select run tables where the ensemble data and run identifiers appear. + selected_run_tables <- run_tables[sapply( + run_tables, + function(x, ensemble_data_id, ensemble_run_id) { + return(!is_empty(x[data_id == ensemble_data_id & run_id == ensemble_run_id])) + }, + ensemble_data_id = evaluate_task_list[[ii]]@ensemble_data_id, + ensemble_run_id = evaluate_task_list[[ii]]@ensemble_run_id + )] + # Create corresponding model object names. + evaluate_task_list[[ii]]@model_files <- unname(sapply( + selected_run_tables, + function(x, ...) { + get_object_file_name( + object_type = "familiarModel", + data_id = tail(x, n = 1L)$data_id, + run_id = tail(x, n = 1L)$run_id, + ... + ) + }, + learner = evaluate_task_list[[ii]]@learner, + vimp_method = evaluate_task_list[[ii]]@vimp_method, + project_id = evaluate_task_list[[ii]]@project_id, + dir_path = file_paths$mb_dir + )) } - browser() # train and variable importance tasks ---------------------------------------- task_list <- .generate_trainer_tasks( experiment_data = experiment_data, + learners = learners, vimp_methods = vimp_methods, file_paths = file_paths, skip_existing = skip_existing, ... ) + + return(c( + task_list, + evaluate_task_list, + collect_task_list + )) +} + + + +.run_evaluation <- function( + cl, + tasks, + message_indent = 0L, + verbose, + ... +) { + + # Check that any tasks are available for processing. + if (is_empty(tasks$evaluate) || is_empty(tasks$collect)) return(invisible(FALSE)) + + # Determine which evaluation tasks need to be performed. + finished_tasks <- sapply(tasks$evaluate, .file_exists) + unfinished_tasks <- tasks$evaluate[!finished_tasks] + finished_tasks <- tasks$evaluate[finished_tasks] + + # Process any unfinished tasks. + if (length(unfinished_tasks) > 0L) { + ..run_evaluation( + cl = cl, + tasks = unfinished_tasks, + message_indent = message_indent, + verbose = verbose, + ... + ) + } + + # Determine which collection tasks are required. + finished_tasks <- sapply(tasks$train, .file_exists) + unfinished_tasks <- tasks$train[!finished_tasks] + finished_tasks <- tasks$train[finished_tasks] + + # Process any unfinished tasks. + if (length(unfinished_tasks) > 0L) { + ..run_collection( + tasks = unfinished_tasks, + message_indent = message_indent, + verbose = verbose, + ... + ) + } + + return(invisible(TRUE)) +} + + + +..run_evaluation <- function( + tasks, + cl, + settings, + message_indent = 0L, + verbose, + ... +) { + + # Message that evaluation is starting. + logger_message( + paste0( + "Evaluation: Starting model evaluation." + ), + indent = message_indent, + verbose = verbose + ) + + # Set outer vs. inner loop parallelisation. + if (settings$eval$do_parallel %in% c("TRUE", "inner")) { + cl_inner <- cl + cl_outer <- NULL + + } else if (settings$eval$do_parallel %in% c("outer")) { + cl_inner <- NULL + cl_outer <- cl + + logger_message( + paste0( + "Evaluation: Parallel processing is done in the outer loop. ", + "No progress can be displayed." + ), + indent = message_indent, + verbose = verbose && !is.null(cl_outer) + ) + + } else { + cl_inner <- cl_outer <- NULL + } + + fam_mapply_lb( + cl = cl, + assign = "all", + FUN = .perform_task, + progress_bar = !is.null(cl_outer), + object = tasks, + MoreArgs = list( + "data" = NULL, + "return_results" = FALSE, + "settings" = settings, + "message_indent" = message_indent + 1L, + "verbose" = verbose && is.null(cl_outer), + ... + ) + ) + + # Message that variable importances have been computed. + logger_message( + paste0( + "Evaluation: Models were evaluated.\n" + ), + indent = message_indent, + verbose = verbose + ) +} + + + +..run_collection <- function( + tasks, + message_indent = 0L, + verbose, + ... +) { + # Message that evaluation is starting. + logger_message( + paste0( + "Evaluation: Starting collection of evaluation datasets." + ), + indent = message_indent, + verbose = verbose + ) + + fam_mapply_lb( + cl = NULL, + assign = "all", + FUN = .perform_task, + progress_bar = verbose, + object = tasks, + MoreArgs = list( + "data" = NULL, + "return_results" = FALSE, + "message_indent" = message_indent + 1L, + "verbose" = verbose, + ... + ) + ) + + # Message that variable importances have been computed. + logger_message( + paste0( + "Evaluation: Evaluation datasets were collected.\n" + ), + indent = message_indent, + verbose = verbose + ) } diff --git a/R/TaskLearn.R b/R/TaskLearn.R index 970e1880..86ff810d 100644 --- a/R/TaskLearn.R +++ b/R/TaskLearn.R @@ -27,7 +27,6 @@ setClass( - # .set_file_name (train task) -------------------------------------------------- setMethod( ".set_file_name",