diff --git a/R/DataPreProcessing.R b/R/DataPreProcessing.R
index d7b3bf2d..0b185a05 100644
--- a/R/DataPreProcessing.R
+++ b/R/DataPreProcessing.R
@@ -1,14 +1,14 @@
-.get_feature_info_file_name <- function(file_paths, project_id) {
-  # Generate file name of pre-processing file
-  file_name <- paste0(project_id, "_feature_info.RDS")
-
-  # Add file path and normalise according to the OS
-  file_name <- normalizePath(file.path(
-    file_paths$process_data_dir, file_name
-  ), mustWork = FALSE)
-
-  return(file_name)
-}
+# .get_feature_info_file_name <- function(file_paths, project_id) {
+#   # Generate file name of pre-processing file
+#   file_name <- paste0(project_id, "_feature_info.RDS")
+#
+#   # Add file path and normalise according to the OS
+#   file_name <- normalizePath(file.path(
+#     file_paths$process_data_dir, file_name
+#   ), mustWork = FALSE)
+#
+#   return(file_name)
+# }
diff --git a/R/DataServerBackend.R b/R/DataServerBackend.R
index 8d80626e..6c53fc6c 100644
--- a/R/DataServerBackend.R
+++ b/R/DataServerBackend.R
@@ -401,7 +401,7 @@ get_feature_info_from_backend <- function(
   } else if (!is.null(data_id) && !is.null(run_id)) {
     # Retrieve run-specific feature information.
     x <- get("master_feature_info_list", envir = data_env)[[
-      .get_feature_info_list_name(data_id = data_id, run_id = run_id)
+      paste0(data_id, ".", run_id)
     ]]
 
   } else {
diff --git a/R/ExperimentData.R b/R/ExperimentData.R
index 068857ad..21f4cbe4 100644
--- a/R/ExperimentData.R
+++ b/R/ExperimentData.R
@@ -74,30 +74,49 @@ load_experiment_data <- function(x, file_paths) {
 
   # Start writing feature information.
   if (!is.null(x@feature_info)) {
-    # Set file name
-    file_name <- .get_feature_info_file_name(
-      file_paths = file_paths,
-      project_id = x@project_id
-    )
-
-    # Check if the directory exists, and create it otherwise.
-    if (!dir.exists(dirname(file_name))) {
-      dir.create(dirname(file_name), recursive = TRUE)
-    }
+    for (feature_info_name in names(x@feature_info)) {
+      feature_info <- x@feature_info[[feature_info_name]]
+
+      # Set file name.
+      if (feature_info_name == "generic") {
+        file_name <- get_object_file_name(
+          object_type = "genericFeatureInfo",
+          project_id = feature_info@project_id,
+          dir_path = file_paths$process_data_dir
+        )
+
+      } else {
+        file_name <- get_object_file_name(
+          object_type = "featureInfo",
+          project_id = feature_info@project_id,
+          data_id = feature_info@data_id,
+          run_id = feature_info@run_id,
+          dir_path = file_paths$process_data_dir
+        )
+      }
+
+      # Check if the directory exists, and create it otherwise.
+      if (!dir.exists(file_paths$process_data_dir)) {
+        dir.create(file_paths$process_data_dir, recursive = TRUE)
+      }
 
-    # Write to file.
-    saveRDS(x@feature_info, file = file_name)
+      # Write to file.
+      saveRDS(feature_info, file = file_name)
+    }
   }
 
   # Write variable importance information.
   if (!is.null(x@vimp_table_list)) {
-    for (vimp_method in names(x@vimp_table_list)) {
+    for (vimp_table in x@vimp_table_list) {
       # Set file name
-      file_name <- .get_variable_importance_data_filename(
-        project_id = x@project_id,
-        vimp_method = vimp_method,
-        file_paths = file_paths
+      file_name <- get_object_file_name(
+        object_type = "vimpTable",
+        data_id = vimp_table@data_id,
+        run_id = vimp_table@run_id,
+        vimp_method = vimp_table@vimp_method,
+        project_id = vimp_table@project_id,
+        dir_path = file_paths$vimp_dir
       )
 
       # Check if the directory exists, and create it otherwise.
@@ -106,7 +125,7 @@
       }
 
       # Write to file.
-      saveRDS(x@vimp_table_list[[vimp_method]], file = file_name)
+      saveRDS(vimp_table, file = file_name)
     }
   }
 
diff --git a/R/FamiliarS4Classes.R b/R/FamiliarS4Classes.R
index 93eb05d1..e04b3a0c 100644
--- a/R/FamiliarS4Classes.R
+++ b/R/FamiliarS4Classes.R
@@ -989,7 +989,7 @@ setClass("vimpTable",
     # Variable importance method that generated the current variable
    # importance table.
     vimp_method = "character",
-    # Run table for the current model
+    # Run table for the current table.
     run_table = "ANY",
     # Set how scores from encoded features should be aggregated.
     score_aggregation = "character",
diff --git a/R/TrainS4Methods.R b/R/TrainS4Methods.R
index 54f4bebb..c1747f54 100644
--- a/R/TrainS4Methods.R
+++ b/R/TrainS4Methods.R
@@ -83,22 +83,22 @@ setMethod(
     # testing of very rare cases where preprocessing may run fine, but the
     # subsample does not allow for training.
     if (is.null(data_bypass)) data_bypass <- data
-    
+
     # Prepare settings ---------------------------------------------------------
-    
+
     # Reconstitute settings from the data.
     settings <- extract_settings_from_data(data_bypass)
-    
+
     # Update some missing settings that can be fixed within this method.
     settings$data$train_cohorts <- unique(data_bypass@data[[get_id_columns(single_column = "batch")]])
-    
+
     # Parse the remaining settings that are important. Remove outcome_type from
     # ... This prevents an error caused by multiple matching arguments.
     dots <- list(...)
     dots$parallel <- NULL
     dots$vimp_method <- NULL
     dots$hyperparameter <- NULL
-    
+
     # Create setting_hyperparam so that it can be parsed correctly.
     if (!learner %in% names(hyperparameter_list) && length(hyperparameter_list) > 0L) {
       setting_hyperparam <- list()
@@ -106,7 +106,7 @@
     } else {
       setting_hyperparam <- hyperparameter_list
     }
-    
+
     # Determine if a naive model should be forced.
     vimp_method <- ifelse(create_naive, "no_features", "none")
 
@@ -127,14 +127,14 @@
     # Push settings to the backend.
     .assign_settings_to_global(settings = settings)
-    
+
     # Prepare outcome_info -----------------------------------------------------
-    
+
     # Create a generic outcome object
     outcome_info <- data_bypass@outcome_info
-    
+
     # Prepare featureInfo objects ----------------------------------------------
-    
+
     # Create a list of featureInfo objects.
     feature_info_list <- .get_feature_info_data(
       data = data_bypass@data,
@@ -142,10 +142,10 @@
       project_id = character(),
       outcome_type = settings$data$outcome_type
     )
-    
+
     # Extract the generic data.
     feature_info_list <- feature_info_list[["generic"]]
-    
+
     # Perform some pre-processing (i.e. remove singular features)
     feature_info_list <- .determine_preprocessing_parameters(
       cl = cl,
@@ -154,50 +154,50 @@
       settings = settings,
       verbose = FALSE
     )
-    
+
     # Remove invariant features from the data
     data <- filter_features(
       data = data,
       available_features = get_available_features(feature_info_list = feature_info_list)
     )
-    
+
     # Find features that are required for processing the data
     required_features <- get_required_features(
       x = data,
       feature_info_list = feature_info_list
     )
-    
+
     # Find important features, i.e. those that constitute the signature either
     # individually or as part of a cluster.
     model_features <- get_model_features(
       x = data,
       feature_info_list = feature_info_list
     )
-    
+
     # Naive models do not require features.
     if (create_naive) required_features <- model_features <- NULL
 
     # Prepare hyperparameters --------------------------------------------------
-    
+
     # Get default hyperparameters.
     param_list <- .get_preset_hyperparameters(
       data = data,
       learner = learner,
       names_only = FALSE
     )
-    
+
     # Update with user-provided settings.
     param_list <- .update_hyperparameters(
       parameter_list = param_list,
       user_list = settings$mb$hyper_param[[learner]]
     )
-    
+
     # Determine which hyperparameters still need to be specified.
     unset_parameters <- sapply(
       param_list,
       function(hyperparameter_entry) hyperparameter_entry$randomise
     )
-    
+
     # Raise an error if any hyperparameters were not set.
     if (any(unset_parameters)) {
       ..error(paste0(
@@ -205,15 +205,15 @@
         "The following hyperparameters need to be specified: ",
         paste_s(names(unset_parameters)[unset_parameters])
       ))
     }
-    
+
     # Obtain the final list of hyperparameters.
     param_list <- lapply(
       param_list,
       function(hyperparameter_entry) hyperparameter_entry$init_config
     )
-    
+
     # Prepare model and data ---------------------------------------------------
-    
+
     # Create familiar model
     object <- methods::new(
       "familiarModel",
@@ -230,17 +230,17 @@
       settings = settings$eval,
       project_id = 0L
     )
-    
+
     # Add package version.
     object <- add_package_version(object = object)
-    
+
     # Process data.
     data <- process_input_data(
       object = object,
       data = data,
       stop_at = "clustering"
     )
-    
+
     # Create bootstraps.
     if (create_bootstrap) {
       data <- select_data_from_samples(
@@ -252,7 +252,7 @@
         )
       )
     }
-    
+
     # Train model.
     object <- .train(
       object = object,
@@ -261,7 +261,7 @@
       trim_model = trim_model,
       timeout = Inf
     )
-    
+
     # Train novelty detector.
     object <- .train_novelty_detector(
       object = object,
@@ -269,14 +269,14 @@
       detector = ifelse(create_novelty_detector, "isolation_forest", "none"),
       get_additional_info = TRUE
     )
-    
+
     # Generate a placeholder name for the familiarModel object
     object <- set_object_name(x = object)
-    
+
     return(object)
   }
 )
 
 # test_train_novelty_detector (generic) ----------------------------------------
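
Note: get_object_file_name(), which replaces the removed helpers .get_feature_info_file_name() and .get_variable_importance_data_filename(), is not defined in this diff. The sketch below illustrates the naming scheme it is assumed to implement, inferred purely from the call sites in ExperimentData.R; the argument names match those call sites, but the suffixes and file name format are assumptions, not familiar's actual implementation.

# Hypothetical sketch only -- not the definition used by familiar.
get_object_file_name <- function(
    object_type,
    project_id,
    data_id = NULL,
    run_id = NULL,
    vimp_method = NULL,
    dir_path = NULL) {
  # Compose an object-specific suffix from the supplied identifiers.
  suffix <- switch(
    object_type,
    "genericFeatureInfo" = "generic_feature_info",
    "featureInfo" = paste0("feature_info_", data_id, ".", run_id),
    "vimpTable" = paste0("vimp_table_", vimp_method, "_", data_id, ".", run_id)
  )

  # Prefix the project identifier and append the RDS extension.
  file_name <- paste0(project_id, "_", suffix, ".RDS")

  # Prepend the directory and normalise the path according to the OS.
  if (!is.null(dir_path)) {
    file_name <- normalizePath(
      file.path(dir_path, file_name),
      mustWork = FALSE
    )
  }

  return(file_name)
}

The paste0(data_id, ".", run_id) key used in DataServerBackend.R mirrors the data_id and run_id slots carried by each run-specific featureInfo object, so entries in master_feature_info_list can be looked up directly, without the removed .get_feature_info_list_name() helper.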