Skip to content

Commit

Permalink
WIP on translating the main data preprocessing, variable importance a…
Browse files Browse the repository at this point in the history
…nd training engine to a task-based engine.
  • Loading branch information
alexzwanenburg committed Oct 25, 2024
1 parent bddbccb commit cc0cc92
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 12 deletions.
21 changes: 21 additions & 0 deletions R/FamiliarS4Classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -1565,3 +1565,24 @@ setClass(
prediction_data = NULL
)
)



# familiarTask object ----------------------------------------------------------
# Base S4 class for tasks. Slots:
# - name: character, defaults to NA.
# - data_id: integer, defaults to NA.
# - run_id: integer, defaults to NA.
# - file: character path, defaults to NA (no file set).
# - project_id: any type, defaults to NULL.
setClass(
  Class = "familiarTask",
  slots = c(
    name = "character",
    data_id = "integer",
    run_id = "integer",
    file = "character",
    project_id = "ANY"
  ),
  prototype = methods::prototype(
    name = NA_character_,
    data_id = NA_integer_,
    run_id = NA_integer_,
    file = NA_character_,
    project_id = NULL
  )
)
25 changes: 13 additions & 12 deletions R/FamiliarS4Generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -346,12 +346,7 @@ setGeneric("preprocess_vimp_table", function(x, ...) standardGeneric("preprocess

# S4 generic `remove_signature_features`. Dispatches on `x`; any further
# arguments are handed to the selected method through `...`.
setGeneric(
  name = "remove_signature_features",
  def = function(x, ...) standardGeneric("remove_signature_features")
)

# S4 generic `update_vimp_table_to_reference`. Dispatches on `x`; any further
# arguments are handed to the selected method through `...`.
#
# The generic was previously defined twice in a row with the same signature.
# Only the later (un-braced) definition took effect, so only that one is kept.
setGeneric(
  "update_vimp_table_to_reference",
  function(x, ...) standardGeneric("update_vimp_table_to_reference")
)

# S4 generic `collect_vimp_table`. Dispatches on `x`; any further arguments
# are handed to the selected method through `...`.
setGeneric(
  name = "collect_vimp_table",
  def = function(x, ...) standardGeneric("collect_vimp_table")
)

Expand All @@ -370,9 +365,15 @@ setGeneric("feature_info_complete", function(object, ...) standardGeneric("featu

# S4 generic `add_feature_info_parameters`. Methods can dispatch on `object`
# and `data`; any further arguments are passed on through `...`.
setGeneric(
  name = "add_feature_info_parameters",
  def = function(object, data, ...) {
    standardGeneric("add_feature_info_parameters")
  }
)

# S4 generic `apply_feature_info_parameters`. Methods can dispatch on `object`
# and `data`; any further arguments are passed on through `...`.
#
# The generic was previously defined twice in a row with the same signature.
# Only the later (un-braced) definition took effect, so only that one is kept.
setGeneric(
  "apply_feature_info_parameters",
  function(object, data, ...) standardGeneric("apply_feature_info_parameters")
)



# task methods -----------------------------------------------------------------
# S4 generic `.set_file_name`. Dispatches on `object`; method-specific
# arguments (the methods in this package take `file_paths`) travel via `...`.
setGeneric(
  name = ".set_file_name",
  def = function(object, ...) standardGeneric(".set_file_name")
)

# S4 generic `.file_exists`. Dispatches on `object`; any further arguments
# are handed to the selected method through `...`.
setGeneric(
  name = ".file_exists",
  def = function(object, ...) standardGeneric(".file_exists")
)

# S4 generic `.perform_task`. Dispatches on `object`; everything a specific
# task needs to run is supplied by the caller through `...`.
setGeneric(
  name = ".perform_task",
  def = function(object, ...) standardGeneric(".perform_task")
)

# S4 generic `.get_task_descriptor`. Dispatches on `object`; any further
# arguments are handed to the selected method through `...`.
setGeneric(
  name = ".get_task_descriptor",
  def = function(object, ...) standardGeneric(".get_task_descriptor")
)
239 changes: 239 additions & 0 deletions R/TaskFeatureInfo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# familiarTaskGenericFeatureInfo -----------------------------------------------
# Task class for creating generic feature information.
#
# The class extends familiarTask: the prototype below sets the `name` slot, and
# the methods for this class read the `file` and `project_id` slots, all of
# which are declared by familiarTask. Without `contains`, the class would have
# no slots and the prototype could not be constructed.
setClass(
  "familiarTaskGenericFeatureInfo",
  contains = "familiarTask",
  prototype = methods::prototype(
    name = "create_generic_feature_info"
  )
)


# .set_file_name (generic feature info task) -----------------------------------
# Sets the `file` slot of a generic feature info task.
#
# @param object A familiarTaskGenericFeatureInfo object.
# @param file_paths List-like object with a `process_data_dir` element. If
#   NULL, the object is returned unchanged.
# @return The task object; `file` is updated when `file_paths` is provided.
setMethod(
  f = ".set_file_name",
  signature = signature(object = "familiarTaskGenericFeatureInfo"),
  definition = function(object, file_paths = NULL) {
    if (!is.null(file_paths)) {
      # The file name is the project identifier followed by a fixed suffix,
      # placed in the directory for processed data.
      target_file <- file.path(
        file_paths$process_data_dir,
        paste0(object@project_id, "_generic_feature_info.RDS")
      )

      # Normalise according to the OS; the file does not have to exist yet.
      object@file <- normalizePath(target_file, mustWork = FALSE)
    }

    return(object)
  }
)



# .get_task_descriptor (generic feature info task) -----------------------------
# Returns the descriptor of a generic feature info task, which is the value of
# its `name` slot.
setMethod(
  f = ".get_task_descriptor",
  signature = signature(object = "familiarTaskGenericFeatureInfo"),
  definition = function(object, ...) {
    task_descriptor <- object@name

    return(task_descriptor)
  }
)



# .perform_task (generic feature info task) ------------------------------------
# Creates generic feature information from the data.
#
# @param object A familiarTaskGenericFeatureInfo object.
# @param data Data from which feature information is extracted. If this is a
#   dataObject, its outcome type takes precedence over `outcome_type`.
# @param outcome_type Outcome type; required when `data` is not a dataObject.
# @param descriptor Passed on to `.get_generic_feature_info`.
# @return The feature info list if the task has no file set (`file` is NA).
#   Otherwise the list is written to `object@file` using saveRDS, and TRUE is
#   returned invisibly.
setMethod(
  ".perform_task",
  signature(object = "familiarTaskGenericFeatureInfo"),
  function(
    object,
    data,
    outcome_type = NULL,
    descriptor = NULL
  ) {
    if (methods::is(data, "dataObject")) outcome_type <- data@outcome_type
    if (is.null(outcome_type)) {
      ..error_reached_unreachable_code("outcome_type is expected to be provided")
    }

    # Extract basic feature information from the data. The descriptor supplied
    # by the caller is forwarded instead of being replaced by NULL.
    feature_info_list <- .get_generic_feature_info(
      data = data,
      outcome_type = outcome_type,
      descriptor = descriptor
    )

    # Return directly if no file is set. Note that the file slot of the task
    # is checked here, not the base function `file`.
    if (is.na(object@file)) return(feature_info_list)

    # Write to file.
    saveRDS(feature_info_list, file = object@file)

    return(invisible(TRUE))
  }
)



# familiarTaskFeatureInfo ------------------------------------------------------
# Task class for creating run-specific feature information.
#
# The class extends familiarTask: the prototype below sets the `name` slot, and
# the methods for this class read the `data_id`, `run_id`, `file` and
# `project_id` slots, all of which are declared by familiarTask. Without
# `contains`, the class would have no slots and the prototype could not be
# constructed.
setClass(
  "familiarTaskFeatureInfo",
  contains = "familiarTask",
  prototype = methods::prototype(
    name = "create_feature_info"
  )
)



# .set_file_name (feature info task) -------------------------------------------
# Sets the `file` slot of a run-specific feature info task.
#
# @param object A familiarTaskFeatureInfo object.
# @param file_paths List-like object with a `process_data_dir` element. If
#   NULL, the object is returned unchanged.
# @return The task object; `file` is updated when `file_paths` is provided.
setMethod(
  f = ".set_file_name",
  signature = signature(object = "familiarTaskFeatureInfo"),
  definition = function(object, file_paths = NULL) {
    if (!is.null(file_paths)) {
      # The file name combines the project, data and run identifiers.
      target_name <- paste0(
        object@project_id, "_",
        object@data_id, "_",
        object@run_id, "_feature_info.RDS"
      )
      target_file <- file.path(file_paths$process_data_dir, target_name)

      # Normalise according to the OS; the file does not have to exist yet.
      object@file <- normalizePath(target_file, mustWork = FALSE)
    }

    return(object)
  }
)



# .get_task_descriptor (feature info task) -------------------------------------
# Returns the descriptor of a run-specific feature info task: the task name,
# data identifier and run identifier, joined by underscores.
setMethod(
  f = ".get_task_descriptor",
  signature = signature(object = "familiarTaskFeatureInfo"),
  definition = function(object, ...) {
    task_descriptor <- paste(
      object@name,
      object@data_id,
      object@run_id,
      sep = "_"
    )

    return(task_descriptor)
  }
)



# .perform_task (feature info task) --------------------------------------------
# Determines run-specific feature information (pre-processing parameters).
#
# @param object A familiarTaskFeatureInfo object.
# @param data Data that is used to create generic feature information when
#   `feature_info_list` is not provided.
# @param settings Passed on to `determine_preprocessing_parameters`.
# @param feature_info_list Feature info list that serves as a template. If
#   NULL, a template is created ad hoc from `data`.
# @param project_info List-like object; its `project_id` element is used for
#   the ad-hoc generic feature info task, and the object itself is passed on
#   to `determine_preprocessing_parameters`.
# @param message_indent Indentation level of log messages.
# @param verbose Whether log messages are shown.
# @param cl Passed on to `determine_preprocessing_parameters`.
# @return The feature info list if the task has no file set (`file` is NA).
#   Otherwise the list is written to `object@file` using saveRDS, and TRUE is
#   returned invisibly.
setMethod(
  ".perform_task",
  signature(object = "familiarTaskFeatureInfo"),
  function(
    object,
    data,
    settings,
    feature_info_list = NULL,
    project_info = NULL,
    message_indent = 0L,
    verbose = FALSE,
    cl = NULL
  ) {
    # Identify the task in the log by its run and data identifiers. These are
    # the identifying slots a task has; it has no task_id or n_tasks slots.
    logger_message(
      paste0(
        "\nPre-processing: Starting preprocessing for run ",
        object@run_id, " (data id ",
        object@data_id, ")."
      ),
      indent = message_indent,
      verbose = verbose
    )

    # Check that a feature info list is provided, otherwise create an ad-hoc
    # list as a template.
    if (is.null(feature_info_list)) {
      # Set up task, and explicitly don't write to file.
      generic_feature_info_task <- methods::new(
        "familiarTaskGenericFeatureInfo",
        project_id = project_info$project_id,
        file = NA_character_
      )

      # Execute the task. The data must be passed explicitly, because the
      # generic feature info is derived from it.
      # NOTE(review): if data is not a dataObject, the generic feature info
      # task also requires outcome_type -- confirm what callers provide.
      feature_info_list <- .perform_task(
        generic_feature_info_task,
        data = data
      )
    }

    # Update feature info list.
    # NOTE(review): data is not passed on here -- confirm that
    # determine_preprocessing_parameters obtains the data by other means.
    feature_info_list <- determine_preprocessing_parameters(
      cl = cl,
      feature_info_list = feature_info_list,
      data_id = object@data_id,
      run_id = object@run_id,
      project_info = project_info,
      settings = settings,
      message_indent = message_indent + 1L,
      verbose = verbose
    )

    # Return directly if no file is set.
    if (is.na(object@file)) return(feature_info_list)

    # Write to file.
    saveRDS(feature_info_list, file = object@file)

    return(invisible(TRUE))
  }
)





# Generates the list of data pre-processing tasks that still need to be run.
#
# One generic feature info task is set up, followed by one feature info task
# for every combination of data_id (outer loop) and run_id (inner loop). Tasks
# whose output file already exists on disk are left out.
#
# @param data_ids Data identifiers to iterate over.
# @param run_ids Run identifiers to iterate over.
# @param file_paths Passed on to `.set_file_name` to determine file names.
# @param project_id Project identifier that is stored with each task.
# @return A list of task objects without gaps; empty if nothing is left to do.
..generate_data_preprocessing_tasks <- function(
  data_ids,
  run_ids,
  file_paths,
  project_id
) {
  # Set up all candidate tasks first. Pending tasks are selected afterwards, so
  # that skipping a task can never leave an empty (NULL) element in the list.
  n_tasks <- 1L + length(data_ids) * length(run_ids)
  task_list <- vector("list", n_tasks)

  # Create task for generic feature info, and add its file name.
  generic_info_task <- methods::new(
    "familiarTaskGenericFeatureInfo",
    project_id = project_id
  )
  task_list[[1L]] <- .set_file_name(
    object = generic_info_task,
    file_paths = file_paths
  )

  ii <- 1L
  for (data_id in data_ids) {
    for (run_id in run_ids) {
      # Create task to generate run-specific feature info.
      run_info_task <- methods::new(
        "familiarTaskFeatureInfo",
        data_id = data_id,
        run_id = run_id,
        project_id = project_id
      )

      # Add file name, and store the task.
      ii <- ii + 1L
      task_list[[ii]] <- .set_file_name(
        object = run_info_task,
        file_paths = file_paths
      )
    }
  }

  # Keep only tasks for which the file does not exist on disk.
  is_pending <- vapply(
    task_list,
    function(task) !.file_exists(task),
    logical(1L)
  )

  return(task_list[is_pending])
}
75 changes: 75 additions & 0 deletions R/TaskMain.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#' @include FamiliarS4Generics.R
#' @include FamiliarS4Classes.R
NULL


# .file_exists (generic task) --------------------------------------------------
# Checks whether the file of a task exists on disk.
#
# @param object A familiarTask object (or an object of a derived class).
# @return FALSE if no file is set (zero-length or NA `file` slot); otherwise
#   the result of file.exists for the file.
setMethod(
  ".file_exists",
  signature(object = "familiarTask"),
  function(object, ...) {
    # Check for an empty slot before calling is.na: is.na on a zero-length
    # value yields a zero-length logical, which is not a valid condition.
    if (length(object@file) == 0L || is.na(object@file)) return(FALSE)

    return(file.exists(object@file))
  }
)


# Skeleton for generating trainer tasks (work in progress).
#
# The identifiers that are iterated over are parameters, so that the function
# does not depend on objects that happen to exist outside of it. All default to
# NULL, in which case nothing is iterated over.
#
# @param data_ids Data identifiers to iterate over.
# @param run_ids Run identifiers to iterate over.
# @param vimp_methods Variable importance methods to iterate over.
# @param learners Learners to iterate over.
# @return NULL, invisibly: no tasks are generated yet.
.generate_trainer_tasks <- function(
  data_ids = NULL,
  run_ids = NULL,
  vimp_methods = NULL,
  learners = NULL
) {

  for (data_id in data_ids) {
    for (run_id in run_ids) {
      for (vimp_method in vimp_methods) {
        for (learner in learners) {
          # Set up trainer task.

          # Set up hyperparameter extraction task.

        }
      }
    }
  }

  # Add tasks related to variable importance objects.

  # Add tasks related to data processing for learners.

  return(invisible(NULL))
}


# Skeleton for generating variable importance tasks (work in progress).
#
# The identifiers that are iterated over are parameters, so that the function
# does not depend on objects that happen to exist outside of it. All default to
# NULL, in which case nothing is iterated over.
#
# @param data_ids Data identifiers to iterate over.
# @param run_ids Run identifiers to iterate over.
# @param vimp_methods Variable importance methods to iterate over.
# @return NULL, invisibly: no tasks are generated yet.
.generate_vimp_tasks <- function(
  data_ids = NULL,
  run_ids = NULL,
  vimp_methods = NULL
) {

  # Check if vimp should be computed separately or is computed during
  # hyperparameter optimisation.

  for (data_id in data_ids) {
    for (run_id in run_ids) {
      for (vimp_method in vimp_methods) {

        # Check if the variable importance method requires any computation.
        # For example, signature_only, none and random do not require
        # computation.

        # Set up variable importance computation task.

        # Set up variable importance hyperparameter task.

      }
    }
  }

  # Add tasks related to data processing for vimp methods.

  return(invisible(NULL))
}



# Placeholder without implementation. Takes no arguments and evaluates to
# NULL, as a function with an empty body does.
.generate_learner_data_preprocessing_tasks <- function() {
  NULL
}


# Placeholder without implementation. Takes no arguments and evaluates to
# NULL, as a function with an empty body does.
.generate_vimp_data_preprocessing_tasks <- function() {
  NULL
}


0 comments on commit cc0cc92

Please sign in to comment.