WIP on data tasks.

alexzwanenburg · Jan 2, 2025 · 2729584 · 2729584
1 parent fb4c880
commit 2729584
Show file tree

Hide file tree

Showing 8 changed files with 49 additions and 4 deletions.
diff --git a/R/DataObject.R b/R/DataObject.R
@@ -703,7 +703,7 @@ setMethod(
     keep_novelty = FALSE,
     ...
   ) {
-
+    browser()
     # Check whether model data- and run-ids should be used.
     if (data@defer_to_model_data_and_run_id) {
       data@data_id <- object@data_id

diff --git a/R/FamiliarDataComputation.R b/R/FamiliarDataComputation.R
@@ -497,6 +497,7 @@ setMethod(
     verbose = FALSE,
     ...
 ) {
+  browser()
   ## Compute distance between features ---------------------------------------
   feature_similarity <- NULL
   if (any(c("model_vimp", "feature_similarity", "univariate_analysis",

diff --git a/R/FamiliarS4Classes.R b/R/FamiliarS4Classes.R
@@ -36,6 +36,9 @@
 #' @slot calibration_info Calibration information, e.g. baseline survival in the
 #'   development cohort.
 #' @slot km_info Data concerning stratification into risk groups.
+#' @slot data_id Internal identifier for the dataset used to train the model.
+#' @slot run_id Internal identifier for the specific subset of the dataset
+#'   used to train the model.
 #' @slot run_table Run table for the data used to train the model. Used
 #'   internally.
 #' @slot settings A copy of the evaluation configuration parameters used at
@@ -90,6 +93,10 @@ setClass("familiarModel",
     model_features = "ANY",
     # Features that are required for novelty detection.
     novelty_features = "ANY",
+    # data_id for the data used to train the model.
+    data_id = "integer",
+    # run_id for the data used to train the model.
+    run_id = "integer",
     # Run table for the current model
     run_table = "ANY",
     # Information required to assess model calibrations (e.g. baseline survival)
@@ -133,6 +140,8 @@ setClass("familiarModel",
     novelty_features = NULL,
     calibration_info = NULL,
     km_info = NULL,
+    data_id = NA_integer_,
+    run_id = NA_integer_,
     run_table = NULL,
     settings = NULL,
     is_trimmed = FALSE,
@@ -172,6 +181,9 @@ setClass("familiarModel",
 #'   models in the ensemble,
 #' @slot novelty_features The combined set of features that is used to train all
 #'   novelty detectors in the ensemble.
+#' @slot data_id Internal identifier for the dataset used to train the ensemble.
+#' @slot run_id Internal identifier for the specific subset of the dataset
+#'   used to train the ensemble.
 #' @slot run_table Run table for the data used to train the ensemble. Used
 #'   internally.
 #' @slot calibration_info Calibration information, e.g. baseline survival in the
@@ -214,6 +226,10 @@ setClass("familiarEnsemble",
     model_features = "ANY",
     # Features that are required for novelty detection.
     novelty_features = "ANY",
+    # data_id for the data used to train the ensemble.
+    data_id = "integer",
+    # run_id for the data used to train the ensemble.
+    run_id = "integer",
     # Set of run tables for the current ensemble. This is only required for
     # processing internal data.
     run_table = "ANY",
@@ -246,6 +262,8 @@ setClass("familiarEnsemble",
     required_features = NULL,
     model_features = NULL,
     novelty_features = NULL,
+    data_id = NA_integer_,
+    run_id = NA_integer_,
     run_table = NULL,
     calibration_info = NULL,
     model_dir_path = NA_character_,

diff --git a/R/TaskEvaluate.R b/R/TaskEvaluate.R
@@ -239,17 +239,30 @@ setMethod(
       ..error_reached_unreachable_code("outcome_info is required.")
     }
 
+    # Set up a delayed data object; no data are attached at this point.
     data <- methods::new(
-      "dataObject",
+      "delayedDataObject",
       data = NULL,
       preprocessing_level = "none",
       outcome_type = outcome_info@outcome_type,
       outcome_info = outcome_info,
       validation = object@validation,
-      delay_loading = TRUE,
       aggregate_on_load = FALSE
     )
 
+    # Set the data_id and run_id for the data itself.
+    if (object@force_ensemble_detail_level) {
+      data@data_id <- object@ensemble_data_id
+      data@run_id <- object@ensemble_run_id
+
+    } else {
+      data@data_id <- object@data_id
+      data@run_id <- object@run_id
+    }
+
+    # Determine whether model data and run ids should be used for predictions.
+    data@defer_to_model_data_and_run_id <- object@get_predictions_at_model_level
+
     # Pass to method that dispatches with dataObject for further processing.
     return(.perform_task(
       object = object,

diff --git a/R/TaskLearn.R b/R/TaskLearn.R
@@ -253,7 +253,7 @@ setMethod(
       rank_threshold = vimp_rank_threshold
     )
 
-    # Create the raw model object for training..
+    # Create the raw model object for training.
     model_object <- methods::new(
       "familiarModel",
       outcome_type = data@outcome_type,
@@ -264,6 +264,8 @@ setMethod(
       learner = object@learner,
       feature_info = feature_info_list,
       outcome_info = data@outcome_info,
+      data_id = object@data_id,
+      run_id = object@run_id,
       run_table = .get_current_run_table(object = object),
       settings = settings$eval,
       project_id = object@project_id

diff --git a/man/familiarEnsemble-class.Rd b/man/familiarEnsemble-class.Rd
diff --git a/man/familiarModel-class.Rd b/man/familiarModel-class.Rd
diff --git a/tests/testthat/test-task_based_workflow.R b/tests/testthat/test-task_based_workflow.R
@@ -178,6 +178,7 @@ data <- familiar:::test_create_small_good_data("binomial")
 results <- familiar::summon_familiar(
   data = data,
   experimental_design = "bs(fs,3)+bs(mb, 3)",
+  evaluation_elements = "auc_data",
   vimp_method = "mim",
   learner = "glm_logistic",
   evaluate_top_level_only = FALSE,