From ca63273512801460eb2acc453bd156aab5d259fb Mon Sep 17 00:00:00 2001
From: Alex Zwanenburg <alexander.zwanenburg@nct-dresden.de>
Date: Fri, 11 Oct 2024 17:30:17 +0200
Subject: [PATCH] Added iteration seed as configuration parameter.

---
 NEWS.md                              | 43 ++++++++++----
 R/Iterations.R                       | 19 +++++--
 R/ParseSettings.R                    | 17 ++++++
 inst/config.xml                      |  2 +
 tests/testthat/test-iteration_seed.R | 84 ++++++++++++++++++++++++++++
 5 files changed, 150 insertions(+), 15 deletions(-)
 create mode 100644 tests/testthat/test-iteration_seed.R

diff --git a/NEWS.md b/NEWS.md
index f9c3c69e..f4b78344 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,39 +4,60 @@
 
 - Some functionality was deprecated because of redundancy and stability issues:
 
-  - The `count` outcome type has been deprecated. `count` is a subset of `continuous` outcomes. Its previous implementation did not provide any benefits over `continuous`.
+  - The `count` outcome type has been deprecated. `count` is a subset of 
+    `continuous` outcomes. Its previous implementation did not provide any 
+    benefits over `continuous`.
 
-  - Gradient boosting using the `mboost` package was deprecated. Use `xgboost` instead.
+  - Gradient boosting using the `mboost` package was deprecated. Use `xgboost` 
+    instead.
 
   - The `qvalue` package for computing q-values was deprecated.
 
-  - The `VGAM` package, which has been soft-deprecated since version 1.3.0, has now fully been deprecated.
+  - The `VGAM` package, which has been soft-deprecated since version 1.3.0, has 
+    now fully been deprecated.
 
-  - The variable hunting feature selection method for random forests was removed due to stability issues in unit tests.
+  - The variable hunting feature selection method for random forests was removed
+    due to stability issues in unit tests.
 
-- Many evaluation steps that only require model predictions can now be called externally by providing a `familiarDataElementPredictionTable` object that contains model prediction data. Such objects can be created using the `as_prediction_table` table.
+- Many evaluation steps that only require model predictions can now be called 
+  externally by providing a `familiarDataElementPredictionTable` object that 
+  contains model prediction data. Such objects can be created using the
+  `as_prediction_table` function.
 
 ## Minor changes
 
-- The `evaluation_elements` configuration parameter was added to allow for specifying which evaluation steps should be performed.
+- The `iteration_seed` configuration parameter was added to provide a fixed seed
+  for the sampling algorithms that create e.g. bootstraps and cross-validation
+  folds for the experiment. Providing a seed allows for reproducing the sample
+  division across different experiments.
 
-- Concordance variable importance for categorical outcomes now relies on the internal implementation for the area under the receiver operating characteristic curve instead of the Gini measure from the `corelearn` package.
+- The `evaluation_elements` configuration parameter was added to allow for
+  specifying which evaluation steps should be performed.
+
+- Concordance variable importance for categorical outcomes now relies on the 
+  internal implementation for the area under the receiver operating 
+  characteristic curve instead of the Gini measure from the `corelearn` package.
 
 - Palettes from the `paletteer` package can now be used to customise plots.
 
-- Plausibility of datasets is now checked more thoroughly to detect common issues:
+- Plausibility of datasets is now checked more thoroughly to detect common 
+  issues:
 
   - The presence of duplicate rows with the same feature values and outcome.
   
-  - The presence of one-to-one predictors of the outcome. This might be outcome-related columns that have been left in the data accidentally.
+  - The presence of one-to-one predictors of the outcome. This might be 
+    outcome-related columns that have been left in the data accidentally.
   
   - The presence of invariant predictors.
   
-- Statistical tests now assess differences between batches if batch normalisation is performed, and warns if the outcome in any batch is significantly different from others.
+- Statistical tests now assess differences between batches if batch 
+  normalisation is performed, and warn if the outcome in any batch is
+  significantly different from others.
 
 ## Bug fixes
 
-- Fixed errors when creating feature or similarity plots caused by sample or feature names matching internal column names.
+- Fixed errors when creating feature or similarity plots caused by sample or 
+  feature names matching internal column names.
 
 - The `sample_similarity` evaluation element is now mentioned in the documentation.
 
diff --git a/R/Iterations.R b/R/Iterations.R
index 9caea3ef..1b0e731c 100644
--- a/R/Iterations.R
+++ b/R/Iterations.R
@@ -262,6 +262,13 @@
   id_columns <- get_id_columns(id_depth = "series")
   batch_id_column <- get_id_columns(id_depth = "batch")
 
+  # Start random stream
+  if (is.null(settings$data$iteration_seed)) {
+    rstream_object <- NULL
+  } else {
+    rstream_object <- .start_random_number_stream(seed = settings$data$iteration_seed)
+  }
+  
   # Generate new iterations ----------------------------------------------------
 
   if (is.null(iteration_list)) {
@@ -401,7 +408,8 @@
                 train_or_validate = "train"
               ),
               settings = settings,
-              data = data
+              data = data,
+              rstream_object = rstream_object
             )
 
             # Append runs to the run list
@@ -441,7 +449,8 @@
               ),
               n_iter = n_iter,
               settings = settings,
-              data = data
+              data = data,
+              rstream_object = rstream_object
             )
             
             # Append runs to the run list
@@ -481,7 +490,8 @@
               n_rep = n_rep,
               n_folds = n_folds,
               settings = settings,
-              data = data
+              data = data,
+              rstream_object = rstream_object
             )
 
             # Append runs to the run list
@@ -518,7 +528,8 @@
             cv_iter_list <- .create_loocv(
               sample_identifiers = sample_identifiers,
               settings = settings,
-              data = data
+              data = data,
+              rstream_object = rstream_object
             )
 
             # Append runs to the run list
diff --git a/R/ParseSettings.R b/R/ParseSettings.R
index d21e9df4..a3282f59 100644
--- a/R/ParseSettings.R
+++ b/R/ParseSettings.R
@@ -746,6 +746,10 @@
 #' @param imbalance_n_partitions (*optional*) Number of times random
 #'   undersampling should be repeated. 10 undersampled subsets with balanced
 #'   classes are formed by default.
+#' @param iteration_seed (*optional*) Integer seed used in sampling algorithms
+#'   specified by the `experimental_design` argument. This allows for creating
+#'   the same sample assignments across different experiments, provided that
+#'   the same dataset is used. By default, no fixed seed is used.
 #' @param ... Unused arguments.
 #'
 #' @return List of parameters related to data parsing and the experiment.
@@ -773,6 +777,7 @@
     experimental_design = waiver(),
     imbalance_correction_method = waiver(),
     imbalance_n_partitions = waiver(),
+    iteration_seed = waiver(),
     ...
 ) {
   settings <- list()
@@ -821,6 +826,18 @@
     range = c(1L, Inf)
   )
 
+  # iteration_seed -------------------------------------------------------------
+  # Seed for sampling algorithms.
+  settings$iteration_seed <- .parse_arg(
+    x_config = config$iteration_seed,
+    x_var = iteration_seed,
+    var_name = "iteration_seed",
+    type = "integer",
+    optional = TRUE,
+    default = NULL
+  )
+  
+  
   # sample_id_column -----------------------------------------------------------
   # Sample identifier column
   settings$sample_col <- .parse_arg(
diff --git a/inst/config.xml b/inst/config.xml
index 5b2b67de..ace5de60 100644
--- a/inst/config.xml
+++ b/inst/config.xml
@@ -17,6 +17,8 @@
       <imbalance_correction_method></imbalance_correction_method>
       <!-- Number of partitions with random sampling (optional, default:10; only used when random_sample is selected as class imbalance correction method)-->
       <imbalance_n_partitions></imbalance_n_partitions>
+      <!-- Seed for sampling algorithms used in setting up the experiment (optional, default: no fixed seed) -->
+      <iteration_seed></iteration_seed>
        <!-- Name of data column with cohort identifiers (required for external validation experiments, otherwise optional) -->
       <batch_id_column></batch_id_column>
       <!-- Name of data column with sample identifiers (required for separate data and outcome files, otherwise optional) -->
diff --git a/tests/testthat/test-iteration_seed.R b/tests/testthat/test-iteration_seed.R
new file mode 100644
index 00000000..bd9a0679
--- /dev/null
+++ b/tests/testthat/test-iteration_seed.R
@@ -0,0 +1,84 @@
+# Skip all tests in this file on CRAN due to running time.
+testthat::skip_on_cran()
+
+# Create data.table with binomial outcome data shared by all assignments below.
+data <- familiar:::test_create_good_data(
+  outcome_type = "binomial",
+  to_data_object = FALSE
+)
+
+# Check reproducibility of sample assignment -----------------------------------
+
+# Create data assignment object without a fixed seed (no iteration_seed).
+experiment_data_assignment_random <- familiar::precompute_data_assignment(
+  data = data,
+  experimental_design = "bs(fs+mb,3)",
+  outcome_type = "binomial",
+  outcome_column = "outcome",
+  batch_id_column = "batch_id",
+  sample_id_column = "sample_id",
+  series_id_column = "series_id",
+  class_levels = c("red", "green"),
+  verbose = FALSE
+)
+
+# Create data assignment object with a fixed seed (iteration_seed = 19L).
+experiment_data_assignment_a <- familiar::precompute_data_assignment(
+  data = data,
+  experimental_design = "bs(fs+mb,3)",
+  outcome_type = "binomial",
+  outcome_column = "outcome",
+  batch_id_column = "batch_id",
+  sample_id_column = "sample_id",
+  series_id_column = "series_id",
+  class_levels = c("red", "green"),
+  iteration_seed = 19L,
+  verbose = FALSE
+)
+
+# Create a second data assignment object with the same fixed seed (19L).
+experiment_data_assignment_b <- familiar::precompute_data_assignment(
+  data = data,
+  experimental_design = "bs(fs+mb,3)",
+  outcome_type = "binomial",
+  outcome_column = "outcome",
+  batch_id_column = "batch_id",
+  sample_id_column = "sample_id",
+  series_id_column = "series_id",
+  class_levels = c("red", "green"),
+  iteration_seed = 19L,
+  verbose = FALSE
+)
+
+# Create data assignment object with a different fixed seed (20L).
+experiment_data_assignment_different <- familiar::precompute_data_assignment(
+  data = data,
+  experimental_design = "bs(fs+mb,3)",
+  outcome_type = "binomial",
+  outcome_column = "outcome",
+  batch_id_column = "batch_id",
+  sample_id_column = "sample_id",
+  series_id_column = "series_id",
+  class_levels = c("red", "green"),
+  iteration_seed = 20L,
+  verbose = FALSE
+)
+
+# Test that the same fixed seed leads to the same sample assignment.
+testthat::expect_equal(
+  experiment_data_assignment_a@iteration_list,
+  experiment_data_assignment_b@iteration_list,
+  ignore_attr = TRUE
+)
+
+# Test that omitting the seed leads to a different assignment than seed 19L.
+testthat::expect_false(identical(
+  experiment_data_assignment_random@iteration_list,
+  experiment_data_assignment_a@iteration_list
+))
+
+# Test that a different fixed seed leads to a different sample assignment.
+testthat::expect_false(identical(
+  experiment_data_assignment_different@iteration_list,
+  experiment_data_assignment_a@iteration_list
+))