From ca63273512801460eb2acc453bd156aab5d259fb Mon Sep 17 00:00:00 2001 From: Alex Zwanenburg Date: Fri, 11 Oct 2024 17:30:17 +0200 Subject: [PATCH] Added iteration seed as configuration parameter. --- NEWS.md | 43 ++++++++++---- R/Iterations.R | 19 +++++-- R/ParseSettings.R | 17 ++++++ inst/config.xml | 2 + tests/testthat/test-iteration_seed.R | 84 ++++++++++++++++++++++++++++ 5 files changed, 150 insertions(+), 15 deletions(-) create mode 100644 tests/testthat/test-iteration_seed.R diff --git a/NEWS.md b/NEWS.md index f9c3c69e..f4b78344 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,39 +4,60 @@ - Some functionality was deprecated because of redundancy and stability issues: - - The `count` outcome type has been deprecated. `count` is a subset of `continuous` outcomes. Its previous implementation did not provide any benefits over `continuous`. + - The `count` outcome type has been deprecated. `count` is a subset of + `continuous` outcomes. Its previous implementation did not provide any + benefits over `continuous`. - - Gradient boosting using the `mboost` package was deprecated. Use `xgboost` instead. + - Gradient boosting using the `mboost` package was deprecated. Use `xgboost` + instead. - The `qvalue` package for computing q-values was deprecated. - - The `VGAM` package, which has been soft-deprecated since version 1.3.0, has now fully been deprecated. + - The `VGAM` package, which has been soft-deprecated since version 1.3.0, has + now fully been deprecated. - - The variable hunting feature selection method for random forests was removed due to stability issues in unit tests. + - The variable hunting feature selection method for random forests was removed + due to stability issues in unit tests. -- Many evaluation steps that only require model predictions can now be called externally by providing a `familiarDataElementPredictionTable` object that contains model prediction data. Such objects can be created using the `as_prediction_table` table. 
+- Many evaluation steps that only require model predictions can now be called + externally by providing a `familiarDataElementPredictionTable` object that + contains model prediction data. Such objects can be created using the + `as_prediction_table` function. ## Minor changes -- The `evaluation_elements` configuration parameter was added to allow for specifying which evaluation steps should be performed. +- The `iteration_seed` configuration parameter was added to provide a fixed seed + for the sampling algorithms that create the sample assignments of an + experiment, e.g. bootstraps and cross-validation folds. Providing a seed + allows for reproducing the sample division across different experiments. -- Concordance variable importance for categorical outcomes now relies on the internal implementation for the area under the receiver operating characteristic curve instead of the Gini measure from the `corelearn` package. +- The `evaluation_elements` configuration parameter was added to allow for + specifying which evaluation steps should be performed. + +- Concordance variable importance for categorical outcomes now relies on the + internal implementation for the area under the receiver operating + characteristic curve instead of the Gini measure from the `corelearn` package. - Palettes from the `paletteer` package can now be used to customise plots. -- Plausibility of datasets is now checked more thoroughly to detect common issues: +- Plausibility of datasets is now checked more thoroughly to detect common + issues: - The presence of duplicate rows with the same feature values and outcome. - - The presence of one-to-one predictors of the outcome. This might be outcome-related columns that have been left in the data accidentally. + - The presence of one-to-one predictors of the outcome. This might be + outcome-related columns that have been left in the data accidentally. - The presence of invariant predictors. 
-- Statistical tests now assess differences between batches if batch normalisation is performed, and warns if the outcome in any batch is significantly different from others. +- Statistical tests now assess differences between batches if batch + normalisation is performed, and warn if the outcome in any batch is + significantly different from others. ## Bug fixes -- Fixed errors when creating feature or similarity plots caused by sample or feature names matching internal column names. +- Fixed errors when creating feature or similarity plots caused by sample or + feature names matching internal column names. - The `sample_similarity` evaluation element is now mentioned in the documentation. diff --git a/R/Iterations.R b/R/Iterations.R index 9caea3ef..1b0e731c 100644 --- a/R/Iterations.R +++ b/R/Iterations.R @@ -262,6 +262,13 @@ id_columns <- get_id_columns(id_depth = "series") batch_id_column <- get_id_columns(id_depth = "batch") + # Start random stream + if (is.null(settings$data$iteration_seed)) { + rstream_object <- NULL + } else { + rstream_object <- .start_random_number_stream(seed = settings$data$iteration_seed) + } + # Generate new iterations ---------------------------------------------------- if (is.null(iteration_list)) { @@ -401,7 +408,8 @@ train_or_validate = "train" ), settings = settings, - data = data + data = data, + rstream_object = rstream_object ) # Append runs to the run list @@ -441,7 +449,8 @@ ), n_iter = n_iter, settings = settings, - data = data + data = data, + rstream_object = rstream_object ) # Append runs to the run list @@ -481,7 +490,8 @@ n_rep = n_rep, n_folds = n_folds, settings = settings, - data = data + data = data, + rstream_object = rstream_object ) # Append runs to the run list @@ -518,7 +528,8 @@ cv_iter_list <- .create_loocv( sample_identifiers = sample_identifiers, settings = settings, - data = data + data = data, + rstream_object = rstream_object ) # Append runs to the run list diff --git a/R/ParseSettings.R 
b/R/ParseSettings.R index d21e9df4..a3282f59 100644 --- a/R/ParseSettings.R +++ b/R/ParseSettings.R @@ -746,6 +746,10 @@ #' @param imbalance_n_partitions (*optional*) Number of times random #' undersampling should be repeated. 10 undersampled subsets with balanced #' classes are formed by default. +#' @param iteration_seed (*optional*) Integer seed used in sampling algorithms +#' specified by the `experimental_design` argument. This allows for creating +#' the same sample assignments across different experiments -- of course +#' provided that the same dataset is used. By default a random seed is used. #' @param ... Unused arguments. #' #' @return List of parameters related to data parsing and the experiment. @@ -773,6 +777,7 @@ experimental_design = waiver(), imbalance_correction_method = waiver(), imbalance_n_partitions = waiver(), + iteration_seed = waiver(), ... ) { settings <- list() @@ -821,6 +826,18 @@ range = c(1L, Inf) ) + # iteration_seed ------------------------------------------------------------- + # Seed for sampling algorithms. + settings$iteration_seed <- .parse_arg( + x_config = config$iteration_seed, + x_var = iteration_seed, + var_name = "iteration_seed", + type = "integer", + optional = TRUE, + default = NULL + ) + + # sample_id_column ----------------------------------------------------------- # Sample identifier column settings$sample_col <- .parse_arg( diff --git a/inst/config.xml b/inst/config.xml index 5b2b67de..ace5de60 100644 --- a/inst/config.xml +++ b/inst/config.xml @@ -17,6 +17,8 @@ + + diff --git a/tests/testthat/test-iteration_seed.R b/tests/testthat/test-iteration_seed.R new file mode 100644 index 00000000..bd9a0679 --- /dev/null +++ b/tests/testthat/test-iteration_seed.R @@ -0,0 +1,84 @@ +# Don't perform any further tests on CRAN due to running time. +testthat::skip_on_cran() + +# Create data.table. 
+data <- familiar:::test_create_good_data( + outcome_type = "binomial", + to_data_object = FALSE +) + +# Check reproducibility of sample assignment ----------------------------------- + +# Create data assignment object without fixed seed. +experiment_data_assignment_random <- familiar::precompute_data_assignment( + data = data, + experimental_design = "bs(fs+mb,3)", + outcome_type = "binomial", + outcome_column = "outcome", + batch_id_column = "batch_id", + sample_id_column = "sample_id", + series_id_column = "series_id", + class_levels = c("red", "green"), + verbose = FALSE +) + +# Create data assignment object with fixed seed. +experiment_data_assignment_a <- familiar::precompute_data_assignment( + data = data, + experimental_design = "bs(fs+mb,3)", + outcome_type = "binomial", + outcome_column = "outcome", + batch_id_column = "batch_id", + sample_id_column = "sample_id", + series_id_column = "series_id", + class_levels = c("red", "green"), + iteration_seed = 19L, + verbose = FALSE +) + +# Create a second data assignment object with the same fixed seed. +experiment_data_assignment_b <- familiar::precompute_data_assignment( + data = data, + experimental_design = "bs(fs+mb,3)", + outcome_type = "binomial", + outcome_column = "outcome", + batch_id_column = "batch_id", + sample_id_column = "sample_id", + series_id_column = "series_id", + class_levels = c("red", "green"), + iteration_seed = 19L, + verbose = FALSE +) + +# Create data assignment object with a different fixed seed. +experiment_data_assignment_different <- familiar::precompute_data_assignment( + data = data, + experimental_design = "bs(fs+mb,3)", + outcome_type = "binomial", + outcome_column = "outcome", + batch_id_column = "batch_id", + sample_id_column = "sample_id", + series_id_column = "series_id", + class_levels = c("red", "green"), + iteration_seed = 20L, + verbose = FALSE +) + +# Test that the same fixed seed leads to the same sample assignment. 
+testthat::expect_equal( + experiment_data_assignment_a@iteration_list, + experiment_data_assignment_b@iteration_list, + ignore_attr = TRUE +) + +# Test that the random seed leads to a different sample assignment. +testthat::expect_false(identical( + experiment_data_assignment_random@iteration_list, + experiment_data_assignment_a@iteration_list +)) + +# Test that a different fixed seed leads to a different sample assignment. +testthat::expect_false(identical( + experiment_data_assignment_different@iteration_list, + experiment_data_assignment_a@iteration_list +))