From 6057bff58d7a95caa1678a19e5d47dd909132858 Mon Sep 17 00:00:00 2001 From: Alex Zwanenburg Date: Wed, 2 Oct 2024 17:44:12 +0200 Subject: [PATCH] WIP implementation and test for batch normality plausibility. --- R/BatchNormalisation.R | 76 +++++++++++++++++++ R/DataPreProcessing.R | 6 ++ .../test-batch_normalisation_checks.R | 16 ++++ 3 files changed, 98 insertions(+) create mode 100644 tests/testthat/test-batch_normalisation_checks.R diff --git a/R/BatchNormalisation.R b/R/BatchNormalisation.R index 5fd65107..6df807aa 100644 --- a/R/BatchNormalisation.R +++ b/R/BatchNormalisation.R @@ -588,3 +588,79 @@ setMethod( "instance_mask" = instance_mask )) } + + + +.check_batch_normalisation_assumptions <- function( + data, + normalisation_method +) { + # Check that batches do not differ in outcome data. We can use the following + # tests: + # + # * Continuous: Kruskal-Wallis-test. + # * Binomial / multinomial: Chi-squared test. + # * Survival: Log-rank test. + + if (all(normalisation_method == "none")) return(invisible(TRUE)) + + x <- data@data[, mget(get_outcome_columns(data))] + g <- data@data[[get_id_columns("batch", single_column = TRUE)]] + + # Determine the number of batches. + n_groups <- data.table::uniqueN(g) + # Check that 2 (or more groups) are present. + if (n_groups < 2L) return(invisible(TRUE)) + + if (data@outcome_type == "continuous") { + h <- tryCatch( + stats::kruskal.test(x = x, g = g, na.action = "na.omit"), + error = identity + ) + + # Check if the test statistic could be computed. + if (inherits(h, "error")) return(invisible(TRUE)) + p_value <- h$p.value + + } else if (data@outcome_type %in% c("binomial", "multinomial")) { + h <- tryCatch( + stats::chisq.test(x = x, y = g), + error = identity + ) + + # Check if the test statistic could be computed. + if (inherits(h, "error")) return(invisible(TRUE)) + p_value <- h$p.value + + } else if (data@outcome_type == "survival") { + + # Determine chi-square of log-rank test + chi_sq <- tryCatch( + survival::survdiff( + survival::Surv(time = outcome_time, event = outcome_event) ~ group, + data = data.table::data.table(), + subset = NULL, + na.action = "na.omit" + )$chisq, + error = identity + ) + + # Check if the test statistic could be computed. Causes could be lack of + # events, no events beyond the first time point, etc. + if (inherits(chi_sq, "error")) return(invisible(TRUE)) + + # Derive p-value + p_value <- stats::pchisq( + q = chi_sq, + df = n_groups - 1L, + lower.tail = FALSE + ) + + + } else { + ..error_outcome_type_not_implemented(data@outcome_type) + } + + + return(invisible(TRUE)) +} diff --git a/R/DataPreProcessing.R b/R/DataPreProcessing.R index 8390e07d..e0af9511 100644 --- a/R/DataPreProcessing.R +++ b/R/DataPreProcessing.R @@ -608,6 +608,12 @@ determine_preprocessing_parameters <- function( verbose = verbose && settings$prep$batch_normalisation_method != "none" ) + # Check that assumptions for batch normalisation are fulfilled. + .check_batch_normalisation_assumptions( + data = data, + normalisation_method = settings$prep$batch_normalisation_method + ) + # Add batch normalisation skeletons. feature_info_list <- create_batch_normalisation_parameter_skeleton( feature_info_list = feature_info_list, diff --git a/tests/testthat/test-batch_normalisation_checks.R b/tests/testthat/test-batch_normalisation_checks.R new file mode 100644 index 00000000..e3fe5b54 --- /dev/null +++ b/tests/testthat/test-batch_normalisation_checks.R @@ -0,0 +1,16 @@ +# Continuous outcomes + +# Generate data. +data <- familiar:::test_create_synthetic_series_data( + outcome_type = "survival", + n_batch = 3L, + n_samples = 50L, + n_series = 1L, + n_rep = 1L +) + +# Test results +.check_batch_normalisation_assumptions( + data = data, + normalisation_method = "combat" +)