WIP implementation and test for batch normalisation plausibility checks.

alexzwanenburg · Oct 2, 2024 · 6057bff · 6057bff
1 parent 04959f9
commit 6057bff
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 0 deletions.
diff --git a/R/BatchNormalisation.R b/R/BatchNormalisation.R
@@ -588,3 +588,79 @@ setMethod(
     "instance_mask" = instance_mask
   ))
 }
+
+
+
+# Check that the outcome does not differ between batches before batch
+# normalisation is applied.
+#
+# The test depends on the outcome type:
+#
+# * Continuous: Kruskal-Wallis test.
+# * Binomial / multinomial: Chi-squared test.
+# * Survival: Log-rank test.
+#
+# Arguments:
+#   data: dataset object with a `data` slot (data.table) and an `outcome_type`
+#     slot.
+#   normalisation_method: character vector of batch normalisation methods. The
+#     check is skipped when all are "none".
+#
+# Returns TRUE, invisibly. A warning is raised when the test indicates that
+# the outcome differs between batches. The check is skipped silently when
+# fewer than two batches are present, or when the test statistic cannot be
+# computed.
+.check_batch_normalisation_assumptions <- function(
+  data,
+  normalisation_method
+) {
+  # Nothing to check if no batch normalisation is performed.
+  if (all(normalisation_method == "none")) return(invisible(TRUE))
+
+  # Outcome columns (as data.table) and batch identifiers (as vector).
+  x <- data@data[, mget(get_outcome_columns(data))]
+  g <- data@data[[get_id_columns("batch", single_column = TRUE)]]
+
+  # Determine the number of batches.
+  n_groups <- data.table::uniqueN(g)
+
+  # Check that 2 (or more) groups are present.
+  if (n_groups < 2L) return(invisible(TRUE))
+
+  if (data@outcome_type == "continuous") {
+    # Pass the outcome as a vector. kruskal.test treats list-like input (such
+    # as a data.table) as a list of samples and ignores g, which would fail
+    # for a single outcome column. The default method removes missing values
+    # itself.
+    h <- tryCatch(
+      stats::kruskal.test(x = x[[1L]], g = g),
+      error = identity
+    )
+
+    # Check if the test statistic could be computed.
+    if (inherits(h, "error")) return(invisible(TRUE))
+    p_value <- h$p.value
+
+  } else if (data@outcome_type %in% c("binomial", "multinomial")) {
+    h <- tryCatch(
+      stats::chisq.test(x = x[[1L]], y = g),
+      error = identity
+    )
+
+    # Check if the test statistic could be computed.
+    if (inherits(h, "error")) return(invisible(TRUE))
+    p_value <- h$p.value
+
+  } else if (data@outcome_type == "survival") {
+    # Combine outcome columns and batch identifiers so that the formula below
+    # can be resolved. NOTE(review): assumes the outcome columns are named
+    # outcome_time and outcome_event -- confirm against get_outcome_columns.
+    survival_data <- data.table::data.table(x, "group" = g)
+
+    # Determine chi-square of log-rank test.
+    chi_sq <- tryCatch(
+      survival::survdiff(
+        survival::Surv(time = outcome_time, event = outcome_event) ~ group,
+        data = survival_data,
+        na.action = stats::na.omit
+      )$chisq,
+      error = identity
+    )
+
+    # Check if the test statistic could be computed. Causes could be lack of
+    # events, no events beyond the first time point, etc.
+    if (inherits(chi_sq, "error")) return(invisible(TRUE))
+
+    # Derive p-value.
+    p_value <- stats::pchisq(
+      q = chi_sq,
+      df = n_groups - 1L,
+      lower.tail = FALSE
+    )
+
+  } else {
+    ..error_outcome_type_not_implemented(data@outcome_type)
+
+    return(invisible(TRUE))
+  }
+
+  # Warn if the outcome differs between batches. Non-finite p-values (e.g.
+  # when all outcome values are identical) are not acted upon.
+  significance_level <- 0.05
+  if (length(p_value) == 1L && is.finite(p_value) &&
+      p_value < significance_level) {
+    warning(
+      paste0(
+        "The outcome differs between batches (p-value: ",
+        format(p_value, digits = 3L),
+        "). Batch normalisation may not be appropriate for this dataset."
+      ),
+      call. = FALSE
+    )
+  }
+
+  return(invisible(TRUE))
+}
diff --git a/R/DataPreProcessing.R b/R/DataPreProcessing.R
@@ -608,6 +608,12 @@ determine_preprocessing_parameters <- function(
     verbose = verbose && settings$prep$batch_normalisation_method != "none"
   )
 
+  # Check that assumptions for batch normalisation are fulfilled.
+  .check_batch_normalisation_assumptions(
+    data = data,
+    normalisation_method = settings$prep$batch_normalisation_method
+  )
+
   # Add batch normalisation skeletons.
   feature_info_list <- create_batch_normalisation_parameter_skeleton(
     feature_info_list = feature_info_list,

diff --git a/tests/testthat/test-batch_normalisation_checks.R b/tests/testthat/test-batch_normalisation_checks.R
@@ -0,0 +1,16 @@
+# Survival outcomes
+
+# Generate synthetic survival data distributed over three batches.
+data <- familiar:::test_create_synthetic_series_data(
+  outcome_type = "survival",
+  n_batch = 3L,
+  n_samples = 50L,
+  n_series = 1L,
+  n_rep = 1L
+)
+
+# The check should complete and return TRUE (invisibly) for survival data.
+testthat::test_that("batch normalisation assumptions can be checked", {
+  testthat::expect_true(
+    familiar:::.check_batch_normalisation_assumptions(
+      data = data,
+      normalisation_method = "combat"
+    )
+  )
+})