From 6057bff58d7a95caa1678a19e5d47dd909132858 Mon Sep 17 00:00:00 2001
From: Alex Zwanenburg <alexander.zwanenburg@nct-dresden.de>
Date: Wed, 2 Oct 2024 17:44:12 +0200
Subject: [PATCH] WIP implementation and test for batch normality plausibility.

---
 R/BatchNormalisation.R                        | 76 +++++++++++++++++++
 R/DataPreProcessing.R                         |  6 ++
 .../test-batch_normalisation_checks.R         | 16 ++++
 3 files changed, 98 insertions(+)
 create mode 100644 tests/testthat/test-batch_normalisation_checks.R

diff --git a/R/BatchNormalisation.R b/R/BatchNormalisation.R
index 5fd65107..6df807aa 100644
--- a/R/BatchNormalisation.R
+++ b/R/BatchNormalisation.R
@@ -588,3 +588,79 @@ setMethod(
     "instance_mask" = instance_mask
   ))
 }
+
+
+
+.check_batch_normalisation_assumptions <- function(
+  data,
+  normalisation_method
+) {
+  # Check that batches do not differ in outcome data. We can use the following
+  # tests: 
+  #
+  # * Continuous: Kruskal-Wallis-test.
+  # * Binomial / multinomial: Chi-squared test.
+  # * Survival: Log-rank test.
+  
+  if (all(normalisation_method == "none")) return(invisible(TRUE))
+  
+  x <- data@data[, mget(get_outcome_columns(data))]
+  g <- data@data[[get_id_columns("batch", single_column = TRUE)]]
+  
+  # Determine the number of batches.
+  n_groups <- data.table::uniqueN(g)
+  # Check that 2 (or more groups) are present.
+  if (n_groups < 2L) return(invisible(TRUE))
+  
+  if (data@outcome_type == "continuous") {
+    h <- tryCatch(
+      stats::kruskal.test(x = x, g = g, na.action = "na.omit"),
+      error = identity
+    )
+    
+    # Check if the test statistic could be computed.
+    if (inherits(h, "error")) return(invisible(TRUE))
+    p_value <- h$p.value
+    
+  } else if (data@outcome_type %in% c("binomial", "multinomial")) {
+    h <- tryCatch(
+      stats::chisq.test(x = x, y = g),
+      error = identity
+    )
+    
+    # Check if the test statistic could be computed.
+    if (inherits(h, "error")) return(invisible(TRUE))
+    p_value <- h$p.value
+    
+  } else if (data@outcome_type == "survival") {
+    
+    # Determine chi-square of log-rank test
+    chi_sq <- tryCatch(
+      survival::survdiff(
+        survival::Surv(time = outcome_time, event = outcome_event) ~ group,
+        data = data.table::data.table(),
+        subset = NULL,
+        na.action = "na.omit"
+      )$chisq,
+      error = identity
+    )
+    
+    # Check if the test statistic could be computed. Causes could be lack of
+    # events, no events beyond the first time point, etc.
+    if (inherits(chi_sq, "error")) return(invisible(TRUE))
+    
+    # Derive  p-value
+    p_value  <- stats::pchisq(
+      q = chi_sq,
+      df = n_groups - 1L,
+      lower.tail = FALSE
+    )
+    
+    
+  } else {
+    ..error_outcome_type_not_implemented(data@outcome_type)
+  }
+  
+  
+  return(invisible(TRUE))
+}
diff --git a/R/DataPreProcessing.R b/R/DataPreProcessing.R
index 8390e07d..e0af9511 100644
--- a/R/DataPreProcessing.R
+++ b/R/DataPreProcessing.R
@@ -608,6 +608,12 @@ determine_preprocessing_parameters <- function(
     verbose = verbose && settings$prep$batch_normalisation_method != "none"
   )
   
+  # Check that assumptions for batch normalisation are fulfilled.
+  .check_batch_normalisation_assumptions(
+    data = data,
+    normalisation_method = settings$prep$batch_normalisation_method
+  )
+  
   # Add batch normalisation skeletons.
   feature_info_list <- create_batch_normalisation_parameter_skeleton(
     feature_info_list = feature_info_list,
diff --git a/tests/testthat/test-batch_normalisation_checks.R b/tests/testthat/test-batch_normalisation_checks.R
new file mode 100644
index 00000000..e3fe5b54
--- /dev/null
+++ b/tests/testthat/test-batch_normalisation_checks.R
@@ -0,0 +1,16 @@
+# Continuous outcomes
+
+# Generate data.
+data <- familiar:::test_create_synthetic_series_data(
+  outcome_type = "survival",
+  n_batch = 3L,
+  n_samples = 50L,
+  n_series = 1L,
+  n_rep = 1L
+)
+
+# Test results
+.check_batch_normalisation_assumptions(
+    data = data,
+    normalisation_method = "combat"
+)