Skip to content

Commit

Permalink
WIP implementation and test for batch normality plausibility.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzwanenburg committed Oct 2, 2024
1 parent 04959f9 commit 6057bff
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 0 deletions.
76 changes: 76 additions & 0 deletions R/BatchNormalisation.R
Original file line number Diff line number Diff line change
Expand Up @@ -588,3 +588,79 @@ setMethod(
"instance_mask" = instance_mask
))
}



.check_batch_normalisation_assumptions <- function(
    data,
    normalisation_method
) {
  # Check that batches do not differ in outcome data, and warn if they do.
  #
  # The following tests are used, depending on the outcome type of the data:
  #
  # * Continuous: Kruskal-Wallis-test.
  # * Binomial / multinomial: Chi-squared test.
  # * Survival: Log-rank test.
  #
  # Arguments:
  # * data: object with a `data` slot (a data.table that holds the outcome
  #   and batch identifier columns) and an `outcome_type` slot.
  # * normalisation_method: character vector with batch normalisation
  #   method(s). The check is skipped if all methods are "none".
  #
  # Returns TRUE, invisibly. The check is advisory: a warning is raised if the
  # outcome differs between batches at the 0.05 significance level. If a test
  # cannot be computed, the check passes silently.

  # Nothing to check if batch normalisation is not performed.
  if (all(normalisation_method == "none")) return(invisible(TRUE))

  x <- data@data[, mget(get_outcome_columns(data))]
  g <- data@data[[get_id_columns("batch", single_column = TRUE)]]

  # Determine the number of batches.
  n_groups <- data.table::uniqueN(g)
  # Check that 2 (or more groups) are present.
  if (n_groups < 2L) return(invisible(TRUE))

  p_value <- NA_real_

  if (data@outcome_type == "continuous") {
    # Pass the outcome as a vector. A data.table is a list, which
    # kruskal.test would interpret as a list of groups, ignoring g. The
    # default method removes incomplete cases itself.
    h <- tryCatch(
      stats::kruskal.test(x = x[[1L]], g = g),
      error = identity
    )

    # Check if the test statistic could be computed.
    if (inherits(h, "error")) return(invisible(TRUE))
    p_value <- h$p.value

  } else if (data@outcome_type %in% c("binomial", "multinomial")) {
    h <- tryCatch(
      stats::chisq.test(x = x[[1L]], y = g),
      error = identity
    )

    # Check if the test statistic could be computed.
    if (inherits(h, "error")) return(invisible(TRUE))
    p_value <- h$p.value

  } else if (data@outcome_type == "survival") {
    # Combine outcome columns and batch identifiers for the log-rank test.
    # NOTE(review): assumes that the outcome columns are named outcome_time
    # and outcome_event, as referenced in the formula below -- confirm
    # against get_outcome_columns.
    test_data <- data.table::data.table(x, "group" = g)

    # Determine chi-square of log-rank test
    chi_sq <- tryCatch(
      survival::survdiff(
        survival::Surv(time = outcome_time, event = outcome_event) ~ group,
        data = test_data,
        na.action = "na.omit"
      )$chisq,
      error = identity
    )

    # Check if the test statistic could be computed. Causes could be lack of
    # events, no events beyond the first time point, etc.
    if (inherits(chi_sq, "error")) return(invisible(TRUE))

    # Derive p-value
    p_value <- stats::pchisq(
      q = chi_sq,
      df = n_groups - 1L,
      lower.tail = FALSE
    )

  } else {
    ..error_outcome_type_not_implemented(data@outcome_type)
  }

  # A missing or non-finite p-value means that the test was not informative.
  if (length(p_value) != 1L || !is.finite(p_value)) {
    return(invisible(TRUE))
  }

  if (p_value < 0.05) {
    warning(
      paste0(
        "The outcome differs between batches (p = ",
        signif(p_value, 3L),
        "), whereas batch normalisation assumes that batches do not differ ",
        "in outcome data."
      ),
      call. = FALSE
    )
  }

  return(invisible(TRUE))
}
6 changes: 6 additions & 0 deletions R/DataPreProcessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,12 @@ determine_preprocessing_parameters <- function(
verbose = verbose && settings$prep$batch_normalisation_method != "none"
)

# Check that assumptions for batch normalisation are fulfilled.
.check_batch_normalisation_assumptions(
data = data,
normalisation_method = settings$prep$batch_normalisation_method
)

# Add batch normalisation skeletons.
feature_info_list <- create_batch_normalisation_parameter_skeleton(
feature_info_list = feature_info_list,
Expand Down
16 changes: 16 additions & 0 deletions tests/testthat/test-batch_normalisation_checks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Survival outcomes ------------------------------------------------------------

testthat::test_that(
  "batch normalisation assumptions can be checked for survival outcomes",
  {
    # Generate data with multiple batches, so that the check is not skipped
    # for lack of groups.
    data <- familiar:::test_create_synthetic_series_data(
      outcome_type = "survival",
      n_batch = 3L,
      n_samples = 50L,
      n_series = 1L,
      n_rep = 1L
    )

    # The check is advisory and should return TRUE (invisibly). The function
    # is internal to the package, and is therefore accessed using `:::`, like
    # the data generator above.
    testthat::expect_true(
      familiar:::.check_batch_normalisation_assumptions(
        data = data,
        normalisation_method = "combat"
      )
    )
  }
)

0 comments on commit 6057bff

Please sign in to comment.