From 111dfd73535b72d02a89269a4870f841cf3542f9 Mon Sep 17 00:00:00 2001 From: Alex Zwanenburg Date: Fri, 4 Oct 2024 17:22:01 +0200 Subject: [PATCH] Added check on batch normalisation assumptions. --- NEWS.md | 10 ++++ R/BatchNormalisation.R | 26 ++++++++- R/Familiar.R | 2 +- .../test-batch_normalisation_checks.R | 56 ++++++++++++++----- 4 files changed, 75 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index d565a02d..f9c3c69e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,16 @@ - Palettes from the `paletteer` package can now be used to customise plots. +- Plausibility of datasets is now checked more thoroughly to detect common issues: + + - The presence of duplicate rows with the same feature values and outcome. + + - The presence of one-to-one predictors of the outcome. This might be outcome-related columns that have been left in the data accidentally. + + - The presence of invariant predictors. + +- Statistical tests now assess differences between batches if batch normalisation is performed, and warn if the outcome in any batch is significantly different from the others. + ## Bug fixes - Fixed errors when creating feature or similarity plots caused by sample or feature names matching internal column names. 
diff --git a/R/BatchNormalisation.R b/R/BatchNormalisation.R index 6df807aa..8a122ab6 100644 --- a/R/BatchNormalisation.R +++ b/R/BatchNormalisation.R @@ -614,7 +614,11 @@ setMethod( if (data@outcome_type == "continuous") { h <- tryCatch( - stats::kruskal.test(x = x, g = g, na.action = "na.omit"), + stats::kruskal.test( + x = x[[1L]], + g = g, + na.action = "na.omit" + ), error = identity ) @@ -638,7 +642,11 @@ setMethod( chi_sq <- tryCatch( survival::survdiff( survival::Surv(time = outcome_time, event = outcome_event) ~ group, - data = data.table::data.table(), + data = data.table::data.table( + outcome_time = x$outcome_time, + outcome_event = x$outcome_event, + group = g + ), subset = NULL, na.action = "na.omit" )$chisq, @@ -656,11 +664,23 @@ setMethod( lower.tail = FALSE ) - } else { ..error_outcome_type_not_implemented(data@outcome_type) } + if (p_value < 0.05) { + logger_warning( + paste0( + "One or more batches have a statistically significant (p < 0.05) different outcome ", + "compared to other batches. Note: a statistically significant outcome does ", + "not mean that the difference is actually relevant. However, please assert ", + "that batch normalisation does not remove important differences between batches." 
+ ), + warn_class = "familiar_batch_outcome_difference" + ) + return(invisible(FALSE)) + } + return(invisible(TRUE)) } diff --git a/R/Familiar.R b/R/Familiar.R index 63354065..76a9cfdf 100644 --- a/R/Familiar.R +++ b/R/Familiar.R @@ -214,7 +214,7 @@ summon_familiar <- function( data = data, settings = settings ) - browser() + # Load experiment data ------------------------------------------------------- if (!is.null(experiment_data)) { diff --git a/tests/testthat/test-batch_normalisation_checks.R b/tests/testthat/test-batch_normalisation_checks.R index e3fe5b54..f16b91dd 100644 --- a/tests/testthat/test-batch_normalisation_checks.R +++ b/tests/testthat/test-batch_normalisation_checks.R @@ -1,16 +1,42 @@ -# Continuous outcomes +# Avoid warnings due to non-standard evaluation in data.table. +outcome <- outcome_time <- NULL -# Generate data. -data <- familiar:::test_create_synthetic_series_data( - outcome_type = "survival", - n_batch = 3L, - n_samples = 50L, - n_series = 1L, - n_rep = 1L -) - -# Test results -.check_batch_normalisation_assumptions( - data = data, - normalisation_method = "combat" -) +for (outcome_type in c("continuous", "binomial", "multinomial", "survival")) { + # Generate data. + data <- familiar:::test_create_good_data(outcome_type = outcome_type) + data@data[, "batch_id" := "A"] + data_b <- familiar:::test_create_good_data(outcome_type = outcome_type) + data_b@data[, "batch_id" := "B"] + data_c <- familiar:::test_create_good_data(outcome_type = outcome_type) + data_c@data[, "batch_id" := "C"] + data@data <- rbind(data@data, data_b@data, data_c@data) + + # Introduce data with offset. 
+ data_offset <- data + data_offset@data <- data.table::copy(data_offset@data) + if (outcome_type == "continuous") { + data_offset@data[batch_id == "A", "outcome" := outcome + 50.0] + + } else if (outcome_type %in% c("binomial", "multinomial")) { + data_offset@data[batch_id == "A", "outcome" := "red"] + + } else if (outcome_type == "survival") { + data_offset@data[batch_id == "A", "outcome_time" := outcome_time + 1000.0] + } + + # Test results + testthat::expect_no_condition( + familiar:::.check_batch_normalisation_assumptions( + data = data, + normalisation_method = "combat" + ) + ) + + testthat::expect_warning( + familiar:::.check_batch_normalisation_assumptions( + data = data_offset, + normalisation_method = "combat" + ), + class = "familiar_batch_outcome_difference" + ) +}