Skip to content

Commit

Permalink
Added check on batch normalisation assumptions.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzwanenburg committed Oct 4, 2024
1 parent 6057bff commit 111dfd7
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 19 deletions.
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@

- Palettes from the `paletteer` package can now be used to customise plots.

- Plausibility of datasets is now checked more thoroughly to detect common issues:

- The presence of duplicate rows with the same feature values and outcome.

- The presence of one-to-one predictors of the outcome. This might be outcome-related columns that have been left in the data accidentally.

- The presence of invariant predictors.

- Statistical tests now assess differences between batches if batch normalisation is performed, and warn if the outcome in any batch is significantly different from the others.

## Bug fixes

- Fixed errors when creating feature or similarity plots caused by sample or feature names matching internal column names.
Expand Down
26 changes: 23 additions & 3 deletions R/BatchNormalisation.R
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,11 @@ setMethod(

if (data@outcome_type == "continuous") {
h <- tryCatch(
stats::kruskal.test(x = x, g = g, na.action = "na.omit"),
stats::kruskal.test(
x = x[[1L]],
g = g,
na.action = "na.omit"
),
error = identity
)

Expand All @@ -638,7 +642,11 @@ setMethod(
chi_sq <- tryCatch(
survival::survdiff(
survival::Surv(time = outcome_time, event = outcome_event) ~ group,
data = data.table::data.table(),
data = data.table::data.table(
outcome_time = x$outcome_time,
outcome_event = x$outcome_event,
group = g
),
subset = NULL,
na.action = "na.omit"
)$chisq,
Expand All @@ -656,11 +664,23 @@ setMethod(
lower.tail = FALSE
)


} else {
..error_outcome_type_not_implemented(data@outcome_type)
}

if (p_value < 0.05) {
logger_warning(
paste0(
"One or more batches have a statistically significant (p < 0.05) different outcome ",
"compared to other batches. Note: a statistically significant outcome does ",
"not mean that the difference is actually relevant. However, please assert ",
"that batch normalisation does not remove important differences between batches."
),
warn_class = "familiar_batch_outcome_difference"
)
return(invisible(FALSE))
}


return(invisible(TRUE))
}
2 changes: 1 addition & 1 deletion R/Familiar.R
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ summon_familiar <- function(
data = data,
settings = settings
)
browser()

# Load experiment data -------------------------------------------------------
if (!is.null(experiment_data)) {

Expand Down
56 changes: 41 additions & 15 deletions tests/testthat/test-batch_normalisation_checks.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,42 @@
# Continuous outcomes
# Avoid warnings due to non-standard evaluation in data.table.
outcome <- outcome_time <- NULL

# Generate data.
data <- familiar:::test_create_synthetic_series_data(
outcome_type = "survival",
n_batch = 3L,
n_samples = 50L,
n_series = 1L,
n_rep = 1L
)

# Test results
.check_batch_normalisation_assumptions(
data = data,
normalisation_method = "combat"
)
# For every outcome type, check that .check_batch_normalisation_assumptions
# raises no condition on a three-batch dataset built from three identical
# generator calls, and raises a warning of class
# "familiar_batch_outcome_difference" once the outcome of batch "A" is altered.
for (outcome_type in c("continuous", "binomial", "multinomial", "survival")) {
# Generate three datasets with the same generator call and label them as
# batches "A", "B" and "C". `:=` sets the batch_id column by reference.
# NOTE(review): presumably test_create_good_data returns comparable outcome
# distributions on every call -- confirm against the helper.
data <- familiar:::test_create_good_data(outcome_type = outcome_type)
data@data[, "batch_id" := "A"]
data_b <- familiar:::test_create_good_data(outcome_type = outcome_type)
data_b@data[, "batch_id" := "B"]
data_c <- familiar:::test_create_good_data(outcome_type = outcome_type)
data_c@data[, "batch_id" := "C"]
# Stack the three batches into a single table stored on the first object.
data@data <- rbind(data@data, data_b@data, data_c@data)

# Build a second dataset in which only batch "A" has a modified outcome.
data_offset <- data
# Copy the table explicitly so that the by-reference `:=` updates below do not
# also change the table held by `data`.
data_offset@data <- data.table::copy(data_offset@data)
if (outcome_type == "continuous") {
# Add a constant offset of 50 to the outcome of batch "A".
data_offset@data[batch_id == "A", "outcome" := outcome + 50.0]

} else if (outcome_type %in% c("binomial", "multinomial")) {
# Give every sample in batch "A" the same outcome value.
# NOTE(review): presumably "red" is a valid outcome level in the generated
# data -- confirm against test_create_good_data.
data_offset@data[batch_id == "A", "outcome" := "red"]

} else if (outcome_type == "survival") {
# Add 1000 to the outcome time of batch "A"; outcome_event is left unchanged.
data_offset@data[batch_id == "A", "outcome_time" := outcome_time + 1000.0]
}

# Unmodified batches: the check should not signal any condition.
testthat::expect_no_condition(
familiar:::.check_batch_normalisation_assumptions(
data = data,
normalisation_method = "combat"
)
)

# Modified batch "A": the check should signal the classed warning.
testthat::expect_warning(
familiar:::.check_batch_normalisation_assumptions(
data = data_offset,
normalisation_method = "combat"
),
class = "familiar_batch_outcome_difference"
)
}

0 comments on commit 111dfd7

Please sign in to comment.