From f69703bcfbc0531d9aa5a5730c6535bdec938d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Fri, 13 Dec 2024 19:45:45 +0000 Subject: [PATCH 01/10] init script --- .../validate_model/input_test/contrasts.yml | 9 ++ .../input_test/validate_model.R | 112 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 modules/local/validate_model/input_test/contrasts.yml create mode 100644 modules/local/validate_model/input_test/validate_model.R diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validate_model/input_test/contrasts.yml new file mode 100644 index 00000000..3bfa9350 --- /dev/null +++ b/modules/local/validate_model/input_test/contrasts.yml @@ -0,0 +1,9 @@ +models: + - formula: "~ treatment" + contrasts: + - id: treatment_mCherry_hND6_ + comparison: ["treatment", "mCherry", "hND6"] + + - id: treatment_mCherry_hND6_sample_number + comparison: ["treatment", "mCherry", "hND6"] + blocking_factors: ["sample_number"] diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validate_model/input_test/validate_model.R new file mode 100644 index 00000000..4ff1207f --- /dev/null +++ b/modules/local/validate_model/input_test/validate_model.R @@ -0,0 +1,112 @@ +#!/usr/bin/env Rscript + +## TOD: add make contrast arguments + +# Load required libraries +suppressWarnings(suppressMessages({ + library(optparse) + library(yaml) + library(readr) + library(dplyr) +})) + +# Define command-line arguments +option_list <- list( + make_option(c("-m", "--models"), type = "character", default = NULL, + help = "Path to the models.yml file", metavar = "character"), + make_option(c("-s", "--samplesheet"), type = "character", default = NULL, + help = "Path to the samplesheet CSV file", metavar = "character") +) + +# Parse command-line arguments +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +# Validate input arguments +if (is.null(opt$models) || is.null(opt$samplesheet)) { + stop("Both --models and --samplesheet arguments are required.", call. = FALSE) +} + +# Load models.yml file +tryCatch({ + models <- yaml.load_file(opt$models) + cat("Loaded models.yml successfully.\n") +}, error = function(e) { + stop("Error loading models.yml: ", e$message) +}) + +# Load samplesheet CSV file +tryCatch({ + samplesheet <- read_csv(opt$samplesheet, show_col_types = FALSE) + cat("Loaded samplesheet CSV successfully.\n") +}, error = function(e) { + stop("Error loading samplesheet CSV: ", e$message) +}) + +# --- Step 1: Collect all variables from contrasts in the YAML --- +cat("\n--- Collecting Variables from Models ---\n") +variables <- unique(unlist(lapply(models$models, function(model) { + unlist(lapply(model$contrasts, function(contrast) contrast$comparison)) +}))) + +cat("Variables in contrasts:", paste(variables, collapse = ", "), "\n") + +# --- Step 2: Validate these variables are present as columns in the samplesheet --- +cat("\n--- Validating Variables in Samplesheet ---\n") +missing_columns <- setdiff(variables, colnames(samplesheet)) + +if (length(missing_columns) > 0) { + stop("The following required columns are missing in the samplesheet: ", + paste(missing_columns, collapse = ", ")) +} else { + cat("All required variables are present in the samplesheet.\n") +} + +# --- Step 3: Validate components in samplesheet columns --- +cat("\n--- Validating Column Components ---\n") +for (variable in variables) { + expected_values <- unique(unlist(lapply(models$models, function(model) { + unlist(lapply(model$contrasts, function(contrast) { + if (variable %in% contrast$comparison) return(contrast$comparison) + })) + }))) + expected_values <- expected_values[!is.na(expected_values)] # Remove NAs + + actual_values <- unique(samplesheet[[variable]]) + if (!all(actual_values %in% expected_values)) { + cat("Column:", variable, "\n") + cat("Expected values:", paste(expected_values, collapse = ", "), "\n") + cat("Actual values:", paste(actual_values, collapse = ", "), "\n") + stop("Mismatch in expected and actual values for column: ", variable) + } else { + cat("Column '", variable, "' has valid components.\n", sep = "") + } +} + +# --- Step 4: Validate column names for recommended characters --- +cat("\n--- Checking Column Names for Recommended Characters ---\n") +invalid_columns <- grep("[^a-zA-Z0-9_]", colnames(samplesheet), value = TRUE) + +if (length(invalid_columns) > 0) { + stop("The following column names contain non-recommended characters: ", + paste(invalid_columns, collapse = ", ")) +} else { + cat("All column names are valid.\n") +} + +# --- Step 5: Validate column contents for recommended characters and NAs --- +cat("\n--- Checking Column Contents for Recommended Characters and NAs ---\n") +for (variable in variables) { + invalid_values <- grep("[^a-zA-Z0-9_]", samplesheet[[variable]], value = TRUE) + if (any(is.na(samplesheet[[variable]]))) { + stop("Column '", variable, "' contains missing values (NA).") + } + if (length(invalid_values) > 0) { + stop("Column '", variable, "' contains invalid values: ", + paste(invalid_values, collapse = ", ")) + } else { + cat("Column '", variable, "' contents are valid.\n") + } +} + +cat("\nValidation completed successfully.\n") From 352a6fcb177370d82c44ad036d5005552b6415e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Mon, 16 Dec 2024 20:15:18 +0000 Subject: [PATCH 02/10] add function to check samplesheet --- .../validate_model/input_test/contrasts.yml | 9 +- .../input_test/validate_model.R | 231 ++++++++++++++++-- 2 files changed, 220 insertions(+), 20 deletions(-) diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validate_model/input_test/contrasts.yml index 3bfa9350..b362c14e 100644 --- a/modules/local/validate_model/input_test/contrasts.yml +++ b/modules/local/validate_model/input_test/contrasts.yml @@ -1,9 +1,12 @@ models: - formula: "~ treatment" contrasts: - - id: treatment_mCherry_hND6_ + - id: "treatment_mCherry_hND6" comparison: ["treatment", "mCherry", "hND6"] - - id: treatment_mCherry_hND6_sample_number + - id: "treatment_mCherry_hND6_sample_number" comparison: ["treatment", "mCherry", "hND6"] - blocking_factors: ["sample_number"] + blocking_factors: ["sample_number23"] + + - id: "treatment234" + comparison: ["treatment", "GFP", "hND1000"] diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validate_model/input_test/validate_model.R index 4ff1207f..2aef6385 100644 --- a/modules/local/validate_model/input_test/validate_model.R +++ b/modules/local/validate_model/input_test/validate_model.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript -## TOD: add make contrast arguments +## TODO: add make contrast arguments # Load required libraries suppressWarnings(suppressMessages({ @@ -12,10 +12,12 @@ suppressWarnings(suppressMessages({ # Define command-line arguments option_list <- list( - make_option(c("-m", "--models"), type = "character", default = NULL, + make_option(c("-y", "--yml"), type = "character", default = NULL, help = "Path to the models.yml file", metavar = "character"), make_option(c("-s", "--samplesheet"), type = "character", default = NULL, - help = "Path to the samplesheet CSV file", metavar = "character") + help = "Path to the samplesheet CSV file", metavar = "character"), + make_option(c("-i", "--sample"), type = "character", default = "sample", + help = "Column that contains sample identificators", metavar = "character") ) # Parse command-line arguments @@ -23,13 +25,18 @@ opt_parser <- OptionParser(option_list = option_list) opt <- parse_args(opt_parser) # Validate input arguments -if (is.null(opt$models) || is.null(opt$samplesheet)) { - stop("Both --models and --samplesheet arguments are required.", call. = FALSE) +if (is.null(opt$models) || is.null(opt$samplesheet) ) { + stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE) } +## Collect parameters +path_yml <- "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #opt$models +path_samplesheet <- "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #opt$samplesheet +sample_column <- "sample" #opt$sample + # Load models.yml file tryCatch({ - models <- yaml.load_file(opt$models) + models <- read_yaml(path_yml) cat("Loaded models.yml successfully.\n") }, error = function(e) { stop("Error loading models.yml: ", e$message) @@ -37,21 +44,211 @@ tryCatch({ # Load samplesheet CSV file tryCatch({ - samplesheet <- read_csv(opt$samplesheet, show_col_types = FALSE) - cat("Loaded samplesheet CSV successfully.\n") + samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) + cat("Loaded samplesheet successfully.\n") }, error = function(e) { stop("Error loading samplesheet CSV: ", e$message) }) -# --- Step 1: Collect all variables from contrasts in the YAML --- -cat("\n--- Collecting Variables from Models ---\n") -variables <- unique(unlist(lapply(models$models, function(model) { - unlist(lapply(model$contrasts, function(contrast) contrast$comparison)) -}))) +## Collect all variables and factors from the yml file +var <- list(); for (FORMULA in models$models) { + ## Populate list + for (CONTRAST in FORMULA$contrasts) { + ## Get column name: first comparison's element + name <- CONTRAST$comparison[1] + + ## Get factors + variables <- CONTRAST$comparison[2:3] + + ## If the column is already present in the list, add more components to it + if (name %in% names(var) ) { + variables <- unique( c(var[[ name ]], CONTRAST$comparison[2:3] )) + var[[ name ]] <- variables + cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "") + ## Default to new variable + } else { + cat("Detected variable:", name, "\n") + var[[ length(var) + 1 ]] <- variables + names(var)[length(var) ] <- name + } + + ## Get blocking factors + blocking <- c() + if ( !is.null(CONTRAST$blocking_factors) ) { + cat("Blocking factors detected for variable", name, "\n") + blocking <- CONTRAST$blocking_factors + cat(paste(blocking, collapse = ""), "\n") + } + + ## Add blockig variables + if ( "blocking_factors" %in% names(var) ) { + var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking )) + } else { + var[[ "blocking_factors" ]] <- unique( blocking ) + } + } +} + +## Print explicit message +for (INDEX in 1:length(var)) { + cat("Detected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\n", sep = "") +} + + +## Function to validate the samplesheet against the yml file +validate_model <- function(variables, samplesheet) { # variables: list; samplesheet: df + + variables <- var + samplesheet <- samplesheet + + undesired_chars <- "[^a-zA-Z0-9_]" + ####################################################### + ## + ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet. + ## + ## The function evaluates that: + ## * Every variable is present in the sample sheet. + ## * Every variable has valid name + ## * There are no NAs + ## * All expected values are present + ## * All samples contains a valid level. + ## + ## Finally, the function returns + ## * An error vector containing all errors found. + ## * A warnings vector + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ## Initialize errors vector + errors <- c() + + ## Initialize warning errors + warnings <- c() + + # Initialize vector to report continuos variables + continuous <- c() + + ## Check samplesheet names for invalid characters + df_colnames <- names(samplesheet) + true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars) + + if ( sum(true_columnms) > 0 ) { + errors <- c(errors, + paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " ")) + ) + } + + ## Check that blocking variables exists and do not contain NAs, if they were specified + blocking_factors <- c() + if ( !is.null(variables$blocking_factors) ) { + blocking_factors <- variables$blocking_factors + + for (VARIABLE in blocking_factors) { + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + ## Check if there are NAs + na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) ) + if ( na_rows > 0 ) { + errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " "))) + } + } else { + errors <- c( + errors, + paste0("Blocking factor ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- c(continuous, VARIABLE) + } else { + paste(VARIABLE, "not continuous") + } + } + } + + ## Check variables and levels specified + for (VARIABLE in names(variables)[ names(variables) != "blocking_factors" ] ) { ## Exclude blocking factors, they cannot be treated equally here + + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- unique(c(continuous, VARIABLE)) + } else { + paste(VARIABLE, "is not continuous") + } + + ## Check if there are NAs + na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) ) + if ( na_rows > 0 ) { + errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " "))) + } + + ## Check that the column data does not contain undesidered characters + if (stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) { + errors <- c(errors, + paste0("Column ", VARIABLE, " contains undesired characters\n") + ) + } + + ## Extract the expected levels according to the YML file + expected_factors_levels <- c( rep(NA, length(variables[[ VARIABLE ]])) ) + + ## Assign names + names(expected_factors_levels) <- variables[[ VARIABLE ]] + + ## Extract the levels from the sample sheet + obtained_factors_levels <- unique(samplesheet[[ VARIABLE ]]) + + ## Check that all expected levels for this variable are effectively present + for (LEVEL in names(expected_factors_levels) ) { + expected_factors_levels[ LEVEL ] <- LEVEL %in% obtained_factors_levels + } + + if( !all(expected_factors_levels) ) { + errors <- c( + errors, + paste0( + "Missing factor levels for variable '", VARIABLE, "'. ", + "Present levels: ", paste( names( expected_factors_levels[ expected_factors_levels ]), collapse = " " ), ". ", + "Missing levels: ", paste( names( expected_factors_levels[!expected_factors_levels ]), collapse = " " ) + ) + ) + } + + ## Check whether there are levels in the table that are not present in the models definitions + if (!all( obtained_factors_levels[!is.na(obtained_factors_levels)] %in% names(expected_factors_levels)) ) { + warnings <- c(warnings, + paste0( "The following labels are present in the samplesheet but are not part of the models definition and will be excluded: ", paste( obtained_factors_levels[!obtained_factors_levels %in% names(expected_factors_levels) ], collapse = " " ) )) + } + + ## Report that a column is not even present in the table + } else { + errors <- c( + errors, + paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + + ## Check that the column name is valid + ## Check that the column content is valid -> done + ## Check that the column does not have NAs -> Done + ## Check that all factors exists -> Done + } + + if ( !is.null(continuous)) { + warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) + } + return(list( 'errors' = errors, 'warnings' = warnings) ) + +} + +errors_warnings <- validate_model(var, samplesheet) -cat("Variables in contrasts:", paste(variables, collapse = ", "), "\n") -# --- Step 2: Validate these variables are present as columns in the samplesheet --- +# Validate that these variables are present as columns in the samplesheet cat("\n--- Validating Variables in Samplesheet ---\n") missing_columns <- setdiff(variables, colnames(samplesheet)) @@ -62,7 +259,7 @@ if (length(missing_columns) > 0) { cat("All required variables are present in the samplesheet.\n") } -# --- Step 3: Validate components in samplesheet columns --- +# Validate components in samplesheet columns cat("\n--- Validating Column Components ---\n") for (variable in variables) { expected_values <- unique(unlist(lapply(models$models, function(model) { @@ -83,7 +280,7 @@ for (variable in variables) { } } -# --- Step 4: Validate column names for recommended characters --- +# Validate column names for recommended characters cat("\n--- Checking Column Names for Recommended Characters ---\n") invalid_columns <- grep("[^a-zA-Z0-9_]", colnames(samplesheet), value = TRUE) From afa2472c9c629ead4b336182408e7efe9bb15364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Tue, 17 Dec 2024 14:57:54 +0000 Subject: [PATCH 03/10] add check model code --- .../validate_model/input_test/contrasts.yml | 4 +- .../input_test/validate_model.R | 184 ++++++++++-------- 2 files changed, 109 insertions(+), 79 deletions(-) diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validate_model/input_test/contrasts.yml index b362c14e..05ee8e5e 100644 --- a/modules/local/validate_model/input_test/contrasts.yml +++ b/modules/local/validate_model/input_test/contrasts.yml @@ -6,7 +6,7 @@ models: - id: "treatment_mCherry_hND6_sample_number" comparison: ["treatment", "mCherry", "hND6"] - blocking_factors: ["sample_number23"] + blocking_factors: ["sample_number"] - id: "treatment234" - comparison: ["treatment", "GFP", "hND1000"] + comparison: ["treatment", "mCherry", "hND6"] diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validate_model/input_test/validate_model.R index 2aef6385..90e73df6 100644 --- a/modules/local/validate_model/input_test/validate_model.R +++ b/modules/local/validate_model/input_test/validate_model.R @@ -4,10 +4,11 @@ # Load required libraries suppressWarnings(suppressMessages({ + library(tidyverse) library(optparse) library(yaml) - library(readr) - library(dplyr) + #library(readr) + #library(dplyr) })) # Define command-line arguments @@ -25,6 +26,7 @@ opt_parser <- OptionParser(option_list = option_list) opt <- parse_args(opt_parser) # Validate input arguments +## Required arguments if (is.null(opt$models) || is.null(opt$samplesheet) ) { stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE) } @@ -37,28 +39,40 @@ sample_column <- "sample" #opt$sample # Load models.yml file tryCatch({ models <- read_yaml(path_yml) - cat("Loaded models.yml successfully.\n") + cat("Loaded YML file successfully.\n") }, error = function(e) { - stop("Error loading models.yml: ", e$message) + stop("Error loading YML file: ", e$message) }) # Load samplesheet CSV file tryCatch({ samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) + if ( !sample_column %in% colnames(samplesheet) ) { + stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") ) + } cat("Loaded samplesheet successfully.\n") }, error = function(e) { stop("Error loading samplesheet CSV: ", e$message) }) ## Collect all variables and factors from the yml file -var <- list(); for (FORMULA in models$models) { +var <- list(); contrasts_list <- list(); for (FORMULA in models$models) { + ## Populate list for (CONTRAST in FORMULA$contrasts) { ## Get column name: first comparison's element name <- CONTRAST$comparison[1] ## Get factors - variables <- CONTRAST$comparison[2:3] + variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + + ## Populate contrasts_list for model validation + contrasts_list[[ CONTRAST$id ]] <- list( + "formula" = FORMULA$formula, + "variable" = name, + "contrast" = variables, + "blocking_factors" = CONTRAST$blocking_factors + ) ## If the column is already present in the list, add more components to it if (name %in% names(var) ) { @@ -96,12 +110,12 @@ for (INDEX in 1:length(var)) { ## Function to validate the samplesheet against the yml file -validate_model <- function(variables, samplesheet) { # variables: list; samplesheet: df - - variables <- var - samplesheet <- samplesheet +validate_model <- function(sample_column, variables, samplesheet) { # variables: list; samplesheet: df + #variables <- var + #samplesheet <- samplesheet undesired_chars <- "[^a-zA-Z0-9_]" + ####################################################### ## ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet. @@ -132,9 +146,9 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh ## Check samplesheet names for invalid characters df_colnames <- names(samplesheet) - true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars) + true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars)) - if ( sum(true_columnms) > 0 ) { + if ( sum(true_columns) > 0 ) { errors <- c(errors, paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " ")) ) @@ -149,9 +163,9 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh ## Check that the column exists if ( VARIABLE %in% colnames(samplesheet) ) { ## Check if there are NAs - na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) ) - if ( na_rows > 0 ) { - errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " "))) + na_rows <- is.na( samplesheet[[ VARIABLE ]]) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) } } else { errors <- c( @@ -182,13 +196,13 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh } ## Check if there are NAs - na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) ) - if ( na_rows > 0 ) { - errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " "))) + na_rows <- is.na( samplesheet[[ VARIABLE ]] ) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) } ## Check that the column data does not contain undesidered characters - if (stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) { + if ( sum( stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) > 0 ) { errors <- c(errors, paste0("Column ", VARIABLE, " contains undesired characters\n") ) @@ -231,79 +245,95 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh errors, paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) } - - ## Check that the column name is valid - ## Check that the column content is valid -> done - ## Check that the column does not have NAs -> Done - ## Check that all factors exists -> Done } if ( !is.null(continuous)) { warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) } - return(list( 'errors' = errors, 'warnings' = warnings) ) + + if (length(errors) > 0 ) { + stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = "")) + } + + ## Generate validated samplesheet + tryCatch({ + selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors) + pheno_table <- samplesheet %>% dplyr::select(all_of(selected_columns)) + }, error = function(e) { + stop("Error generating validated samplesheet: ", e$message) + }) + + return(list( 'pheno_table' = pheno_table, 'errors' = errors, 'warnings' = warnings) ) } -errors_warnings <- validate_model(var, samplesheet) +## Collect all outputs from function: errors must be empty, warnings can have messages +phenotable_errors_warnings <- validate_model(sample_column, var, samplesheet) +## Cat warnings messages +cat(phenotable_errors_warnings[[ 3 ]], "\n") -# Validate that these variables are present as columns in the samplesheet -cat("\n--- Validating Variables in Samplesheet ---\n") -missing_columns <- setdiff(variables, colnames(samplesheet)) +## Get validated sample sheet +pheno_table <- phenotable_errors_warnings[[ 1 ]] -if (length(missing_columns) > 0) { - stop("The following required columns are missing in the samplesheet: ", - paste(missing_columns, collapse = ", ")) -} else { - cat("All required variables are present in the samplesheet.\n") -} +## CHECK THAT THE MODELS ARE FULL RANKED +check_model_contrasts <- function(contrasts_list, colData) { + for (model_name in names(contrasts_list)) { -# Validate components in samplesheet columns -cat("\n--- Validating Column Components ---\n") -for (variable in variables) { - expected_values <- unique(unlist(lapply(models$models, function(model) { - unlist(lapply(model$contrasts, function(contrast) { - if (variable %in% contrast$comparison) return(contrast$comparison) - })) - }))) - expected_values <- expected_values[!is.na(expected_values)] # Remove NAs - - actual_values <- unique(samplesheet[[variable]]) - if (!all(actual_values %in% expected_values)) { - cat("Column:", variable, "\n") - cat("Expected values:", paste(expected_values, collapse = ", "), "\n") - cat("Actual values:", paste(actual_values, collapse = ", "), "\n") - stop("Mismatch in expected and actual values for column: ", variable) - } else { - cat("Column '", variable, "' has valid components.\n", sep = "") - } -} + # Extract model components + model <- contrasts_list[[model_name]] + base_formula <- as.formula(model$formula) # Ensure formula object + variable <- model$variable + contrast <- model$contrast + blocking <- model$blocking_factor # Extract blocking factor -# Validate column names for recommended characters -cat("\n--- Checking Column Names for Recommended Characters ---\n") -invalid_columns <- grep("[^a-zA-Z0-9_]", colnames(samplesheet), value = TRUE) + # Check if blocking factor is already in the formula + if (!is.null(blocking)) { + # Get terms from the formula + formula_terms <- all.vars(base_formula) -if (length(invalid_columns) > 0) { - stop("The following column names contain non-recommended characters: ", - paste(invalid_columns, collapse = ", ")) -} else { - cat("All column names are valid.\n") -} + # If blocking factor is not already in the formula, add it + if (!(blocking %in% formula_terms)) { + #cat("\nAdding blocking factor:", blocking, "to the model.\n") + updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking)) + } else { + #cat("\nBlocking factor:", blocking, "is already in the formula. Skipping addition.\n") + updated_formula <- base_formula + } + } else { + updated_formula <- base_formula + } -# --- Step 5: Validate column contents for recommended characters and NAs --- -cat("\n--- Checking Column Contents for Recommended Characters and NAs ---\n") -for (variable in variables) { - invalid_values <- grep("[^a-zA-Z0-9_]", samplesheet[[variable]], value = TRUE) - if (any(is.na(samplesheet[[variable]]))) { - stop("Column '", variable, "' contains missing values (NA).") - } - if (length(invalid_values) > 0) { - stop("Column '", variable, "' contains invalid values: ", - paste(invalid_values, collapse = ", ")) - } else { - cat("Column '", variable, "' contents are valid.\n") + cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n") + + # Build the design matrix + design_matrix <- model.matrix(updated_formula, data = colData) + cat("Design matrix:\n") + print(design_matrix) + + # Check the rank of the design matrix + rank <- qr(design_matrix)$rank + expected_rank <- ncol(design_matrix) + + if (rank == expected_rank) { + cat("The design matrix is full rank.\n") + } else { + cat("WARNING: The design matrix is NOT full rank!\n") + } + + # Check if contrast variable exists in colData + if (variable %in% colnames(colData)) { + factor_levels <- levels(factor(colData[[variable]], levels = contrast)) + if (all(contrast %in% factor_levels)) { + cat("Contrast levels are valid:", paste(contrast, collapse = " vs "), "\n") + } else { + cat("ERROR: Contrast levels", paste(contrast, collapse = " vs "), + "do not exist in", variable, "\n") + } + } else { + cat("ERROR: Contrast variable", variable, "is not in colData.\n") + } } } -cat("\nValidation completed successfully.\n") +check_model_contrasts(contrasts_list, pheno_table) From 9e42bdd8af30c12cb3909426fa51e04fcd17f7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Wed, 18 Dec 2024 11:47:02 +0000 Subject: [PATCH 04/10] POC ready --- conf/modules.config | 9 + modules/local/validatemodel/environment.yml | 8 + modules/local/validatemodel/main.nf | 57 +++++ .../resources/usr/bin}/validate_model.R | 213 +++++++++++------- .../tests}/contrasts.yml | 0 .../tests/contrasts_maxquant.yml | 12 + .../local/validatemodel/tests/main.nf.test | 39 ++++ .../local/validatemodel/tests/nextflow.config | 6 + nextflow_schema.json | 2 +- workflows/differentialabundance.nf | 8 + 10 files changed, 270 insertions(+), 84 deletions(-) create mode 100644 modules/local/validatemodel/environment.yml create mode 100644 modules/local/validatemodel/main.nf rename modules/local/{validate_model/input_test => validatemodel/resources/usr/bin}/validate_model.R (60%) mode change 100644 => 100755 rename modules/local/{validate_model/input_test => validatemodel/tests}/contrasts.yml (100%) create mode 100644 modules/local/validatemodel/tests/contrasts_maxquant.yml create mode 100644 modules/local/validatemodel/tests/main.nf.test create mode 100644 modules/local/validatemodel/tests/nextflow.config diff --git a/conf/modules.config b/conf/modules.config index f26c1f9b..34e23407 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -42,6 +42,15 @@ process { ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'" } + withName: VALIDATE_MODEL { + ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' } + publishDir = [ + path: { "${params.outdir}/validate" }, + mode: params.publish_dir_mode, + ] + + } + withName: AFFY_JUSTRMA_RAW { publishDir = [ [ diff --git a/modules/local/validatemodel/environment.yml b/modules/local/validatemodel/environment.yml new file mode 100644 index 00000000..9a8af4a9 --- /dev/null +++ b/modules/local/validatemodel/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-jsonlite + - conda-forge::r-optparse + - conda-forge::r-tidyverse + - conda-forge::r-yaml diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf new file mode 100644 index 00000000..b8cf31ce --- /dev/null +++ b/modules/local/validatemodel/main.nf @@ -0,0 +1,57 @@ +process VALIDATE_MODEL { + tag "${models_yml}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f5/f5ef116da0b04fc6359efa48f08b9acfc040a02d7a1ec62476c77b6db0fc1499/data' : + 'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }" + + input: + tuple path(samplesheet) + tuple val(meta), path(models_yml) + + output: + tuple val(meta), path("pheno_table.csv"), emit: pheno_table + tuple val(meta), path("designs.json") , emit: designs + tuple val(meta), path("warnings.json") , emit: warnings, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + validate_model.R \\ + --yml "${models_yml}" \\ + --samplesheet "${samplesheet}" \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p') + tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))") + yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + END_VERSIONS + """ + + stub: + """ + touch pheno_table.csv + touch designs.json + touch warnings.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p') + tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))") + yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + END_VERSIONS + """ + +} diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R old mode 100644 new mode 100755 similarity index 60% rename from modules/local/validate_model/input_test/validate_model.R rename to modules/local/validatemodel/resources/usr/bin/validate_model.R index 90e73df6..bde457b1 --- a/modules/local/validate_model/input_test/validate_model.R +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -2,41 +2,52 @@ ## TODO: add make contrast arguments -# Load required libraries +# LOAD LIBRARIES ---------------------------------------------------- suppressWarnings(suppressMessages({ library(tidyverse) library(optparse) library(yaml) - #library(readr) - #library(dplyr) + library(jsonlite) })) -# Define command-line arguments +# PARSE ARGUMENTS --------------------------------------------------- +## Generate list option_list <- list( make_option(c("-y", "--yml"), type = "character", default = NULL, help = "Path to the models.yml file", metavar = "character"), make_option(c("-s", "--samplesheet"), type = "character", default = NULL, help = "Path to the samplesheet CSV file", metavar = "character"), - make_option(c("-i", "--sample"), type = "character", default = "sample", + make_option(c("-i", "--sample_id_col"), type = "character", default = "sample", help = "Column that contains sample identificators", metavar = "character") ) -# Parse command-line arguments -opt_parser <- OptionParser(option_list = option_list) +## Parse command-line arguments +opt_parser <- + OptionParser( + option_list = option_list, + description = "Validate a sample sheet against a YML file for model designs.", + epilogue = + " + The process will look for variables and factors in the YML file and validate that they are present in the sample sheet. + Will also look for unwanted characters within columns and colnames. + Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not. + " + ) opt <- parse_args(opt_parser) -# Validate input arguments -## Required arguments -if (is.null(opt$models) || is.null(opt$samplesheet) ) { - stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE) +## Validate input arguments +### Required arguments +if (is.null(opt$yml) || is.null(opt$samplesheet) ) { + stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE) } -## Collect parameters -path_yml <- "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #opt$models -path_samplesheet <- "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #opt$samplesheet -sample_column <- "sample" #opt$sample +### Collect parameters (easier for dev and testing) +path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" # +path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" # +sample_column <- opt$sample_id_col # "sample" # -# Load models.yml file +# LOAD FILES -------------------------------------------------------- +## Load models.yml file tryCatch({ models <- read_yaml(path_yml) cat("Loaded YML file successfully.\n") @@ -44,7 +55,7 @@ tryCatch({ stop("Error loading YML file: ", e$message) }) -# Load samplesheet CSV file +## Load samplesheet CSV file tryCatch({ samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) if ( !sample_column %in% colnames(samplesheet) ) { @@ -55,47 +66,56 @@ tryCatch({ stop("Error loading samplesheet CSV: ", e$message) }) +# PARSE FACTORS/LEVELS FROM YML FILE -------------------------------- ## Collect all variables and factors from the yml file -var <- list(); contrasts_list <- list(); for (FORMULA in models$models) { +## * var_list: collects all factors levels for each variable. +## * contrast_list: collects info for each contrast - ## Populate list +### Initialize empty lists +var <- list() +contrasts_list <- list() + +## Iterate through models (formula) +for (FORMULA in models$models) { + + ## Iterate through contrasts for each model (formula) for (CONTRAST in FORMULA$contrasts) { - ## Get column name: first comparison's element + ### Get column name: first comparison's element name <- CONTRAST$comparison[1] - ## Get factors + ### Get factors (rest of components) variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] - ## Populate contrasts_list for model validation - contrasts_list[[ CONTRAST$id ]] <- list( + ### Populate contrasts_list for later model validation + contrasts_list[[ CONTRAST$id ]] <- list( ## Adds the ID as main identifier "formula" = FORMULA$formula, "variable" = name, "contrast" = variables, - "blocking_factors" = CONTRAST$blocking_factors + "blocking_factors" = CONTRAST$blocking_factors ## This list can be later extended for "make_contrast" option is required ) - ## If the column is already present in the list, add more components to it + ### Start populating the variables_list + #### If the column is already present in the list, add more components to it if (name %in% names(var) ) { - variables <- unique( c(var[[ name ]], CONTRAST$comparison[2:3] )) - var[[ name ]] <- variables - cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "") - ## Default to new variable + variables <- unique( c(var[[ name ]], variables )) ## Combine previous levels with the new ones without duplicating them + var[[ name ]] <- variables ## assign the values to the list + #cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "") + #### Default to new element } else { - cat("Detected variable:", name, "\n") - var[[ length(var) + 1 ]] <- variables - names(var)[length(var) ] <- name + #cat("Detected variable:", name, "\n") + var[[ length(var) + 1 ]] <- variables ## Asign levels to a new entry + names(var)[length(var) ] <- name ## Specify the variable name } ## Get blocking factors blocking <- c() - if ( !is.null(CONTRAST$blocking_factors) ) { - cat("Blocking factors detected for variable", name, "\n") + if ( !is.null(CONTRAST$blocking_factors) ) { ## Check if they were defined in the yml + #cat("Blocking factors detected for variable", name, "\n") blocking <- CONTRAST$blocking_factors - cat(paste(blocking, collapse = ""), "\n") } - ## Add blockig variables - if ( "blocking_factors" %in% names(var) ) { + ## Add blocking variables + if ( "blocking_factors" %in% names(var) ) { ## Check if the category exists from previous iterations var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking )) } else { var[[ "blocking_factors" ]] <- unique( blocking ) @@ -103,18 +123,14 @@ var <- list(); contrasts_list <- list(); for (FORMULA in models$models) { } } -## Print explicit message +## Print explicit messages for (INDEX in 1:length(var)) { - cat("Detected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\n", sep = "") + cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "") } - -## Function to validate the samplesheet against the yml file -validate_model <- function(sample_column, variables, samplesheet) { # variables: list; samplesheet: df - - #variables <- var - #samplesheet <- samplesheet - undesired_chars <- "[^a-zA-Z0-9_]" +# VALIDATE SAMPLESHEET BASED ON YML FILE ---------------------------- +## Define function to compare YML and samplesheet +validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df ####################################################### ## @@ -144,7 +160,10 @@ validate_model <- function(sample_column, variables, samplesheet) { # variables: # Initialize vector to report continuos variables continuous <- c() - ## Check samplesheet names for invalid characters + ## Do not allow special characters + undesired_chars <- "[^a-zA-Z0-9_.]" + + ## Check samplesheet names for invalid colnames df_colnames <- names(samplesheet) true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars)) @@ -247,57 +266,80 @@ validate_model <- function(sample_column, variables, samplesheet) { # variables: } } + ## Report if any variable was imported as continuous if ( !is.null(continuous)) { warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) } + ## Report ERRORS and stop if (length(errors) > 0 ) { stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = "")) } - ## Generate validated samplesheet + ## Generate validated phenotypic table tryCatch({ + ## Get desired columns, starting with the one containing sample identificators selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors) pheno_table <- samplesheet %>% dplyr::select(all_of(selected_columns)) + }, error = function(e) { - stop("Error generating validated samplesheet: ", e$message) + stop("Error generating validated phenotypic table: ", e$message) }) - return(list( 'pheno_table' = pheno_table, 'errors' = errors, 'warnings' = warnings) ) + ## Return list + return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) ) } ## Collect all outputs from function: errors must be empty, warnings can have messages -phenotable_errors_warnings <- validate_model(sample_column, var, samplesheet) +phenotable_warnings <- validate_model(sample_column, var, samplesheet) ## Cat warnings messages -cat(phenotable_errors_warnings[[ 3 ]], "\n") +cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n") -## Get validated sample sheet -pheno_table <- phenotable_errors_warnings[[ 1 ]] +## Get validated pheno table +pheno_table <- phenotable_warnings[[1]] -## CHECK THAT THE MODELS ARE FULL RANKED +## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED check_model_contrasts <- function(contrasts_list, colData) { + + ####################################################### + ## + ## Function that takes a list with models + variables, and a phenotypic table + ## + ## The function evaluates that: + ## * The formulas are usable + ## * The models generated are full ranked + ## + ## Finally, the function returns + ## * An list with all designs + ## * full ranked: true/false + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ### Initialize list for models descriptors + design_list <- list() + ### Iterate over each model for (model_name in names(contrasts_list)) { - # Extract model components + ## Extract model components model <- contrasts_list[[model_name]] - base_formula <- as.formula(model$formula) # Ensure formula object + base_formula <- as.formula(model$formula) variable <- model$variable contrast <- model$contrast - blocking <- model$blocking_factor # Extract blocking factor + blocking <- model$blocking_factor - # Check if blocking factor is already in the formula + ## Check if blocking factor is already in the formula if (!is.null(blocking)) { - # Get terms from the formula + ## Get terms from the formula formula_terms <- all.vars(base_formula) - # If blocking factor is not already in the formula, add it + ## If blocking factor is not already in the formula, add it if (!(blocking %in% formula_terms)) { - #cat("\nAdding blocking factor:", blocking, "to the model.\n") updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking)) } else { - #cat("\nBlocking factor:", blocking, "is already in the formula. Skipping addition.\n") updated_formula <- base_formula } } else { @@ -308,32 +350,37 @@ check_model_contrasts <- function(contrasts_list, colData) { # Build the design matrix design_matrix <- model.matrix(updated_formula, data = colData) - cat("Design matrix:\n") print(design_matrix) # Check the rank of the design matrix rank <- qr(design_matrix)$rank expected_rank <- ncol(design_matrix) - if (rank == expected_rank) { - cat("The design matrix is full rank.\n") - } else { - cat("WARNING: The design matrix is NOT full rank!\n") - } - - # Check if contrast variable exists in colData - if (variable %in% colnames(colData)) { - factor_levels <- levels(factor(colData[[variable]], levels = contrast)) - if (all(contrast %in% factor_levels)) { - cat("Contrast levels are valid:", paste(contrast, collapse = " vs "), "\n") - } else { - cat("ERROR: Contrast levels", paste(contrast, collapse = " vs "), - "do not exist in", variable, "\n") - } - } else { - cat("ERROR: Contrast variable", variable, "is not in colData.\n") - } + ## Add results to list + design_list[[ length(design_list) + 1 ]] <- list( + "formula" = deparse(updated_formula), + "design_matrix" = design_matrix, + "full_rank" = rank == expected_rank + ) + names(design_list)[ length(design_list) ] <- model_name } + + return(design_list) +} + +design_results <- check_model_contrasts(contrasts_list, pheno_table) + +# EXPORT DATA ------------------------------------------------------- +## Export pheno table +write_csv(pheno_table, "pheno_table.csv") + +## Export designs to JSON +jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE) + +## Export warnings +if ( !is.null(phenotable_warnings[[ 2 ]]) ) { + jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE) } -check_model_contrasts(contrasts_list, pheno_table) +## Export RData +save.image("models.RData") diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml similarity index 100% rename from modules/local/validate_model/input_test/contrasts.yml rename to modules/local/validatemodel/tests/contrasts.yml diff --git a/modules/local/validatemodel/tests/contrasts_maxquant.yml b/modules/local/validatemodel/tests/contrasts_maxquant.yml new file mode 100644 index 00000000..05ee8e5e --- /dev/null +++ b/modules/local/validatemodel/tests/contrasts_maxquant.yml @@ -0,0 +1,12 @@ +models: + - formula: "~ treatment" + contrasts: + - id: "treatment_mCherry_hND6" + comparison: ["treatment", "mCherry", "hND6"] + + - id: "treatment_mCherry_hND6_sample_number" + comparison: ["treatment", "mCherry", "hND6"] + blocking_factors: ["sample_number"] + + - id: "treatment234" + comparison: ["treatment", "mCherry", "hND6"] diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test new file mode 100644 index 00000000..d3f2101c --- /dev/null +++ b/modules/local/validatemodel/tests/main.nf.test @@ -0,0 +1,39 @@ +nextflow_process { + + name "Test Process VALIDATE_MODEL" + script "../main.nf" + process "VALIDATE_MODEL" + config "./nextflow.config" + + tag "modules" + tag "models" + tag "yml" + tag "samplesheet" + + test("Samplesheet - YML file - RNASEQ TEST") { + + when { + params { + observations_id_col = 'sample' + } + process { + """ + // Samplesheet + input[0] = + Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv', checkIfExists: true) + + // Models_yml sheet + input[1] = + Channel.fromPath('${moduleDir}/tests/contrasts.yml', checkIfExists: true) + .map{ [ [id: "test_yml"], it ] } + """ + } + } + + then { + assertAll( + { assert process.success } + ) + } + } +} diff --git a/modules/local/validatemodel/tests/nextflow.config b/modules/local/validatemodel/tests/nextflow.config new file mode 100644 index 00000000..52a943d5 --- /dev/null +++ b/modules/local/validatemodel/tests/nextflow.config @@ -0,0 +1,6 @@ +nextflow.enable.moduleBinaries = true +process { + withName: 'VALIDATE_MODEL' { + ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' } + } +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 70af98c8..6a51601d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,7 +42,7 @@ "type": "string", "description": "A CSV file describing sample contrasts", "help_text": "This file is used to define groups of samples from 'input' to compare. It must contain at least the columns 'variable', 'reference', 'target' and 'blocking', where 'variable' is a column in the input sample sheet, 'reference' and 'target' are values in that column, and blocking is a colon-separated list of additional 'blocking' variables (can be an empty string)", - "pattern": "^\\S+\\.(csv|tsv)$", + "pattern": "^\\S+\\.(csv|tsv|yml|yaml)$", "format": "file-path", "mimetype": "text/csv", "fa_icon": "fas fa-adjust" diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index e49bebda..e70270b1 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -132,6 +132,7 @@ include { PROTEUS_READPROTEINGROUPS as PROTEUS } from '../modules/n include { GEOQUERY_GETGEO } from '../modules/nf-core/geoquery/getgeo/main' include { ZIP as MAKE_REPORT_BUNDLE } from '../modules/nf-core/zip/main' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { VALIDATE_MODEL } from '../modules/local/validatemodel/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -147,6 +148,13 @@ workflow DIFFERENTIALABUNDANCE { // Channel for the contrasts file ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) + + // Run module to validate models + VALIDATE_MODEL( + ch_input, + ch_contrasts_file + ) + // If we have affy array data in the form of CEL files we'll be deriving // matrix and annotation from them From c2f666fa27987448c0277e834cde53fac7e0f233 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Wed, 18 Dec 2024 14:25:34 +0000 Subject: [PATCH 05/10] change function structure --- .../resources/usr/bin/validate_model.R | 135 ++++++++++++------ 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R index bde457b1..f7470ff1 100755 --- a/modules/local/validatemodel/resources/usr/bin/validate_model.R +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -42,8 +42,8 @@ if (is.null(opt$yml) || is.null(opt$samplesheet) ) { } ### Collect parameters (easier for dev and testing) -path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" # -path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" # +path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" # +path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" # sample_column <- opt$sample_id_col # "sample" # # LOAD FILES -------------------------------------------------------- @@ -71,58 +71,99 @@ tryCatch({ ## * var_list: collects all factors levels for each variable. ## * contrast_list: collects info for each contrast -### Initialize empty lists -var <- list() -contrasts_list <- list() +process_models <- function(models) { + # Initialize lists to store the output + contrasts_list <- list() + var <- list() -## Iterate through models (formula) -for (FORMULA in models$models) { - - ## Iterate through contrasts for each model (formula) - for (CONTRAST in FORMULA$contrasts) { - ### Get column name: first comparison's element - name <- CONTRAST$comparison[1] + # Ensure models$models is not null or empty + if (is.null(models$models) || length(models$models) == 0) { + stop("models$models is null or empty. Please provide valid input.") + } - ### Get factors (rest of components) - variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + # Temporary storage for gathering contrasts per variable + temp_var <- list() + blocking_vars <- c() - ### Populate contrasts_list for later model validation - contrasts_list[[ CONTRAST$id ]] <- list( ## Adds the ID as main identifier - "formula" = FORMULA$formula, - "variable" = name, - "contrast" = variables, - "blocking_factors" = CONTRAST$blocking_factors ## This list can be later extended for "make_contrast" option is required - ) + # Iterate through models (formula) + for (FORMULA in models$models) { - ### Start populating the variables_list - #### If the column is already present in the list, add more components to it - if (name %in% names(var) ) { - variables <- unique( c(var[[ name ]], variables )) ## Combine previous levels with the new ones without duplicating them - var[[ name ]] <- variables ## assign the values to the list - #cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "") - #### Default to new element - } else { - #cat("Detected variable:", name, "\n") - var[[ length(var) + 1 ]] <- variables ## Asign levels to a new entry - names(var)[length(var) ] <- name ## Specify the variable name + # Ensure FORMULA$contrasts is not null or empty + if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) { + stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.") } - ## Get blocking factors - blocking <- c() - if ( !is.null(CONTRAST$blocking_factors) ) { ## Check if they were defined in the yml - #cat("Blocking factors detected for variable", name, "\n") - blocking <- CONTRAST$blocking_factors + # Iterate through contrasts for each model (formula) + for (CONTRAST in FORMULA$contrasts) { + + # Validate CONTRAST$id + if (is.null(CONTRAST$id)) { + stop("Missing CONTRAST$id detected. Unable to proceed.") + } + + if (CONTRAST$id %in% names(contrasts_list)) { + stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed.")) + } + + # Get column name: first comparison's element + name <- CONTRAST$comparison[1] + + # Check if name is valid + if (is.null(name) || name == "") { + stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.") + } + + # Get factors (rest of components) + variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + + # Populate contrasts_list for later model validation + contrasts_list[[CONTRAST$id]] <- list( + "formula" = FORMULA$formula, + "variable" = name, + "contrast" = variables, + "blocking_factors" = CONTRAST$blocking_factors + ) + + # Gather contrasts per variable into temporary storage + if (name %in% names(temp_var)) { + temp_var[[name]] <- unique(c(temp_var[[name]], variables)) + } else { + temp_var[[name]] <- variables + } + + # Gather blocking factors + if (!is.null(CONTRAST$blocking_factors)) { + blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors)) + } } + } - ## Add blocking variables - if ( "blocking_factors" %in% names(var) ) { ## Check if the category exists from previous iterations - var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking )) + # Consolidate gathered contrasts into `var` + for (name in names(temp_var)) { + if (name %in% names(var)) { + var[[name]] <- unique(c(var[[name]], temp_var[[name]])) } else { - var[[ "blocking_factors" ]] <- unique( blocking ) + var[[name]] <- temp_var[[name]] } } + + # Add blocking variables to `var` + var[[ "blocking_factors" ]] <- blocking_vars + + # Return the populated lists + return(list( + contrasts_list = contrasts_list, + var = var + )) } +## Process models +models_lists <- process_models(models) +### Extract models contrasts +contrasts_list <- models_lists[["contrasts_list"]] +### Extract variables and levels +var <- models_lists[["var"]] + ## Print explicit messages for (INDEX in 1:length(var)) { cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "") @@ -221,7 +262,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col } ## Check that the column data does not contain undesidered characters - if ( sum( stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) > 0 ) { + if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) { errors <- c(errors, paste0("Column ", VARIABLE, " contains undesired characters\n") ) @@ -273,7 +314,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col ## Report ERRORS and stop if (length(errors) > 0 ) { - stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = "")) + stop(cat("\033[1;31mSome errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\033[0m\n", sep = "")) } ## Generate validated phenotypic table @@ -350,7 +391,6 @@ check_model_contrasts <- function(contrasts_list, colData) { # Build the design matrix design_matrix <- model.matrix(updated_formula, data = colData) - print(design_matrix) # Check the rank of the design matrix rank <- qr(design_matrix)$rank @@ -365,6 +405,13 @@ check_model_contrasts <- function(contrasts_list, colData) { names(design_list)[ length(design_list) ] <- model_name } + for (DESIGN in names(design_list)) { + cat("Design ", DESIGN, + if( design_list[[ DESIGN ]]$full_rank ) { " is " } else { " is not "}, + "full ranked\n", sep = "" + ) + } + return(design_list) } From 6614f4d3927d29b612b0de245033be8e489eca33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Wed, 18 Dec 2024 16:36:57 +0000 Subject: [PATCH 06/10] fix bug in module --- modules/local/validatemodel/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf index b8cf31ce..c7962219 100644 --- a/modules/local/validatemodel/main.nf +++ b/modules/local/validatemodel/main.nf @@ -8,7 +8,7 @@ process VALIDATE_MODEL { 'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }" input: - tuple path(samplesheet) + path samplesheet tuple val(meta), path(models_yml) output: From d9d18c5c4724d3377e2179f67ee98ea019faffbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Thu, 19 Dec 2024 17:03:21 +0000 Subject: [PATCH 07/10] update yml structure input limited to contrasts only --- .../resources/usr/bin/validate_model.R | 125 +++-- .../resources/usr/bin/validate_model.R.bkp | 433 ++++++++++++++++++ .../local/validatemodel/tests/contrasts.yml | 18 +- .../tests/contrasts_maxquant.yml | 12 - .../local/validatemodel/tests/main.nf.test | 3 +- .../validatemodel/tests/main.nf.test.snap | 57 +++ 6 files changed, 555 insertions(+), 93 deletions(-) create mode 100755 modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp delete mode 100644 modules/local/validatemodel/tests/contrasts_maxquant.yml create mode 100644 modules/local/validatemodel/tests/main.nf.test.snap diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R index f7470ff1..62e86932 100755 --- a/modules/local/validatemodel/resources/usr/bin/validate_model.R +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -43,7 +43,7 @@ if (is.null(opt$yml) || is.null(opt$samplesheet) ) { ### Collect parameters (easier for dev and testing) path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" # -path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" # +path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" # sample_column <- opt$sample_id_col # "sample" # # LOAD FILES -------------------------------------------------------- @@ -59,7 +59,7 @@ tryCatch({ tryCatch({ samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) if ( !sample_column %in% colnames(samplesheet) ) { - stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") ) + stop(paste0("The sample identificator column '", sample_column, "' is not present in the samplesheet file") ) } cat("Loaded samplesheet successfully.\n") }, error = function(e) { @@ -72,78 +72,68 @@ tryCatch({ ## * contrast_list: collects info for each contrast process_models <- function(models) { + # Initialize lists to store the output contrasts_list <- list() var <- list() # Ensure models$models is not null or empty - if (is.null(models$models) || length(models$models) == 0) { - stop("models$models is null or empty. Please provide valid input.") + if (is.null(models$contrasts) || length(models$contrasts) == 0) { + stop("models$contrasts is null or empty. Please provide valid input.") } # Temporary storage for gathering contrasts per variable temp_var <- list() blocking_vars <- c() - # Iterate through models (formula) - for (FORMULA in models$models) { + for (CONTRAST in models$contrasts) { - # Ensure FORMULA$contrasts is not null or empty - if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) { - stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.") + # Validate CONTRAST$id + if (is.null(CONTRAST$id)) { + stop("Missing CONTRAST$id detected.") } - # Iterate through contrasts for each model (formula) - for (CONTRAST in FORMULA$contrasts) { - - # Validate CONTRAST$id - if (is.null(CONTRAST$id)) { - stop("Missing CONTRAST$id detected. Unable to proceed.") - } - - if (CONTRAST$id %in% names(contrasts_list)) { - stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed.")) - } + if (CONTRAST$id %in% names(contrasts_list)) { + stop(paste0("Duplicate CONTRAST$id detected: ", CONTRAST$id, ".")) + } - # Get column name: first comparison's element - name <- CONTRAST$comparison[1] + # Get column name: first comparison's element + variable <- CONTRAST$comparison[1] - # Check if name is valid - if (is.null(name) || name == "") { - stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.") - } + # Check if name is valid + if (is.null(variable) || variable == "") { + stop("Invalid `variable` defined in `CONTRAST$comparison[1]`.") + } - # Get factors (rest of components) - variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + # Get factors (rest of components) + levels <- CONTRAST$comparison[2:length(CONTRAST$comparison)] - # Populate contrasts_list for later model validation - contrasts_list[[CONTRAST$id]] <- list( - "formula" = FORMULA$formula, - "variable" = name, - "contrast" = variables, - "blocking_factors" = CONTRAST$blocking_factors - ) + # Populate contrasts_list for later model validation + contrasts_list[[CONTRAST$id]] <- list( + "variable" = variable, + "contrast" = levels, + "blocking_factors" = CONTRAST$blocking_factors + ) - # Gather contrasts per variable into temporary storage - if (name %in% names(temp_var)) { - temp_var[[name]] <- unique(c(temp_var[[name]], variables)) - } else { - temp_var[[name]] <- variables - } + # Gather contrasts per variable into temporary storage + if (variable %in% names(temp_var)) { + temp_var[[variable]] <- unique(c(temp_var[[variable]], levels)) + } else { + temp_var[[variable]] <- levels + } - # Gather blocking factors - if (!is.null(CONTRAST$blocking_factors)) { - blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors)) - } + # Gather blocking factors + if (!is.null(CONTRAST$blocking_factors)) { + blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors)) } } # Consolidate gathered contrasts into `var` - for (name in names(temp_var)) { - if (name %in% names(var)) { - var[[name]] <- unique(c(var[[name]], temp_var[[name]])) + for (variable in names(temp_var)) { + if (variable %in% names(var)) { + var[[variable]] <- unique(c(var[[variable]], temp_var[[variable]])) } else { - var[[name]] <- temp_var[[name]] + var[[variable]] <- temp_var[[variable]] } } @@ -160,9 +150,9 @@ process_models <- function(models) { ## Process models models_lists <- process_models(models) ### Extract models contrasts -contrasts_list <- models_lists[["contrasts_list"]] +contrasts_list <- models_lists[[ "contrasts_list" ]] ### Extract variables and levels -var <- models_lists[["var"]] +var <- models_lists[[ "var" ]] ## Print explicit messages for (INDEX in 1:length(var)) { @@ -261,7 +251,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) } - ## Check that the column data does not contain undesidered characters + ## Check that the column data does not contain undesired characters if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) { errors <- c(errors, paste0("Column ", VARIABLE, " contains undesired characters\n") @@ -344,6 +334,8 @@ pheno_table <- phenotable_warnings[[1]] ## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED check_model_contrasts <- function(contrasts_list, colData) { + #contrasts_list <- contrasts_list + #colData <- pheno_table ####################################################### ## ## Function that takes a list with models + variables, and a phenotypic table @@ -364,33 +356,26 @@ check_model_contrasts <- function(contrasts_list, colData) { design_list <- list() ### Iterate over each model for (model_name in names(contrasts_list)) { + #model_name <- names(contrasts_list[2]) ## Extract model components model <- contrasts_list[[model_name]] - base_formula <- as.formula(model$formula) variable <- model$variable contrast <- model$contrast - blocking <- model$blocking_factor + blocking <- model$blocking_factors - ## Check if blocking factor is already in the formula - if (!is.null(blocking)) { - ## Get terms from the formula - formula_terms <- all.vars(base_formula) + ## Create formula + ### Gather components + formula_terms <- c(variable, blocking) - ## If blocking factor is not already in the formula, add it - if (!(blocking %in% formula_terms)) { - updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking)) - } else { - updated_formula <- base_formula - } - } else { - updated_formula <- base_formula - } + ### Convert into formula + # TODO: Add flexibility for interactive terms! + dynamic_formula <- as.formula(paste("~", paste(formula_terms, collapse = "+"))) - cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n") + cat("\nChecking:", model_name, "\nFormula:", deparse(dynamic_formula), "\n") # Build the design matrix - design_matrix <- model.matrix(updated_formula, data = colData) + design_matrix <- model.matrix(dynamic_formula, data = colData) # Check the rank of the design matrix rank <- qr(design_matrix)$rank @@ -398,7 +383,7 @@ check_model_contrasts <- function(contrasts_list, colData) { ## Add results to list design_list[[ length(design_list) + 1 ]] <- list( - "formula" = deparse(updated_formula), + "formula" = deparse(dynamic_formula), "design_matrix" = design_matrix, "full_rank" = rank == expected_rank ) diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp b/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp new file mode 100755 index 00000000..f7470ff1 --- /dev/null +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp @@ -0,0 +1,433 @@ +#!/usr/bin/env Rscript + +## TODO: add make contrast arguments + +# LOAD LIBRARIES ---------------------------------------------------- +suppressWarnings(suppressMessages({ + library(tidyverse) + library(optparse) + library(yaml) + library(jsonlite) +})) + +# PARSE ARGUMENTS --------------------------------------------------- +## Generate list +option_list <- list( + make_option(c("-y", "--yml"), type = "character", default = NULL, + help = "Path to the models.yml file", metavar = "character"), + make_option(c("-s", "--samplesheet"), type = "character", default = NULL, + help = "Path to the samplesheet CSV file", metavar = "character"), + make_option(c("-i", "--sample_id_col"), type = "character", default = "sample", + help = "Column that contains sample identificators", metavar = "character") +) + +## Parse command-line arguments +opt_parser <- + OptionParser( + option_list = option_list, + description = "Validate a sample sheet against a YML file for model designs.", + epilogue = + " + The process will look for variables and factors in the YML file and validate that they are present in the sample sheet. + Will also look for unwanted characters within columns and colnames. + Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not. + " + ) +opt <- parse_args(opt_parser) + +## Validate input arguments +### Required arguments +if (is.null(opt$yml) || is.null(opt$samplesheet) ) { + stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE) +} + +### Collect parameters (easier for dev and testing) +path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" # +path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" # +sample_column <- opt$sample_id_col # "sample" # + +# LOAD FILES -------------------------------------------------------- +## Load models.yml file +tryCatch({ + models <- read_yaml(path_yml) + cat("Loaded YML file successfully.\n") +}, error = function(e) { + stop("Error loading YML file: ", e$message) +}) + +## Load samplesheet CSV file +tryCatch({ + samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) + if ( !sample_column %in% colnames(samplesheet) ) { + stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") ) + } + cat("Loaded samplesheet successfully.\n") +}, error = function(e) { + stop("Error loading samplesheet CSV: ", e$message) +}) + +# PARSE FACTORS/LEVELS FROM YML FILE -------------------------------- +## Collect all variables and factors from the yml file +## * var_list: collects all factors levels for each variable. +## * contrast_list: collects info for each contrast + +process_models <- function(models) { + # Initialize lists to store the output + contrasts_list <- list() + var <- list() + + # Ensure models$models is not null or empty + if (is.null(models$models) || length(models$models) == 0) { + stop("models$models is null or empty. Please provide valid input.") + } + + # Temporary storage for gathering contrasts per variable + temp_var <- list() + blocking_vars <- c() + + # Iterate through models (formula) + for (FORMULA in models$models) { + + # Ensure FORMULA$contrasts is not null or empty + if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) { + stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.") + } + + # Iterate through contrasts for each model (formula) + for (CONTRAST in FORMULA$contrasts) { + + # Validate CONTRAST$id + if (is.null(CONTRAST$id)) { + stop("Missing CONTRAST$id detected. Unable to proceed.") + } + + if (CONTRAST$id %in% names(contrasts_list)) { + stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed.")) + } + + # Get column name: first comparison's element + name <- CONTRAST$comparison[1] + + # Check if name is valid + if (is.null(name) || name == "") { + stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.") + } + + # Get factors (rest of components) + variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + + # Populate contrasts_list for later model validation + contrasts_list[[CONTRAST$id]] <- list( + "formula" = FORMULA$formula, + "variable" = name, + "contrast" = variables, + "blocking_factors" = CONTRAST$blocking_factors + ) + + # Gather contrasts per variable into temporary storage + if (name %in% names(temp_var)) { + temp_var[[name]] <- unique(c(temp_var[[name]], variables)) + } else { + temp_var[[name]] <- variables + } + + # Gather blocking factors + if (!is.null(CONTRAST$blocking_factors)) { + blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors)) + } + } + } + + # Consolidate gathered contrasts into `var` + for (name in names(temp_var)) { + if (name %in% names(var)) { + var[[name]] <- unique(c(var[[name]], temp_var[[name]])) + } else { + var[[name]] <- temp_var[[name]] + } + } + + # Add blocking variables to `var` + var[[ "blocking_factors" ]] <- blocking_vars + + # Return the populated lists + return(list( + contrasts_list = contrasts_list, + var = var + )) +} + +## Process models +models_lists <- process_models(models) +### Extract models contrasts +contrasts_list <- models_lists[["contrasts_list"]] +### Extract variables and levels +var <- models_lists[["var"]] + +## Print explicit messages +for (INDEX in 1:length(var)) { + cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "") +} + +# VALIDATE SAMPLESHEET BASED ON YML FILE ---------------------------- +## Define function to compare YML and samplesheet +validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df + + ####################################################### + ## + ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet. + ## + ## The function evaluates that: + ## * Every variable is present in the sample sheet. + ## * Every variable has valid name + ## * There are no NAs + ## * All expected values are present + ## * All samples contains a valid level. + ## + ## Finally, the function returns + ## * An error vector containing all errors found. + ## * A warnings vector + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ## Initialize errors vector + errors <- c() + + ## Initialize warning errors + warnings <- c() + + # Initialize vector to report continuos variables + continuous <- c() + + ## Do not allow special characters + undesired_chars <- "[^a-zA-Z0-9_.]" + + ## Check samplesheet names for invalid colnames + df_colnames <- names(samplesheet) + true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars)) + + if ( sum(true_columns) > 0 ) { + errors <- c(errors, + paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " ")) + ) + } + + ## Check that blocking variables exists and do not contain NAs, if they were specified + blocking_factors <- c() + if ( !is.null(variables$blocking_factors) ) { + blocking_factors <- variables$blocking_factors + + for (VARIABLE in blocking_factors) { + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + ## Check if there are NAs + na_rows <- is.na( samplesheet[[ VARIABLE ]]) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) + } + } else { + errors <- c( + errors, + paste0("Blocking factor ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- c(continuous, VARIABLE) + } else { + paste(VARIABLE, "not continuous") + } + } + } + + ## Check variables and levels specified + for (VARIABLE in names(variables)[ names(variables) != "blocking_factors" ] ) { ## Exclude blocking factors, they cannot be treated equally here + + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- unique(c(continuous, VARIABLE)) + } else { + paste(VARIABLE, "is not continuous") + } + + ## Check if there are NAs + na_rows <- is.na( samplesheet[[ VARIABLE ]] ) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) + } + + ## Check that the column data does not contain undesidered characters + if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) { + errors <- c(errors, + paste0("Column ", VARIABLE, " contains undesired characters\n") + ) + } + + ## Extract the expected levels according to the YML file + expected_factors_levels <- c( rep(NA, length(variables[[ VARIABLE ]])) ) + + ## Assign names + names(expected_factors_levels) <- variables[[ VARIABLE ]] + + ## Extract the levels from the sample sheet + obtained_factors_levels <- unique(samplesheet[[ VARIABLE ]]) + + ## Check that all expected levels for this variable are effectively present + for (LEVEL in names(expected_factors_levels) ) { + expected_factors_levels[ LEVEL ] <- LEVEL %in% obtained_factors_levels + } + + if( !all(expected_factors_levels) ) { + errors <- c( + errors, + paste0( + "Missing factor levels for variable '", VARIABLE, "'. ", + "Present levels: ", paste( names( expected_factors_levels[ expected_factors_levels ]), collapse = " " ), ". ", + "Missing levels: ", paste( names( expected_factors_levels[!expected_factors_levels ]), collapse = " " ) + ) + ) + } + + ## Check whether there are levels in the table that are not present in the models definitions + if (!all( obtained_factors_levels[!is.na(obtained_factors_levels)] %in% names(expected_factors_levels)) ) { + warnings <- c(warnings, + paste0( "The following labels are present in the samplesheet but are not part of the models definition and will be excluded: ", paste( obtained_factors_levels[!obtained_factors_levels %in% names(expected_factors_levels) ], collapse = " " ) )) + } + + ## Report that a column is not even present in the table + } else { + errors <- c( + errors, + paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + } + + ## Report if any variable was imported as continuous + if ( !is.null(continuous)) { + warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) + } + + ## Report ERRORS and stop + if (length(errors) > 0 ) { + stop(cat("\033[1;31mSome errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\033[0m\n", sep = "")) + } + + ## Generate validated phenotypic table + tryCatch({ + ## Get desired columns, starting with the one containing sample identificators + selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors) + pheno_table <- samplesheet %>% dplyr::select(all_of(selected_columns)) + + }, error = function(e) { + stop("Error generating validated phenotypic table: ", e$message) + }) + + ## Return list + return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) ) + +} + +## Collect all outputs from function: errors must be empty, warnings can have messages +phenotable_warnings <- validate_model(sample_column, var, samplesheet) + +## Cat warnings messages +cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n") + +## Get validated pheno table +pheno_table <- phenotable_warnings[[1]] + +## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED +check_model_contrasts <- function(contrasts_list, colData) { + + ####################################################### + ## + ## Function that takes a list with models + variables, and a phenotypic table + ## + ## The function evaluates that: + ## * The formulas are usable + ## * The models generated are full ranked + ## + ## Finally, the function returns + ## * An list with all designs + ## * full ranked: true/false + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ### Initialize list for models descriptors + design_list <- list() + ### Iterate over each model + for (model_name in names(contrasts_list)) { + + ## Extract model components + model <- contrasts_list[[model_name]] + base_formula <- as.formula(model$formula) + variable <- model$variable + contrast <- model$contrast + blocking <- model$blocking_factor + + ## Check if blocking factor is already in the formula + if (!is.null(blocking)) { + ## Get terms from the formula + formula_terms <- all.vars(base_formula) + + ## If blocking factor is not already in the formula, add it + if (!(blocking %in% formula_terms)) { + updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking)) + } else { + updated_formula <- base_formula + } + } else { + updated_formula <- base_formula + } + + cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n") + + # Build the design matrix + design_matrix <- model.matrix(updated_formula, data = colData) + + # Check the rank of the design matrix + rank <- qr(design_matrix)$rank + expected_rank <- ncol(design_matrix) + + ## Add results to list + design_list[[ length(design_list) + 1 ]] <- list( + "formula" = deparse(updated_formula), + "design_matrix" = design_matrix, + "full_rank" = rank == expected_rank + ) + names(design_list)[ length(design_list) ] <- model_name + } + + for (DESIGN in names(design_list)) { + cat("Design ", DESIGN, + if( design_list[[ DESIGN ]]$full_rank ) { " is " } else { " is not "}, + "full ranked\n", sep = "" + ) + } + + return(design_list) +} + +design_results <- check_model_contrasts(contrasts_list, pheno_table) + +# EXPORT DATA ------------------------------------------------------- +## Export pheno table +write_csv(pheno_table, "pheno_table.csv") + +## Export designs to JSON +jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE) + +## Export warnings +if ( !is.null(phenotable_warnings[[ 2 ]]) ) { + jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE) +} + +## Export RData +save.image("models.RData") diff --git a/modules/local/validatemodel/tests/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml index 05ee8e5e..5e893629 100644 --- a/modules/local/validatemodel/tests/contrasts.yml +++ b/modules/local/validatemodel/tests/contrasts.yml @@ -1,12 +1,10 @@ -models: - - formula: "~ treatment" - contrasts: - - id: "treatment_mCherry_hND6" - comparison: ["treatment", "mCherry", "hND6"] +contrasts: + - id: "treatment_mCherry_hND6" + comparison: ["treatment", "mCherry", "hND6"] - - id: "treatment_mCherry_hND6_sample_number" - comparison: ["treatment", "mCherry", "hND6"] - blocking_factors: ["sample_number"] + - id: "treatment_mCherry_hND6_sample_number" + comparison: ["treatment", "mCherry", "hND6"] + blocking_factors: ["sample_number"] - - id: "treatment234" - comparison: ["treatment", "mCherry", "hND6"] + - id: "treatment234" + comparison: ["treatment", "mCherry", "hND6"] diff --git a/modules/local/validatemodel/tests/contrasts_maxquant.yml b/modules/local/validatemodel/tests/contrasts_maxquant.yml deleted file mode 100644 index 05ee8e5e..00000000 --- a/modules/local/validatemodel/tests/contrasts_maxquant.yml +++ /dev/null @@ -1,12 +0,0 @@ -models: - - formula: "~ treatment" - contrasts: - - id: "treatment_mCherry_hND6" - comparison: ["treatment", "mCherry", "hND6"] - - - id: "treatment_mCherry_hND6_sample_number" - comparison: ["treatment", "mCherry", "hND6"] - blocking_factors: ["sample_number"] - - - id: "treatment234" - comparison: ["treatment", "mCherry", "hND6"] diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test index d3f2101c..7bc7f316 100644 --- a/modules/local/validatemodel/tests/main.nf.test +++ b/modules/local/validatemodel/tests/main.nf.test @@ -32,7 +32,8 @@ nextflow_process { then { assertAll( - { assert process.success } + { assert process.success }, + { assert snapshot(process.out).match() } ) } } diff --git a/modules/local/validatemodel/tests/main.nf.test.snap b/modules/local/validatemodel/tests/main.nf.test.snap new file mode 100644 index 00000000..24f19d55 --- /dev/null +++ b/modules/local/validatemodel/tests/main.nf.test.snap @@ -0,0 +1,57 @@ +{ + "Samplesheet - YML file - RNASEQ TEST": { + "content": [ + { + "0": [ + [ + { + "id": "test_yml" + }, + "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403" + ] + ], + "1": [ + [ + { + "id": "test_yml" + }, + "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "designs": [ + [ + { + "id": "test_yml" + }, + "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0" + ] + ], + "pheno_table": [ + [ + { + "id": "test_yml" + }, + "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403" + ] + ], + "versions": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "warnings": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2024-12-19T16:59:52.146249557" + } +} \ No newline at end of file From 6dafd957c9698ddfb4c9d02fca754d9d4ea0af1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Fri, 20 Dec 2024 20:18:02 +0000 Subject: [PATCH 08/10] Add condition to execute the module and update snaps --- conf/modules.config | 8 ++++++- modules/local/validatemodel/main.nf | 4 ++-- .../resources/usr/bin/validate_model.R | 22 ++++++++++++++++--- nextflow.config | 3 +++ tests/test.nf.test.snap | 17 +++++++++----- tests/test_maxquant.nf.test.snap | 17 +++++++++----- workflows/differentialabundance.nf | 14 +++++++----- 7 files changed, 64 insertions(+), 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6a1f1cb3..a0230c78 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,13 @@ process { } withName: VALIDATE_MODEL { - ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' } + ext.args = { + params.observations_name_col ? + "--sample_id_col ${params.observations_name_col}" : + params.observations_id_col ? + "--sample_id_col ${params.observations_id_col}" : + '' + } publishDir = [ path: { "${params.outdir}/validate" }, mode: params.publish_dir_mode, diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf index c7962219..afbb664b 100644 --- a/modules/local/validatemodel/main.nf +++ b/modules/local/validatemodel/main.nf @@ -8,8 +8,8 @@ process VALIDATE_MODEL { 'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }" input: - path samplesheet - tuple val(meta), path(models_yml) + tuple val(meta) , path(samplesheet) + tuple val(meta2), path(models_yml) output: tuple val(meta), path("pheno_table.csv"), emit: pheno_table diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R index 62e86932..f02df2ae 100755 --- a/modules/local/validatemodel/resources/usr/bin/validate_model.R +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -49,15 +49,31 @@ sample_column <- opt$sample_id_col # "sample" # # LOAD FILES -------------------------------------------------------- ## Load models.yml file tryCatch({ - models <- read_yaml(path_yml) + if (!file.exists(path_yml)) { + stop(sprintf("File '%s' does not exist.", path_yml), call. = FALSE) + } + if (!grepl("\\.(yaml|yml)$", path_yml, ignore.case = TRUE)) { + stop(sprintf("File '%s' is not a YAML file (expected .yaml or .yml extension).", path_yml), call. = FALSE) + } + models <- yaml::read_yaml(path_yml) cat("Loaded YML file successfully.\n") }, error = function(e) { - stop("Error loading YML file: ", e$message) + stop(sprintf("Error loading YML file '%s': %s", path_yml, e$message), call. = FALSE) }) ## Load samplesheet CSV file tryCatch({ - samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) + # Set the separator based on the file extension + sep <- if (str_detect(path_samplesheet, "\\.(csv|CSV)$")) { + "," + } else if (str_detect(path_samplesheet, "\\.(tsv|TSV)$")) { + "\t" + } else { + stop("Unsupported file extension. Please provide a .csv or .tsv file.") + } + + # Read the file using the determined separator + samplesheet <- read_delim(path_samplesheet, delim = sep, show_col_types = FALSE) if ( !sample_column %in% colnames(samplesheet) ) { stop(paste0("The sample identificator column '", sample_column, "' is not present in the samplesheet file") ) } diff --git a/nextflow.config b/nextflow.config index 230100ac..2ebb7960 100644 --- a/nextflow.config +++ b/nextflow.config @@ -377,6 +377,9 @@ set -C # No clobber - prevent output redirection from overwriting files. // Disable process selector warnings by default. Use debug profile to enable warnings. nextflow.enable.configProcessNamesValidation = false +// Enable loading modules bin files +nextflow.enable.moduleBinaries = true + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap index 37f61c5e..bc141b95 100644 --- a/tests/test.nf.test.snap +++ b/tests/test.nf.test.snap @@ -190,7 +190,7 @@ }, "Test profile with yaml contrasts": { "content": [ - 21, + 22, { "DESEQ2_DIFFERENTIAL": { "r-base": "4.1.3", @@ -321,7 +321,11 @@ "tables/differential/treatment_mCherry_hND6_sample_number.deseq2.results_filtered.tsv", "tables/processed_abundance", "tables/processed_abundance/all.normalised_counts.tsv", - "tables/processed_abundance/all.vst.tsv" + "tables/processed_abundance/all.vst.tsv", + "validate", + "validate/designs.json", + "validate/pheno_table.csv", + "validate/versions.yml" ], [ "treatment_mCherry_hND6_.dds.rld.rds:md5,dfe51910e230ae9cc2bc9fa5651666ba", @@ -368,13 +372,16 @@ "treatment_mCherry_hND6_sample_number.deseq2.results.tsv:md5,b41cbaacd04a25b45295f3f987ff7500", "treatment_mCherry_hND6_sample_number.deseq2.results_filtered.tsv:md5,98fa3ec0992859d2a524ecce1259f0ed", "all.normalised_counts.tsv:md5,1d7ad0c02b483f2eff1a5b357a74d011", - "all.vst.tsv:md5,a08d06d3c3218619b7bd85beead467bd" + "all.vst.tsv:md5,a08d06d3c3218619b7bd85beead467bd", + "designs.json:md5,088d536fdff430dc12f4f8b0395a9636", + "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403", + "versions.yml:md5,592a22e359ab942c2d315ef0c6ed3fb1" ] ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-20T13:41:10.4647836" + "timestamp": "2024-12-20T19:25:36.541448998" } } \ No newline at end of file diff --git a/tests/test_maxquant.nf.test.snap b/tests/test_maxquant.nf.test.snap index 619ba2b7..9da3a787 100644 --- a/tests/test_maxquant.nf.test.snap +++ b/tests/test_maxquant.nf.test.snap @@ -177,7 +177,7 @@ }, "Test maxquant profile with yaml contrasts": { "content": [ - 20, + 21, { "LIMMA_DIFFERENTIAL": { "r-base": "4.3.3", @@ -289,7 +289,11 @@ "tables/proteus/Celltype/raw_proteingroups_tab.tsv", "tables/proteus/fakeBatch", "tables/proteus/fakeBatch/normalizeMedian.normalized_proteingroups_tab.tsv", - "tables/proteus/fakeBatch/raw_proteingroups_tab.tsv" + "tables/proteus/fakeBatch/raw_proteingroups_tab.tsv", + "validate", + "validate/designs.json", + "validate/pheno_table.csv", + "validate/versions.yml" ], [ "fakebatch_fakeBatch_b1_b2.MArrayLM.limma.rds:md5,f321bf1e4fd44827810df9a9c38776d6", @@ -342,13 +346,16 @@ "normalizeMedian.normalized_proteingroups_tab.tsv:md5,154fcd8d23409981b897d153e7b7e34e", "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a", "normalizeMedian.normalized_proteingroups_tab.tsv:md5,154fcd8d23409981b897d153e7b7e34e", - "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a" + "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a", + "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006", + "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a", + "versions.yml:md5,592a22e359ab942c2d315ef0c6ed3fb1" ] ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-20T13:45:34.62418232" + "timestamp": "2024-12-20T20:09:08.805617631" } } \ No newline at end of file diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index fad64752..3bad756b 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -150,11 +150,15 @@ workflow DIFFERENTIALABUNDANCE { ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) - // Run module to validate models - VALIDATE_MODEL( - ch_input, - ch_contrasts_file - ) + // Run module to validate models from yml file + if ( params.contrasts.endsWith(".yaml") || params.contrasts.endsWith(".yml") ) { + + VALIDATE_MODEL ( + ch_input, + ch_contrasts_file + ) + + } // If we have affy array data in the form of CEL files we'll be deriving // matrix and annotation from them From 17e1e378b9012f3bcc7db0127e8939f1d65c0a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Sat, 21 Dec 2024 15:41:25 +0000 Subject: [PATCH 09/10] address PR comments --- .../resources/usr/bin/validate_model.R | 15 ++-- .../local/validatemodel/tests/contrasts.yml | 10 --- .../local/validatemodel/tests/main.nf.test | 36 +++++++++- .../validatemodel/tests/main.nf.test.snap | 71 ++++++++++++++++--- .../local/validatemodel/tests/nextflow.config | 2 +- tests/test.nf.test.snap | 11 ++- workflows/differentialabundance.nf | 2 + 7 files changed, 114 insertions(+), 33 deletions(-) delete mode 100644 modules/local/validatemodel/tests/contrasts.yml diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R index f02df2ae..05dbf087 100755 --- a/modules/local/validatemodel/resources/usr/bin/validate_model.R +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -42,9 +42,9 @@ if (is.null(opt$yml) || is.null(opt$samplesheet) ) { } ### Collect parameters (easier for dev and testing) -path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" # -path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" # -sample_column <- opt$sample_id_col # "sample" # +path_yml <- opt$yml +path_samplesheet <- opt$samplesheet +sample_column <- opt$sample_id_col # LOAD FILES -------------------------------------------------------- ## Load models.yml file @@ -208,11 +208,11 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col continuous <- c() ## Do not allow special characters - undesired_chars <- "[^a-zA-Z0-9_.]" + undesired_chars <- regex("[^a-zA-Z0-9_.]") ## Check samplesheet names for invalid colnames df_colnames <- names(samplesheet) - true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars)) + true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars) if ( sum(true_columns) > 0 ) { errors <- c(errors, @@ -268,7 +268,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col } ## Check that the column data does not contain undesired characters - if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) { + if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = undesired_chars ), na.rm = TRUE) > 0 ) { errors <- c(errors, paste0("Column ", VARIABLE, " contains undesired characters\n") ) @@ -350,8 +350,6 @@ pheno_table <- phenotable_warnings[[1]] ## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED check_model_contrasts <- function(contrasts_list, colData) { - #contrasts_list <- contrasts_list - #colData <- pheno_table ####################################################### ## ## Function that takes a list with models + variables, and a phenotypic table @@ -372,7 +370,6 @@ check_model_contrasts <- function(contrasts_list, colData) { design_list <- list() ### Iterate over each model for (model_name in names(contrasts_list)) { - #model_name <- names(contrasts_list[2]) ## Extract model components model <- contrasts_list[[model_name]] diff --git a/modules/local/validatemodel/tests/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml deleted file mode 100644 index 5e893629..00000000 --- a/modules/local/validatemodel/tests/contrasts.yml +++ /dev/null @@ -1,10 +0,0 @@ -contrasts: - - id: "treatment_mCherry_hND6" - comparison: ["treatment", "mCherry", "hND6"] - - - id: "treatment_mCherry_hND6_sample_number" - comparison: ["treatment", "mCherry", "hND6"] - blocking_factors: ["sample_number"] - - - id: "treatment234" - comparison: ["treatment", "mCherry", "hND6"] diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test index 7bc7f316..e35f5de9 100644 --- a/modules/local/validatemodel/tests/main.nf.test +++ b/modules/local/validatemodel/tests/main.nf.test @@ -10,21 +10,22 @@ nextflow_process { tag "yml" tag "samplesheet" - test("Samplesheet - YML file - RNASEQ TEST") { + test("Samplesheet - YML file - TEST") { when { params { - observations_id_col = 'sample' + module_args = "--sample_id_col 'sample'" } process { """ // Samplesheet input[0] = Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv', checkIfExists: true) + .map{ [ [id: "test_samplesheet"], it ] } // Models_yml sheet input[1] = - Channel.fromPath('${moduleDir}/tests/contrasts.yml', checkIfExists: true) + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/refs/heads/differentialabundance/testdata/SRP254919.contrasts.yaml", checkIfExists: true) .map{ [ [id: "test_yml"], it ] } """ } @@ -37,4 +38,33 @@ nextflow_process { ) } } + + test("Samplesheet - YML file - MaxQuant TEST") { + + when { + params { + module_args = "--sample_id_col 'Name'" + } + process { + """ + // Samplesheet + input[0] = + Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv', checkIfExists: true) + .map{ [ [id: "maxquant_samplesheet"], it ] } + + // Models_yml sheet + input[1] = + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/refs/heads/differentialabundance/testdata/MaxQuant_contrasts.yaml", checkIfExists: true) + .map{ [ [id: "maxquant_yml"], it ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } } diff --git a/modules/local/validatemodel/tests/main.nf.test.snap b/modules/local/validatemodel/tests/main.nf.test.snap index 24f19d55..24d37912 100644 --- a/modules/local/validatemodel/tests/main.nf.test.snap +++ b/modules/local/validatemodel/tests/main.nf.test.snap @@ -1,11 +1,66 @@ { - "Samplesheet - YML file - RNASEQ TEST": { + "Samplesheet - YML file - MaxQuant TEST": { "content": [ { "0": [ [ { - "id": "test_yml" + "id": "maxquant_samplesheet" + }, + "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a" + ] + ], + "1": [ + [ + { + "id": "maxquant_samplesheet" + }, + "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "designs": [ + [ + { + "id": "maxquant_samplesheet" + }, + "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006" + ] + ], + "pheno_table": [ + [ + { + "id": "maxquant_samplesheet" + }, + "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a" + ] + ], + "versions": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "warnings": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2024-12-21T15:37:37.285885311" + }, + "Samplesheet - YML file - TEST": { + "content": [ + { + "0": [ + [ + { + "id": "test_samplesheet" }, "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403" ] @@ -13,9 +68,9 @@ "1": [ [ { - "id": "test_yml" + "id": "test_samplesheet" }, - "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0" + "designs.json:md5,088d536fdff430dc12f4f8b0395a9636" ] ], "2": [ @@ -27,15 +82,15 @@ "designs": [ [ { - "id": "test_yml" + "id": "test_samplesheet" }, - "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0" + "designs.json:md5,088d536fdff430dc12f4f8b0395a9636" ] ], "pheno_table": [ [ { - "id": "test_yml" + "id": "test_samplesheet" }, "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403" ] @@ -52,6 +107,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-19T16:59:52.146249557" + "timestamp": "2024-12-21T15:37:15.759211744" } } \ No newline at end of file diff --git a/modules/local/validatemodel/tests/nextflow.config b/modules/local/validatemodel/tests/nextflow.config index 52a943d5..32115f62 100644 --- a/modules/local/validatemodel/tests/nextflow.config +++ b/modules/local/validatemodel/tests/nextflow.config @@ -1,6 +1,6 @@ nextflow.enable.moduleBinaries = true process { withName: 'VALIDATE_MODEL' { - ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' } + ext.args = params.module_args } } diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap index 142e1497..777b30c1 100644 --- a/tests/test.nf.test.snap +++ b/tests/test.nf.test.snap @@ -215,6 +215,13 @@ "PLOT_EXPLORATORY": { "r-shinyngs": "2.0.0" }, + "VALIDATE_MODEL": { + "r-base": "4.4.2", + "tidyverse": "2.0.0", + "optparse": "1.7.5", + "yaml": "2.3.10", + "jsonlite": "1.8.9" + }, "VALIDATOR": { "r-base": "4.3.3", "r-shinyngs": "2.0.0" @@ -382,6 +389,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-20T19:25:36.541448998" + "timestamp": "2024-12-21T15:26:26.473702144" } -} +} \ No newline at end of file diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index ce527d66..707395dc 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -158,6 +158,8 @@ workflow DIFFERENTIALABUNDANCE { ch_contrasts_file ) + ch_versions = ch_versions + .mix(VALIDATE_MODEL.out.versions) } // If we have affy array data in the form of CEL files we'll be deriving From 6ed079319d408163cb0ef3e337db6d5b7ec9578c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Sat, 21 Dec 2024 16:06:27 +0000 Subject: [PATCH 10/10] update snaps --- tests/test_maxquant.nf.test.snap | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_maxquant.nf.test.snap b/tests/test_maxquant.nf.test.snap index 9da3a787..3cc0edfa 100644 --- a/tests/test_maxquant.nf.test.snap +++ b/tests/test_maxquant.nf.test.snap @@ -196,6 +196,13 @@ "r-plotly": "4.10.2", "bioconductor-limma": "3.54.0" }, + "VALIDATE_MODEL": { + "r-base": "4.4.2", + "tidyverse": "2.0.0", + "optparse": "1.7.5", + "yaml": "2.3.10", + "jsonlite": "1.8.9" + }, "VALIDATOR": { "r-base": "4.3.3", "r-shinyngs": "2.0.0" @@ -356,6 +363,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-20T20:09:08.805617631" + "timestamp": "2024-12-21T15:58:50.7683424" } } \ No newline at end of file