diff --git a/conf/modules.config b/conf/modules.config index b18c8b1d..94f6db2b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -42,6 +42,21 @@ process { ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'" } + withName: VALIDATE_MODEL { + ext.args = { + params.observations_name_col ? + "--sample_id_col ${params.observations_name_col}" : + params.observations_id_col ? + "--sample_id_col ${params.observations_id_col}" : + '' + } + publishDir = [ + path: { "${params.outdir}/validate" }, + mode: params.publish_dir_mode, + ] + + } + withName: AFFY_JUSTRMA_RAW { publishDir = [ [ diff --git a/modules/local/validatemodel/environment.yml b/modules/local/validatemodel/environment.yml new file mode 100644 index 00000000..9a8af4a9 --- /dev/null +++ b/modules/local/validatemodel/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-jsonlite + - conda-forge::r-optparse + - conda-forge::r-tidyverse + - conda-forge::r-yaml diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf new file mode 100644 index 00000000..afbb664b --- /dev/null +++ b/modules/local/validatemodel/main.nf @@ -0,0 +1,57 @@ +process VALIDATE_MODEL { + tag "${models_yml}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f5/f5ef116da0b04fc6359efa48f08b9acfc040a02d7a1ec62476c77b6db0fc1499/data' : + 'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }" + + input: + tuple val(meta) , path(samplesheet) + tuple val(meta2), path(models_yml) + + output: + tuple val(meta), path("pheno_table.csv"), emit: pheno_table + tuple val(meta), path("designs.json") , emit: designs + tuple val(meta), path("warnings.json") , emit: warnings, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + validate_model.R \\ + --yml "${models_yml}" \\ + --samplesheet "${samplesheet}" \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p') + tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))") + yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + END_VERSIONS + """ + + stub: + """ + touch pheno_table.csv + touch designs.json + touch warnings.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p') + tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))") + yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + END_VERSIONS + """ + +} diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R new file mode 100755 index 00000000..05dbf087 --- /dev/null +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -0,0 +1,431 @@ +#!/usr/bin/env Rscript + +## TODO: add make contrast arguments + +# LOAD LIBRARIES ---------------------------------------------------- +suppressWarnings(suppressMessages({ + library(tidyverse) + library(optparse) + library(yaml) + library(jsonlite) +})) + +# PARSE ARGUMENTS --------------------------------------------------- +## Generate list +option_list <- list( + make_option(c("-y", "--yml"), type = "character", default = NULL, + help = "Path to the models.yml file", metavar = "character"), + make_option(c("-s", "--samplesheet"), type = "character", default = NULL, + help = "Path to the samplesheet CSV file", metavar = "character"), + make_option(c("-i", "--sample_id_col"), type = "character", default = "sample", + help = "Column that contains sample identificators", metavar = "character") +) + +## Parse command-line arguments +opt_parser <- + OptionParser( + option_list = option_list, + description = "Validate a sample sheet against a YML file for model designs.", + epilogue = + " + The process will look for variables and factors in the YML file and validate that they are present in the sample sheet. + Will also look for unwanted characters within columns and colnames. + Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not. + " + ) +opt <- parse_args(opt_parser) + +## Validate input arguments +### Required arguments +if (is.null(opt$yml) || is.null(opt$samplesheet) ) { + stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE) +} + +### Collect parameters (easier for dev and testing) +path_yml <- opt$yml +path_samplesheet <- opt$samplesheet +sample_column <- opt$sample_id_col + +# LOAD FILES -------------------------------------------------------- +## Load models.yml file +tryCatch({ + if (!file.exists(path_yml)) { + stop(sprintf("File '%s' does not exist.", path_yml), call. = FALSE) + } + if (!grepl("\\.(yaml|yml)$", path_yml, ignore.case = TRUE)) { + stop(sprintf("File '%s' is not a YAML file (expected .yaml or .yml extension).", path_yml), call. = FALSE) + } + models <- yaml::read_yaml(path_yml) + cat("Loaded YML file successfully.\n") +}, error = function(e) { + stop(sprintf("Error loading YML file '%s': %s", path_yml, e$message), call. = FALSE) +}) + +## Load samplesheet CSV file +tryCatch({ + # Set the separator based on the file extension + sep <- if (str_detect(path_samplesheet, "\\.(csv|CSV)$")) { + "," + } else if (str_detect(path_samplesheet, "\\.(tsv|TSV)$")) { + "\t" + } else { + stop("Unsupported file extension. Please provide a .csv or .tsv file.") + } + + # Read the file using the determined separator + samplesheet <- read_delim(path_samplesheet, delim = sep, show_col_types = FALSE) + if ( !sample_column %in% colnames(samplesheet) ) { + stop(paste0("The sample identificator column '", sample_column, "' is not present in the samplesheet file") ) + } + cat("Loaded samplesheet successfully.\n") +}, error = function(e) { + stop("Error loading samplesheet CSV: ", e$message) +}) + +# PARSE FACTORS/LEVELS FROM YML FILE -------------------------------- +## Collect all variables and factors from the yml file +## * var_list: collects all factors levels for each variable. +## * contrast_list: collects info for each contrast + +process_models <- function(models) { + + # Initialize lists to store the output + contrasts_list <- list() + var <- list() + + # Ensure models$models is not null or empty + if (is.null(models$contrasts) || length(models$contrasts) == 0) { + stop("models$contrasts is null or empty. Please provide valid input.") + } + + # Temporary storage for gathering contrasts per variable + temp_var <- list() + blocking_vars <- c() + + for (CONTRAST in models$contrasts) { + + # Validate CONTRAST$id + if (is.null(CONTRAST$id)) { + stop("Missing CONTRAST$id detected.") + } + + if (CONTRAST$id %in% names(contrasts_list)) { + stop(paste0("Duplicate CONTRAST$id detected: ", CONTRAST$id, ".")) + } + + # Get column name: first comparison's element + variable <- CONTRAST$comparison[1] + + # Check if name is valid + if (is.null(variable) || variable == "") { + stop("Invalid `variable` defined in `CONTRAST$comparison[1]`.") + } + + # Get factors (rest of components) + levels <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + + # Populate contrasts_list for later model validation + contrasts_list[[CONTRAST$id]] <- list( + "variable" = variable, + "contrast" = levels, + "blocking_factors" = CONTRAST$blocking_factors + ) + + # Gather contrasts per variable into temporary storage + if (variable %in% names(temp_var)) { + temp_var[[variable]] <- unique(c(temp_var[[variable]], levels)) + } else { + temp_var[[variable]] <- levels + } + + # Gather blocking factors + if (!is.null(CONTRAST$blocking_factors)) { + blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors)) + } + } + + # Consolidate gathered contrasts into `var` + for (variable in names(temp_var)) { + if (variable %in% names(var)) { + var[[variable]] <- unique(c(var[[variable]], temp_var[[variable]])) + } else { + var[[variable]] <- temp_var[[variable]] + } + } + + # Add blocking variables to `var` + var[[ "blocking_factors" ]] <- blocking_vars + + # Return the populated lists + return(list( + contrasts_list = contrasts_list, + var = var + )) +} + +## Process models +models_lists <- process_models(models) +### Extract models contrasts +contrasts_list <- models_lists[[ "contrasts_list" ]] +### Extract variables and levels +var <- models_lists[[ "var" ]] + +## Print explicit messages +for (INDEX in 1:length(var)) { + cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "") +} + +# VALIDATE SAMPLESHEET BASED ON YML FILE ---------------------------- +## Define function to compare YML and samplesheet +validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df + + ####################################################### + ## + ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet. + ## + ## The function evaluates that: + ## * Every variable is present in the sample sheet. + ## * Every variable has valid name + ## * There are no NAs + ## * All expected values are present + ## * All samples contains a valid level. + ## + ## Finally, the function returns + ## * An error vector containing all errors found. + ## * A warnings vector + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ## Initialize errors vector + errors <- c() + + ## Initialize warning errors + warnings <- c() + + # Initialize vector to report continuos variables + continuous <- c() + + ## Do not allow special characters + undesired_chars <- regex("[^a-zA-Z0-9_.]") + + ## Check samplesheet names for invalid colnames + df_colnames <- names(samplesheet) + true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars) + + if ( sum(true_columns) > 0 ) { + errors <- c(errors, + paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " ")) + ) + } + + ## Check that blocking variables exists and do not contain NAs, if they were specified + blocking_factors <- c() + if ( !is.null(variables$blocking_factors) ) { + blocking_factors <- variables$blocking_factors + + for (VARIABLE in blocking_factors) { + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + ## Check if there are NAs + na_rows <- is.na( samplesheet[[ VARIABLE ]]) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) + } + } else { + errors <- c( + errors, + paste0("Blocking factor ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- c(continuous, VARIABLE) + } else { + paste(VARIABLE, "not continuous") + } + } + } + + ## Check variables and levels specified + for (VARIABLE in names(variables)[ names(variables) != "blocking_factors" ] ) { ## Exclude blocking factors, they cannot be treated equally here + + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- unique(c(continuous, VARIABLE)) + } else { + paste(VARIABLE, "is not continuous") + } + + ## Check if there are NAs + na_rows <- is.na( samplesheet[[ VARIABLE ]] ) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) + } + + ## Check that the column data does not contain undesired characters + if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = undesired_chars ), na.rm = TRUE) > 0 ) { + errors <- c(errors, + paste0("Column ", VARIABLE, " contains undesired characters\n") + ) + } + + ## Extract the expected levels according to the YML file + expected_factors_levels <- c( rep(NA, length(variables[[ VARIABLE ]])) ) + + ## Assign names + names(expected_factors_levels) <- variables[[ VARIABLE ]] + + ## Extract the levels from the sample sheet + obtained_factors_levels <- unique(samplesheet[[ VARIABLE ]]) + + ## Check that all expected levels for this variable are effectively present + for (LEVEL in names(expected_factors_levels) ) { + expected_factors_levels[ LEVEL ] <- LEVEL %in% obtained_factors_levels + } + + if( !all(expected_factors_levels) ) { + errors <- c( + errors, + paste0( + "Missing factor levels for variable '", VARIABLE, "'. ", + "Present levels: ", paste( names( expected_factors_levels[ expected_factors_levels ]), collapse = " " ), ". ", + "Missing levels: ", paste( names( expected_factors_levels[!expected_factors_levels ]), collapse = " " ) + ) + ) + } + + ## Check whether there are levels in the table that are not present in the models definitions + if (!all( obtained_factors_levels[!is.na(obtained_factors_levels)] %in% names(expected_factors_levels)) ) { + warnings <- c(warnings, + paste0( "The following labels are present in the samplesheet but are not part of the models definition and will be excluded: ", paste( obtained_factors_levels[!obtained_factors_levels %in% names(expected_factors_levels) ], collapse = " " ) )) + } + + ## Report that a column is not even present in the table + } else { + errors <- c( + errors, + paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + } + + ## Report if any variable was imported as continuous + if ( !is.null(continuous)) { + warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) + } + + ## Report ERRORS and stop + if (length(errors) > 0 ) { + stop(cat("\033[1;31mSome errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\033[0m\n", sep = "")) + } + + ## Generate validated phenotypic table + tryCatch({ + ## Get desired columns, starting with the one containing sample identificators + selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors) + pheno_table <- samplesheet %>% dplyr::select(all_of(selected_columns)) + + }, error = function(e) { + stop("Error generating validated phenotypic table: ", e$message) + }) + + ## Return list + return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) ) + +} + +## Collect all outputs from function: errors must be empty, warnings can have messages +phenotable_warnings <- validate_model(sample_column, var, samplesheet) + +## Cat warnings messages +cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n") + +## Get validated pheno table +pheno_table <- phenotable_warnings[[1]] + +## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED +check_model_contrasts <- function(contrasts_list, colData) { + + ####################################################### + ## + ## Function that takes a list with models + variables, and a phenotypic table + ## + ## The function evaluates that: + ## * The formulas are usable + ## * The models generated are full ranked + ## + ## Finally, the function returns + ## * An list with all designs + ## * full ranked: true/false + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ### Initialize list for models descriptors + design_list <- list() + ### Iterate over each model + for (model_name in names(contrasts_list)) { + + ## Extract model components + model <- contrasts_list[[model_name]] + variable <- model$variable + contrast <- model$contrast + blocking <- model$blocking_factors + + ## Create formula + ### Gather components + formula_terms <- c(variable, blocking) + + ### Convert into formula + # TODO: Add flexibility for interactive terms! + dynamic_formula <- as.formula(paste("~", paste(formula_terms, collapse = "+"))) + + cat("\nChecking:", model_name, "\nFormula:", deparse(dynamic_formula), "\n") + + # Build the design matrix + design_matrix <- model.matrix(dynamic_formula, data = colData) + + # Check the rank of the design matrix + rank <- qr(design_matrix)$rank + expected_rank <- ncol(design_matrix) + + ## Add results to list + design_list[[ length(design_list) + 1 ]] <- list( + "formula" = deparse(dynamic_formula), + "design_matrix" = design_matrix, + "full_rank" = rank == expected_rank + ) + names(design_list)[ length(design_list) ] <- model_name + } + + for (DESIGN in names(design_list)) { + cat("Design ", DESIGN, + if( design_list[[ DESIGN ]]$full_rank ) { " is " } else { " is not "}, + "full ranked\n", sep = "" + ) + } + + return(design_list) +} + +design_results <- check_model_contrasts(contrasts_list, pheno_table) + +# EXPORT DATA ------------------------------------------------------- +## Export pheno table +write_csv(pheno_table, "pheno_table.csv") + +## Export designs to JSON +jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE) + +## Export warnings +if ( !is.null(phenotable_warnings[[ 2 ]]) ) { + jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE) +} + +## Export RData +save.image("models.RData") diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp b/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp new file mode 100755 index 00000000..f7470ff1 --- /dev/null +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp @@ -0,0 +1,433 @@ +#!/usr/bin/env Rscript + +## TODO: add make contrast arguments + +# LOAD LIBRARIES ---------------------------------------------------- +suppressWarnings(suppressMessages({ + library(tidyverse) + library(optparse) + library(yaml) + library(jsonlite) +})) + +# PARSE ARGUMENTS --------------------------------------------------- +## Generate list +option_list <- list( + make_option(c("-y", "--yml"), type = "character", default = NULL, + help = "Path to the models.yml file", metavar = "character"), + make_option(c("-s", "--samplesheet"), type = "character", default = NULL, + help = "Path to the samplesheet CSV file", metavar = "character"), + make_option(c("-i", "--sample_id_col"), type = "character", default = "sample", + help = "Column that contains sample identificators", metavar = "character") +) + +## Parse command-line arguments +opt_parser <- + OptionParser( + option_list = option_list, + description = "Validate a sample sheet against a YML file for model designs.", + epilogue = + " + The process will look for variables and factors in the YML file and validate that they are present in the sample sheet. + Will also look for unwanted characters within columns and colnames. + Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not. + " + ) +opt <- parse_args(opt_parser) + +## Validate input arguments +### Required arguments +if (is.null(opt$yml) || is.null(opt$samplesheet) ) { + stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE) +} + +### Collect parameters (easier for dev and testing) +path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" # +path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" # +sample_column <- opt$sample_id_col # "sample" # + +# LOAD FILES -------------------------------------------------------- +## Load models.yml file +tryCatch({ + models <- read_yaml(path_yml) + cat("Loaded YML file successfully.\n") +}, error = function(e) { + stop("Error loading YML file: ", e$message) +}) + +## Load samplesheet CSV file +tryCatch({ + samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) + if ( !sample_column %in% colnames(samplesheet) ) { + stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") ) + } + cat("Loaded samplesheet successfully.\n") +}, error = function(e) { + stop("Error loading samplesheet CSV: ", e$message) +}) + +# PARSE FACTORS/LEVELS FROM YML FILE -------------------------------- +## Collect all variables and factors from the yml file +## * var_list: collects all factors levels for each variable. +## * contrast_list: collects info for each contrast + +process_models <- function(models) { + # Initialize lists to store the output + contrasts_list <- list() + var <- list() + + # Ensure models$models is not null or empty + if (is.null(models$models) || length(models$models) == 0) { + stop("models$models is null or empty. Please provide valid input.") + } + + # Temporary storage for gathering contrasts per variable + temp_var <- list() + blocking_vars <- c() + + # Iterate through models (formula) + for (FORMULA in models$models) { + + # Ensure FORMULA$contrasts is not null or empty + if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) { + stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.") + } + + # Iterate through contrasts for each model (formula) + for (CONTRAST in FORMULA$contrasts) { + + # Validate CONTRAST$id + if (is.null(CONTRAST$id)) { + stop("Missing CONTRAST$id detected. Unable to proceed.") + } + + if (CONTRAST$id %in% names(contrasts_list)) { + stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed.")) + } + + # Get column name: first comparison's element + name <- CONTRAST$comparison[1] + + # Check if name is valid + if (is.null(name) || name == "") { + stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.") + } + + # Get factors (rest of components) + variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] + + # Populate contrasts_list for later model validation + contrasts_list[[CONTRAST$id]] <- list( + "formula" = FORMULA$formula, + "variable" = name, + "contrast" = variables, + "blocking_factors" = CONTRAST$blocking_factors + ) + + # Gather contrasts per variable into temporary storage + if (name %in% names(temp_var)) { + temp_var[[name]] <- unique(c(temp_var[[name]], variables)) + } else { + temp_var[[name]] <- variables + } + + # Gather blocking factors + if (!is.null(CONTRAST$blocking_factors)) { + blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors)) + } + } + } + + # Consolidate gathered contrasts into `var` + for (name in names(temp_var)) { + if (name %in% names(var)) { + var[[name]] <- unique(c(var[[name]], temp_var[[name]])) + } else { + var[[name]] <- temp_var[[name]] + } + } + + # Add blocking variables to `var` + var[[ "blocking_factors" ]] <- blocking_vars + + # Return the populated lists + return(list( + contrasts_list = contrasts_list, + var = var + )) +} + +## Process models +models_lists <- process_models(models) +### Extract models contrasts +contrasts_list <- models_lists[["contrasts_list"]] +### Extract variables and levels +var <- models_lists[["var"]] + +## Print explicit messages +for (INDEX in 1:length(var)) { + cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "") +} + +# VALIDATE SAMPLESHEET BASED ON YML FILE ---------------------------- +## Define function to compare YML and samplesheet +validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df + + ####################################################### + ## + ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet. + ## + ## The function evaluates that: + ## * Every variable is present in the sample sheet. + ## * Every variable has valid name + ## * There are no NAs + ## * All expected values are present + ## * All samples contains a valid level. + ## + ## Finally, the function returns + ## * An error vector containing all errors found. + ## * A warnings vector + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ## Initialize errors vector + errors <- c() + + ## Initialize warning errors + warnings <- c() + + # Initialize vector to report continuos variables + continuous <- c() + + ## Do not allow special characters + undesired_chars <- "[^a-zA-Z0-9_.]" + + ## Check samplesheet names for invalid colnames + df_colnames <- names(samplesheet) + true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars)) + + if ( sum(true_columns) > 0 ) { + errors <- c(errors, + paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " ")) + ) + } + + ## Check that blocking variables exists and do not contain NAs, if they were specified + blocking_factors <- c() + if ( !is.null(variables$blocking_factors) ) { + blocking_factors <- variables$blocking_factors + + for (VARIABLE in blocking_factors) { + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + ## Check if there are NAs + na_rows <- is.na( samplesheet[[ VARIABLE ]]) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) + } + } else { + errors <- c( + errors, + paste0("Blocking factor ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- c(continuous, VARIABLE) + } else { + paste(VARIABLE, "not continuous") + } + } + } + + ## Check variables and levels specified + for (VARIABLE in names(variables)[ names(variables) != "blocking_factors" ] ) { ## Exclude blocking factors, they cannot be treated equally here + + ## Check that the column exists + if ( VARIABLE %in% colnames(samplesheet) ) { + + ## Alert about continuous variables + if ( is.numeric(samplesheet[[ VARIABLE ]]) ) { + continuous <- unique(c(continuous, VARIABLE)) + } else { + paste(VARIABLE, "is not continuous") + } + + ## Check if there are NAs + na_rows <- is.na( samplesheet[[ VARIABLE ]] ) + if ( sum(na_rows) > 0 ) { + errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " "))) + } + + ## Check that the column data does not contain undesidered characters + if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) { + errors <- c(errors, + paste0("Column ", VARIABLE, " contains undesired characters\n") + ) + } + + ## Extract the expected levels according to the YML file + expected_factors_levels <- c( rep(NA, length(variables[[ VARIABLE ]])) ) + + ## Assign names + names(expected_factors_levels) <- variables[[ VARIABLE ]] + + ## Extract the levels from the sample sheet + obtained_factors_levels <- unique(samplesheet[[ VARIABLE ]]) + + ## Check that all expected levels for this variable are effectively present + for (LEVEL in names(expected_factors_levels) ) { + expected_factors_levels[ LEVEL ] <- LEVEL %in% obtained_factors_levels + } + + if( !all(expected_factors_levels) ) { + errors <- c( + errors, + paste0( + "Missing factor levels for variable '", VARIABLE, "'. ", + "Present levels: ", paste( names( expected_factors_levels[ expected_factors_levels ]), collapse = " " ), ". ", + "Missing levels: ", paste( names( expected_factors_levels[!expected_factors_levels ]), collapse = " " ) + ) + ) + } + + ## Check whether there are levels in the table that are not present in the models definitions + if (!all( obtained_factors_levels[!is.na(obtained_factors_levels)] %in% names(expected_factors_levels)) ) { + warnings <- c(warnings, + paste0( "The following labels are present in the samplesheet but are not part of the models definition and will be excluded: ", paste( obtained_factors_levels[!obtained_factors_levels %in% names(expected_factors_levels) ], collapse = " " ) )) + } + + ## Report that a column is not even present in the table + } else { + errors <- c( + errors, + paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files.")) + } + } + + ## Report if any variable was imported as continuous + if ( !is.null(continuous)) { + warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) + } + + ## Report ERRORS and stop + if (length(errors) > 0 ) { + stop(cat("\033[1;31mSome errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\033[0m\n", sep = "")) + } + + ## Generate validated phenotypic table + tryCatch({ + ## Get desired columns, starting with the one containing sample identificators + selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors) + pheno_table <- samplesheet %>% dplyr::select(all_of(selected_columns)) + + }, error = function(e) { + stop("Error generating validated phenotypic table: ", e$message) + }) + + ## Return list + return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) ) + +} + +## Collect all outputs from function: errors must be empty, warnings can have messages +phenotable_warnings <- validate_model(sample_column, var, samplesheet) + +## Cat warnings messages +cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n") + +## Get validated pheno table +pheno_table <- phenotable_warnings[[1]] + +## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED +check_model_contrasts <- function(contrasts_list, colData) { + + ####################################################### + ## + ## Function that takes a list with models + variables, and a phenotypic table + ## + ## The function evaluates that: + ## * The formulas are usable + ## * The models generated are full ranked + ## + ## Finally, the function returns + ## * An list with all designs + ## * full ranked: true/false + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ### Initialize list for models descriptors + design_list <- list() + ### Iterate over each model + for (model_name in names(contrasts_list)) { + + ## Extract model components + model <- contrasts_list[[model_name]] + base_formula <- as.formula(model$formula) + variable <- model$variable + contrast <- model$contrast + blocking <- model$blocking_factor + + ## Check if blocking factor is already in the formula + if (!is.null(blocking)) { + ## Get terms from the formula + formula_terms <- all.vars(base_formula) + + ## If blocking factor is not already in the formula, add it + if (!(blocking %in% formula_terms)) { + updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking)) + } else { + updated_formula <- base_formula + } + } else { + updated_formula <- base_formula + } + + cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n") + + # Build the design matrix + design_matrix <- model.matrix(updated_formula, data = colData) + + # Check the rank of the design matrix + rank <- qr(design_matrix)$rank + expected_rank <- ncol(design_matrix) + + ## Add results to list + design_list[[ length(design_list) + 1 ]] <- list( + "formula" = deparse(updated_formula), + "design_matrix" = design_matrix, + "full_rank" = rank == expected_rank + ) + names(design_list)[ length(design_list) ] <- model_name + } + + for (DESIGN in names(design_list)) { + cat("Design ", DESIGN, + if( design_list[[ DESIGN ]]$full_rank ) { " is " } else { " is not "}, + "full ranked\n", sep = "" + ) + } + + return(design_list) +} + +design_results <- check_model_contrasts(contrasts_list, pheno_table) + +# EXPORT DATA ------------------------------------------------------- +## Export pheno table +write_csv(pheno_table, "pheno_table.csv") + +## Export designs to JSON +jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE) + +## Export warnings +if ( !is.null(phenotable_warnings[[ 2 ]]) ) { + jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE) +} + +## Export RData +save.image("models.RData") diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test new file mode 100644 index 00000000..e35f5de9 --- /dev/null +++ b/modules/local/validatemodel/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + + name "Test Process VALIDATE_MODEL" + script "../main.nf" + process "VALIDATE_MODEL" + config "./nextflow.config" + + tag "modules" + tag "models" + tag "yml" + tag "samplesheet" + + test("Samplesheet - YML file - TEST") { + + when { + params { + module_args = "--sample_id_col 'sample'" + } + process { + """ + // Samplesheet + input[0] = + Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv', checkIfExists: true) + .map{ [ [id: "test_samplesheet"], it ] } + + // Models_yml sheet + input[1] = + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/refs/heads/differentialabundance/testdata/SRP254919.contrasts.yaml", checkIfExists: true) + .map{ [ [id: "test_yml"], it ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("Samplesheet - YML file - MaxQuant TEST") { + + when { + params { + module_args = "--sample_id_col 'Name'" + } + process { + """ + // Samplesheet + input[0] = + Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv', checkIfExists: true) + .map{ [ [id: "maxquant_samplesheet"], it ] } + + // Models_yml sheet + input[1] = + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/refs/heads/differentialabundance/testdata/MaxQuant_contrasts.yaml", checkIfExists: true) + .map{ [ [id: "maxquant_yml"], it ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/validatemodel/tests/main.nf.test.snap b/modules/local/validatemodel/tests/main.nf.test.snap new file mode 100644 index 00000000..24d37912 --- /dev/null +++ b/modules/local/validatemodel/tests/main.nf.test.snap @@ -0,0 +1,112 @@ +{ + "Samplesheet - YML file - MaxQuant TEST": { + "content": [ + { + "0": [ + [ + { + "id": "maxquant_samplesheet" + }, + "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a" + ] + ], + "1": [ + [ + { + "id": "maxquant_samplesheet" + }, + "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "designs": [ + [ + { + "id": "maxquant_samplesheet" + }, + "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006" + ] + ], + "pheno_table": [ + [ + { + "id": "maxquant_samplesheet" + }, + "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a" + ] + ], + "versions": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "warnings": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2024-12-21T15:37:37.285885311" + }, + "Samplesheet - YML file - TEST": { + "content": [ + { + "0": [ + [ + { + "id": "test_samplesheet" + }, + "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403" + ] + ], + "1": [ + [ + { + "id": "test_samplesheet" + }, + "designs.json:md5,088d536fdff430dc12f4f8b0395a9636" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "designs": [ + [ + { + "id": "test_samplesheet" + }, + "designs.json:md5,088d536fdff430dc12f4f8b0395a9636" + ] + ], + "pheno_table": [ + [ + { + "id": "test_samplesheet" + }, + "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403" + ] + ], + "versions": [ + "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950" + ], + "warnings": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2024-12-21T15:37:15.759211744" + } +} \ No newline at end of file diff --git a/modules/local/validatemodel/tests/nextflow.config b/modules/local/validatemodel/tests/nextflow.config new file mode 100644 index 00000000..32115f62 --- /dev/null +++ b/modules/local/validatemodel/tests/nextflow.config @@ -0,0 +1,6 @@ +nextflow.enable.moduleBinaries = true +process { + withName: 'VALIDATE_MODEL' { + ext.args = params.module_args + } +} diff --git a/nextflow.config b/nextflow.config index 230100ac..2ebb7960 100644 --- a/nextflow.config +++ b/nextflow.config @@ -377,6 +377,9 @@ set -C # No clobber - prevent output redirection from overwriting files. // Disable process selector warnings by default. Use debug profile to enable warnings. nextflow.enable.configProcessNamesValidation = false +// Enable loading modules bin files +nextflow.enable.moduleBinaries = true + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap index bbd6b2de..777b30c1 100644 --- a/tests/test.nf.test.snap +++ b/tests/test.nf.test.snap @@ -190,7 +190,7 @@ }, "Test profile with yaml contrasts": { "content": [ - 21, + 22, { "CUSTOM_TABULARTOGSEACHIP": { "gawk": "5.1.0" @@ -215,6 +215,13 @@ "PLOT_EXPLORATORY": { "r-shinyngs": "2.0.0" }, + "VALIDATE_MODEL": { + "r-base": "4.4.2", + "tidyverse": "2.0.0", + "optparse": "1.7.5", + "yaml": "2.3.10", + "jsonlite": "1.8.9" + }, "VALIDATOR": { "r-base": "4.3.3", "r-shinyngs": "2.0.0" @@ -321,7 +328,11 @@ "tables/differential/treatment_mCherry_hND6_sample_number.deseq2.results_filtered.tsv", "tables/processed_abundance", "tables/processed_abundance/all.normalised_counts.tsv", - "tables/processed_abundance/all.vst.tsv" + "tables/processed_abundance/all.vst.tsv", + "validate", + "validate/designs.json", + "validate/pheno_table.csv", + "validate/versions.yml" ], [ "treatment_mCherry_hND6_.dds.rld.rds:md5,dfe51910e230ae9cc2bc9fa5651666ba", @@ -368,13 +379,16 @@ "treatment_mCherry_hND6_sample_number.deseq2.results.tsv:md5,b41cbaacd04a25b45295f3f987ff7500", "treatment_mCherry_hND6_sample_number.deseq2.results_filtered.tsv:md5,98fa3ec0992859d2a524ecce1259f0ed", "all.normalised_counts.tsv:md5,1d7ad0c02b483f2eff1a5b357a74d011", - "all.vst.tsv:md5,a08d06d3c3218619b7bd85beead467bd" + "all.vst.tsv:md5,a08d06d3c3218619b7bd85beead467bd", + "designs.json:md5,088d536fdff430dc12f4f8b0395a9636", + "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403", + "versions.yml:md5,592a22e359ab942c2d315ef0c6ed3fb1" ] ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-20T16:16:49.167979669" + "timestamp": "2024-12-21T15:26:26.473702144" } } \ No newline at end of file diff --git a/tests/test_maxquant.nf.test.snap b/tests/test_maxquant.nf.test.snap index 619ba2b7..3cc0edfa 100644 --- a/tests/test_maxquant.nf.test.snap +++ b/tests/test_maxquant.nf.test.snap @@ -177,7 +177,7 @@ }, "Test maxquant profile with yaml contrasts": { "content": [ - 20, + 21, { "LIMMA_DIFFERENTIAL": { "r-base": "4.3.3", @@ -196,6 +196,13 @@ "r-plotly": "4.10.2", "bioconductor-limma": "3.54.0" }, + "VALIDATE_MODEL": { + "r-base": "4.4.2", + "tidyverse": "2.0.0", + "optparse": "1.7.5", + "yaml": "2.3.10", + "jsonlite": "1.8.9" + }, "VALIDATOR": { "r-base": "4.3.3", "r-shinyngs": "2.0.0" @@ -289,7 +296,11 @@ "tables/proteus/Celltype/raw_proteingroups_tab.tsv", "tables/proteus/fakeBatch", "tables/proteus/fakeBatch/normalizeMedian.normalized_proteingroups_tab.tsv", - "tables/proteus/fakeBatch/raw_proteingroups_tab.tsv" + "tables/proteus/fakeBatch/raw_proteingroups_tab.tsv", + "validate", + "validate/designs.json", + "validate/pheno_table.csv", + "validate/versions.yml" ], [ "fakebatch_fakeBatch_b1_b2.MArrayLM.limma.rds:md5,f321bf1e4fd44827810df9a9c38776d6", @@ -342,13 +353,16 @@ "normalizeMedian.normalized_proteingroups_tab.tsv:md5,154fcd8d23409981b897d153e7b7e34e", "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a", "normalizeMedian.normalized_proteingroups_tab.tsv:md5,154fcd8d23409981b897d153e7b7e34e", - "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a" + "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a", + "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006", + "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a", + "versions.yml:md5,592a22e359ab942c2d315ef0c6ed3fb1" ] ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.9.2", "nextflow": "24.10.3" }, - "timestamp": "2024-12-20T13:45:34.62418232" + "timestamp": "2024-12-21T15:58:50.7683424" } } \ No newline at end of file diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 5f376041..707395dc 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -133,6 +133,7 @@ include { PROTEUS_READPROTEINGROUPS as PROTEUS } from '../modules/n include { GEOQUERY_GETGEO } from '../modules/nf-core/geoquery/getgeo/main' include { ZIP as MAKE_REPORT_BUNDLE } from '../modules/nf-core/zip/main' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { VALIDATE_MODEL } from '../modules/local/validatemodel/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -148,6 +149,19 @@ workflow DIFFERENTIALABUNDANCE { // Channel for the contrasts file ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) + + // Run module to validate models from yml file + if ( params.contrasts.endsWith(".yaml") || params.contrasts.endsWith(".yml") ) { + + VALIDATE_MODEL ( + ch_input, + ch_contrasts_file + ) + + ch_versions = ch_versions + .mix(VALIDATE_MODEL.out.versions) + } + // If we have affy array data in the form of CEL files we'll be deriving // matrix and annotation from them