From f69703bcfbc0531d9aa5a5730c6535bdec938d14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Fri, 13 Dec 2024 19:45:45 +0000
Subject: [PATCH 01/10] init script

---
 .../validate_model/input_test/contrasts.yml   |   9 ++
 .../input_test/validate_model.R               | 112 ++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 modules/local/validate_model/input_test/contrasts.yml
 create mode 100644 modules/local/validate_model/input_test/validate_model.R

diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validate_model/input_test/contrasts.yml
new file mode 100644
index 00000000..3bfa9350
--- /dev/null
+++ b/modules/local/validate_model/input_test/contrasts.yml
@@ -0,0 +1,9 @@
+models:
+  - formula: "~ treatment"
+    contrasts:
+      - id: treatment_mCherry_hND6_
+        comparison: ["treatment", "mCherry", "hND6"]
+
+      - id: treatment_mCherry_hND6_sample_number
+        comparison: ["treatment", "mCherry", "hND6"]
+        blocking_factors: ["sample_number"]
diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validate_model/input_test/validate_model.R
new file mode 100644
index 00000000..4ff1207f
--- /dev/null
+++ b/modules/local/validate_model/input_test/validate_model.R
@@ -0,0 +1,112 @@
+#!/usr/bin/env Rscript
+
+## TOD: add make contrast arguments
+
+# Load required libraries
+suppressWarnings(suppressMessages({
+    library(optparse)
+    library(yaml)
+    library(readr)
+    library(dplyr)
+}))
+
+# Define command-line arguments
+option_list <- list(
+    make_option(c("-m", "--models"), type = "character", default = NULL,
+                help = "Path to the models.yml file", metavar = "character"),
+    make_option(c("-s", "--samplesheet"), type = "character", default = NULL,
+                help = "Path to the samplesheet CSV file", metavar = "character")
+)
+
+# Parse command-line arguments
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Validate input arguments
+if (is.null(opt$models) || is.null(opt$samplesheet)) {
+    stop("Both --models and --samplesheet arguments are required.", call. = FALSE)
+}
+
+# Load models.yml file
+tryCatch({
+    models <- yaml.load_file(opt$models)
+    cat("Loaded models.yml successfully.\n")
+}, error = function(e) {
+    stop("Error loading models.yml: ", e$message)
+})
+
+# Load samplesheet CSV file
+tryCatch({
+    samplesheet <- read_csv(opt$samplesheet, show_col_types = FALSE)
+    cat("Loaded samplesheet CSV successfully.\n")
+}, error = function(e) {
+    stop("Error loading samplesheet CSV: ", e$message)
+})
+
+# --- Step 1: Collect all variables from contrasts in the YAML ---
+cat("\n--- Collecting Variables from Models ---\n")
+variables <- unique(unlist(lapply(models$models, function(model) {
+    unlist(lapply(model$contrasts, function(contrast) contrast$comparison))
+})))
+
+cat("Variables in contrasts:", paste(variables, collapse = ", "), "\n")
+
+# --- Step 2: Validate these variables are present as columns in the samplesheet ---
+cat("\n--- Validating Variables in Samplesheet ---\n")
+missing_columns <- setdiff(variables, colnames(samplesheet))
+
+if (length(missing_columns) > 0) {
+    stop("The following required columns are missing in the samplesheet: ",
+        paste(missing_columns, collapse = ", "))
+} else {
+    cat("All required variables are present in the samplesheet.\n")
+}
+
+# --- Step 3: Validate components in samplesheet columns ---
+cat("\n--- Validating Column Components ---\n")
+for (variable in variables) {
+    expected_values <- unique(unlist(lapply(models$models, function(model) {
+        unlist(lapply(model$contrasts, function(contrast) {
+        if (variable %in% contrast$comparison) return(contrast$comparison)
+        }))
+    })))
+    expected_values <- expected_values[!is.na(expected_values)]  # Remove NAs
+
+    actual_values <- unique(samplesheet[[variable]])
+    if (!all(actual_values %in% expected_values)) {
+        cat("Column:", variable, "\n")
+        cat("Expected values:", paste(expected_values, collapse = ", "), "\n")
+        cat("Actual values:", paste(actual_values, collapse = ", "), "\n")
+        stop("Mismatch in expected and actual values for column: ", variable)
+    } else {
+        cat("Column '", variable, "' has valid components.\n", sep = "")
+    }
+}
+
+# --- Step 4: Validate column names for recommended characters ---
+cat("\n--- Checking Column Names for Recommended Characters ---\n")
+invalid_columns <- grep("[^a-zA-Z0-9_]", colnames(samplesheet), value = TRUE)
+
+if (length(invalid_columns) > 0) {
+    stop("The following column names contain non-recommended characters: ",
+        paste(invalid_columns, collapse = ", "))
+} else {
+    cat("All column names are valid.\n")
+}
+
+# --- Step 5: Validate column contents for recommended characters and NAs ---
+cat("\n--- Checking Column Contents for Recommended Characters and NAs ---\n")
+for (variable in variables) {
+    invalid_values <- grep("[^a-zA-Z0-9_]", samplesheet[[variable]], value = TRUE)
+    if (any(is.na(samplesheet[[variable]]))) {
+        stop("Column '", variable, "' contains missing values (NA).")
+    }
+    if (length(invalid_values) > 0) {
+        stop("Column '", variable, "' contains invalid values: ",
+            paste(invalid_values, collapse = ", "))
+    } else {
+        cat("Column '", variable, "' contents are valid.\n")
+    }
+}
+
+cat("\nValidation completed successfully.\n")

From 352a6fcb177370d82c44ad036d5005552b6415e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Mon, 16 Dec 2024 20:15:18 +0000
Subject: [PATCH 02/10] add function to check samplesheet

---
 .../validate_model/input_test/contrasts.yml   |   9 +-
 .../input_test/validate_model.R               | 231 ++++++++++++++++--
 2 files changed, 220 insertions(+), 20 deletions(-)

diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validate_model/input_test/contrasts.yml
index 3bfa9350..b362c14e 100644
--- a/modules/local/validate_model/input_test/contrasts.yml
+++ b/modules/local/validate_model/input_test/contrasts.yml
@@ -1,9 +1,12 @@
 models:
   - formula: "~ treatment"
     contrasts:
-      - id: treatment_mCherry_hND6_
+      - id: "treatment_mCherry_hND6"
         comparison: ["treatment", "mCherry", "hND6"]
 
-      - id: treatment_mCherry_hND6_sample_number
+      - id: "treatment_mCherry_hND6_sample_number"
         comparison: ["treatment", "mCherry", "hND6"]
-        blocking_factors: ["sample_number"]
+        blocking_factors: ["sample_number23"]
+
+      - id: "treatment234"
+        comparison: ["treatment", "GFP", "hND1000"]
diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validate_model/input_test/validate_model.R
index 4ff1207f..2aef6385 100644
--- a/modules/local/validate_model/input_test/validate_model.R
+++ b/modules/local/validate_model/input_test/validate_model.R
@@ -1,6 +1,6 @@
 #!/usr/bin/env Rscript
 
-## TOD: add make contrast arguments
+## TODO: add make contrast arguments
 
 # Load required libraries
 suppressWarnings(suppressMessages({
@@ -12,10 +12,12 @@ suppressWarnings(suppressMessages({
 
 # Define command-line arguments
 option_list <- list(
-    make_option(c("-m", "--models"), type = "character", default = NULL,
+    make_option(c("-y", "--yml"), type = "character", default = NULL,
                 help = "Path to the models.yml file", metavar = "character"),
     make_option(c("-s", "--samplesheet"), type = "character", default = NULL,
-                help = "Path to the samplesheet CSV file", metavar = "character")
+                help = "Path to the samplesheet CSV file", metavar = "character"),
+    make_option(c("-i", "--sample"), type = "character", default = "sample",
+                help = "Column that contains sample identificators", metavar = "character")
 )
 
 # Parse command-line arguments
@@ -23,13 +25,18 @@ opt_parser <- OptionParser(option_list = option_list)
 opt <- parse_args(opt_parser)
 
 # Validate input arguments
-if (is.null(opt$models) || is.null(opt$samplesheet)) {
-    stop("Both --models and --samplesheet arguments are required.", call. = FALSE)
+if (is.null(opt$models) || is.null(opt$samplesheet) ) {
+    stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE)
 }
 
+## Collect parameters
+path_yml <- "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #opt$models
+path_samplesheet <- "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #opt$samplesheet
+sample_column <- "sample" #opt$sample
+
 # Load models.yml file
 tryCatch({
-    models <- yaml.load_file(opt$models)
+    models <- read_yaml(path_yml)
     cat("Loaded models.yml successfully.\n")
 }, error = function(e) {
     stop("Error loading models.yml: ", e$message)
@@ -37,21 +44,211 @@ tryCatch({
 
 # Load samplesheet CSV file
 tryCatch({
-    samplesheet <- read_csv(opt$samplesheet, show_col_types = FALSE)
-    cat("Loaded samplesheet CSV successfully.\n")
+    samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE)
+    cat("Loaded samplesheet successfully.\n")
 }, error = function(e) {
     stop("Error loading samplesheet CSV: ", e$message)
 })
 
-# --- Step 1: Collect all variables from contrasts in the YAML ---
-cat("\n--- Collecting Variables from Models ---\n")
-variables <- unique(unlist(lapply(models$models, function(model) {
-    unlist(lapply(model$contrasts, function(contrast) contrast$comparison))
-})))
+## Collect all variables and factors from the yml file
+var <- list(); for (FORMULA in models$models) {
+    ## Populate list
+    for (CONTRAST in FORMULA$contrasts) {
+        ## Get column name: first comparison's element
+        name <- CONTRAST$comparison[1]
+
+        ## Get factors
+        variables <- CONTRAST$comparison[2:3]
+
+        ## If the column is already present in the list, add more components to it
+        if (name %in% names(var) ) {
+            variables <- unique( c(var[[ name ]], CONTRAST$comparison[2:3] ))
+            var[[ name ]] <- variables
+            cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "")
+        ## Default to new variable
+        } else {
+            cat("Detected variable:", name, "\n")
+            var[[ length(var) + 1 ]] <- variables
+            names(var)[length(var) ] <- name
+        }
+
+        ## Get blocking factors
+        blocking <- c()
+        if ( !is.null(CONTRAST$blocking_factors) ) {
+            cat("Blocking factors detected for variable", name, "\n")
+            blocking <- CONTRAST$blocking_factors
+            cat(paste(blocking, collapse = ""), "\n")
+        }
+
+        ## Add blockig variables
+        if ( "blocking_factors" %in% names(var) ) {
+            var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking ))
+        } else {
+            var[[ "blocking_factors" ]] <- unique( blocking )
+        }
+    }
+}
+
+## Print explicit message
+for (INDEX in 1:length(var)) {
+    cat("Detected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\n", sep = "")
+}
+
+
+## Function to validate the samplesheet against the yml file
+validate_model <- function(variables, samplesheet) { # variables: list; samplesheet: df
+
+    variables   <- var
+    samplesheet <- samplesheet
+
+    undesired_chars <- "[^a-zA-Z0-9_]"
+    #######################################################
+    ##
+    ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet.
+    ##
+    ## The function evaluates that:
+    ## * Every variable is present in the sample sheet.
+    ## * Every variable has valid name
+    ## * There are no NAs
+    ## * All expected values are present
+    ## * All samples contains a valid level.
+    ##
+    ## Finally, the function returns
+    ## * An error vector containing all errors found.
+    ## * A warnings vector
+    ##
+    ## If the error vector contains values, the script will end with a non-zero status.
+    ##
+    #######################################################
+
+    ## Initialize errors vector
+    errors <- c()
+
+    ## Initialize warning errors
+    warnings <- c()
+
+    # Initialize vector to report continuos variables
+    continuous <- c()
+
+    ## Check samplesheet names for invalid characters
+    df_colnames <- names(samplesheet)
+    true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars)
+
+    if ( sum(true_columnms) > 0 ) {
+        errors <- c(errors,
+            paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " "))
+        )
+    }
+
+    ## Check that blocking variables exists and do not contain NAs, if they were specified
+    blocking_factors <- c()
+    if ( !is.null(variables$blocking_factors) ) {
+        blocking_factors <- variables$blocking_factors
+
+        for (VARIABLE in blocking_factors) {
+            ## Check that the column exists
+            if ( VARIABLE %in% colnames(samplesheet) ) {
+                ## Check if there are NAs
+                na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) )
+                if ( na_rows > 0 ) {
+                    errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " ")))
+                }
+            } else {
+                errors <- c(
+                    errors,
+                    paste0("Blocking factor ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files."))
+            }
+
+            ## Alert about continuous variables
+            if ( is.numeric(samplesheet[[ VARIABLE ]]) ) {
+                continuous <- c(continuous, VARIABLE)
+            } else {
+                paste(VARIABLE, "not continuous")
+            }
+        }
+    }
+
+    ## Check variables and levels specified
+    for (VARIABLE in names(variables)[ names(variables) != "blocking_factors" ] ) { ## Exclude blocking factors, they cannot be treated equally here
+
+        ## Check that the column exists
+        if ( VARIABLE %in% colnames(samplesheet) ) {
+
+            ## Alert about continuous variables
+            if ( is.numeric(samplesheet[[ VARIABLE ]]) ) {
+                continuous <- unique(c(continuous, VARIABLE))
+            } else {
+                paste(VARIABLE, "is not continuous")
+            }
+
+            ## Check if there are NAs
+            na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) )
+            if ( na_rows > 0 ) {
+                errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " ")))
+            }
+
+            ## Check that the column data does not contain undesidered characters
+            if (stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) {
+                errors <- c(errors,
+                    paste0("Column ", VARIABLE, " contains undesired characters\n")
+                )
+            }
+
+            ## Extract the expected levels according to the YML file
+            expected_factors_levels <- c( rep(NA, length(variables[[ VARIABLE ]])) )
+
+            ## Assign names
+            names(expected_factors_levels) <- variables[[ VARIABLE ]]
+
+            ## Extract the levels from the sample sheet
+            obtained_factors_levels <- unique(samplesheet[[ VARIABLE ]])
+
+            ## Check that all expected levels for this variable are effectively present
+            for (LEVEL in names(expected_factors_levels) ) {
+                expected_factors_levels[ LEVEL ] <- LEVEL %in% obtained_factors_levels
+            }
+
+            if( !all(expected_factors_levels) ) {
+                errors <- c(
+                    errors,
+                    paste0(
+                        "Missing factor levels for variable '", VARIABLE, "'. ",
+                        "Present levels: ", paste( names( expected_factors_levels[ expected_factors_levels ]), collapse = " " ), ". ",
+                        "Missing levels: ", paste( names( expected_factors_levels[!expected_factors_levels ]), collapse = " " )
+                        )
+                    )
+            }
+
+            ## Check whether there are levels in the table that are not present in the models definitions
+            if (!all( obtained_factors_levels[!is.na(obtained_factors_levels)] %in% names(expected_factors_levels)) ) {
+                warnings <- c(warnings,
+                paste0( "The following labels are present in the samplesheet but are not part of the models definition and will be excluded: ", paste( obtained_factors_levels[!obtained_factors_levels %in% names(expected_factors_levels) ], collapse = " " ) ))
+            }
+
+        ## Report that a column is not even present in the table
+        } else {
+            errors <- c(
+                errors,
+                paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files."))
+        }
+
+        ## Check that the column name is valid
+        ## Check that the column content is valid -> done
+        ## Check that the column does not have NAs -> Done
+        ## Check that all factors exists -> Done
+    }
+
+    if ( !is.null(continuous)) {
+        warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" )))
+    }
+    return(list( 'errors' = errors, 'warnings' = warnings) )
+
+}
+
+errors_warnings <- validate_model(var, samplesheet)
 
-cat("Variables in contrasts:", paste(variables, collapse = ", "), "\n")
 
-# --- Step 2: Validate these variables are present as columns in the samplesheet ---
+# Validate that these variables are present as columns in the samplesheet
 cat("\n--- Validating Variables in Samplesheet ---\n")
 missing_columns <- setdiff(variables, colnames(samplesheet))
 
@@ -62,7 +259,7 @@ if (length(missing_columns) > 0) {
     cat("All required variables are present in the samplesheet.\n")
 }
 
-# --- Step 3: Validate components in samplesheet columns ---
+# Validate components in samplesheet columns
 cat("\n--- Validating Column Components ---\n")
 for (variable in variables) {
     expected_values <- unique(unlist(lapply(models$models, function(model) {
@@ -83,7 +280,7 @@ for (variable in variables) {
     }
 }
 
-# --- Step 4: Validate column names for recommended characters ---
+# Validate column names for recommended characters
 cat("\n--- Checking Column Names for Recommended Characters ---\n")
 invalid_columns <- grep("[^a-zA-Z0-9_]", colnames(samplesheet), value = TRUE)
 

From afa2472c9c629ead4b336182408e7efe9bb15364 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Tue, 17 Dec 2024 14:57:54 +0000
Subject: [PATCH 03/10] add check model code

---
 .../validate_model/input_test/contrasts.yml   |   4 +-
 .../input_test/validate_model.R               | 184 ++++++++++--------
 2 files changed, 109 insertions(+), 79 deletions(-)

diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validate_model/input_test/contrasts.yml
index b362c14e..05ee8e5e 100644
--- a/modules/local/validate_model/input_test/contrasts.yml
+++ b/modules/local/validate_model/input_test/contrasts.yml
@@ -6,7 +6,7 @@ models:
 
       - id: "treatment_mCherry_hND6_sample_number"
         comparison: ["treatment", "mCherry", "hND6"]
-        blocking_factors: ["sample_number23"]
+        blocking_factors: ["sample_number"]
 
       - id: "treatment234"
-        comparison: ["treatment", "GFP", "hND1000"]
+        comparison: ["treatment", "mCherry", "hND6"]
diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validate_model/input_test/validate_model.R
index 2aef6385..90e73df6 100644
--- a/modules/local/validate_model/input_test/validate_model.R
+++ b/modules/local/validate_model/input_test/validate_model.R
@@ -4,10 +4,11 @@
 
 # Load required libraries
 suppressWarnings(suppressMessages({
+    library(tidyverse)
     library(optparse)
     library(yaml)
-    library(readr)
-    library(dplyr)
+    #library(readr)
+    #library(dplyr)
 }))
 
 # Define command-line arguments
@@ -25,6 +26,7 @@ opt_parser <- OptionParser(option_list = option_list)
 opt <- parse_args(opt_parser)
 
 # Validate input arguments
+## Required arguments
 if (is.null(opt$models) || is.null(opt$samplesheet) ) {
     stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE)
 }
@@ -37,28 +39,40 @@ sample_column <- "sample" #opt$sample
 # Load models.yml file
 tryCatch({
     models <- read_yaml(path_yml)
-    cat("Loaded models.yml successfully.\n")
+    cat("Loaded YML file successfully.\n")
 }, error = function(e) {
-    stop("Error loading models.yml: ", e$message)
+    stop("Error loading YML file: ", e$message)
 })
 
 # Load samplesheet CSV file
 tryCatch({
     samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE)
+    if ( !sample_column %in% colnames(samplesheet) ) {
+        stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") )
+    }
     cat("Loaded samplesheet successfully.\n")
 }, error = function(e) {
     stop("Error loading samplesheet CSV: ", e$message)
 })
 
 ## Collect all variables and factors from the yml file
-var <- list(); for (FORMULA in models$models) {
+var <- list(); contrasts_list <- list(); for (FORMULA in models$models) {
+
     ## Populate list
     for (CONTRAST in FORMULA$contrasts) {
         ## Get column name: first comparison's element
         name <- CONTRAST$comparison[1]
 
         ## Get factors
-        variables <- CONTRAST$comparison[2:3]
+        variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
+
+        ## Populate contrasts_list for model validation
+        contrasts_list[[ CONTRAST$id ]] <- list(
+            "formula" = FORMULA$formula,
+            "variable" = name,
+            "contrast" = variables,
+            "blocking_factors" = CONTRAST$blocking_factors
+        )
 
         ## If the column is already present in the list, add more components to it
         if (name %in% names(var) ) {
@@ -96,12 +110,12 @@ for (INDEX in 1:length(var)) {
 
 
 ## Function to validate the samplesheet against the yml file
-validate_model <- function(variables, samplesheet) { # variables: list; samplesheet: df
-
-    variables   <- var
-    samplesheet <- samplesheet
+validate_model <- function(sample_column, variables, samplesheet) { # variables: list; samplesheet: df
 
+    #variables   <- var
+    #samplesheet <- samplesheet
     undesired_chars <- "[^a-zA-Z0-9_]"
+
     #######################################################
     ##
     ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet.
@@ -132,9 +146,9 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh
 
     ## Check samplesheet names for invalid characters
     df_colnames <- names(samplesheet)
-    true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars)
+    true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars))
 
-    if ( sum(true_columnms) > 0 ) {
+    if ( sum(true_columns) > 0 ) {
         errors <- c(errors,
             paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " "))
         )
@@ -149,9 +163,9 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh
             ## Check that the column exists
             if ( VARIABLE %in% colnames(samplesheet) ) {
                 ## Check if there are NAs
-                na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) )
-                if ( na_rows > 0 ) {
-                    errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " ")))
+                na_rows <- is.na( samplesheet[[ VARIABLE ]])
+                if ( sum(na_rows) > 0 ) {
+                    errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " ")))
                 }
             } else {
                 errors <- c(
@@ -182,13 +196,13 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh
             }
 
             ## Check if there are NAs
-            na_rows <- which(is.na( samplesheet[[ VARIABLE ]]) )
-            if ( na_rows > 0 ) {
-                errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(na_rows, collapse = " ")))
+            na_rows <- is.na( samplesheet[[ VARIABLE ]] )
+            if ( sum(na_rows) > 0 ) {
+                errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " ")))
             }
 
             ## Check that the column data does not contain undesidered characters
-            if (stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) {
+            if ( sum( stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) > 0 ) {
                 errors <- c(errors,
                     paste0("Column ", VARIABLE, " contains undesired characters\n")
                 )
@@ -231,79 +245,95 @@ validate_model <- function(variables, samplesheet) { # variables: list; samplesh
                 errors,
                 paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files."))
         }
-
-        ## Check that the column name is valid
-        ## Check that the column content is valid -> done
-        ## Check that the column does not have NAs -> Done
-        ## Check that all factors exists -> Done
     }
 
     if ( !is.null(continuous)) {
         warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" )))
     }
-    return(list( 'errors' = errors, 'warnings' = warnings) )
+
+    if (length(errors) > 0 ) {
+        stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = ""))
+    }
+
+    ## Generate validated samplesheet
+    tryCatch({
+        selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors)
+        pheno_table      <- samplesheet %>% dplyr::select(all_of(selected_columns))
+    }, error = function(e) {
+        stop("Error generating validated samplesheet: ", e$message)
+    })
+
+    return(list( 'pheno_table' = pheno_table, 'errors' = errors, 'warnings' = warnings) )
 
 }
 
-errors_warnings <- validate_model(var, samplesheet)
+## Collect all outputs from function: errors must be empty, warnings can have messages
+phenotable_errors_warnings <- validate_model(sample_column, var, samplesheet)
 
+## Cat warnings messages
+cat(phenotable_errors_warnings[[ 3 ]], "\n")
 
-# Validate that these variables are present as columns in the samplesheet
-cat("\n--- Validating Variables in Samplesheet ---\n")
-missing_columns <- setdiff(variables, colnames(samplesheet))
+## Get validated sample sheet
+pheno_table <- phenotable_errors_warnings[[ 1 ]]
 
-if (length(missing_columns) > 0) {
-    stop("The following required columns are missing in the samplesheet: ",
-        paste(missing_columns, collapse = ", "))
-} else {
-    cat("All required variables are present in the samplesheet.\n")
-}
+## CHECK THAT THE MODELS ARE FULL RANKED
+check_model_contrasts <- function(contrasts_list, colData) {
+    for (model_name in names(contrasts_list)) {
 
-# Validate components in samplesheet columns
-cat("\n--- Validating Column Components ---\n")
-for (variable in variables) {
-    expected_values <- unique(unlist(lapply(models$models, function(model) {
-        unlist(lapply(model$contrasts, function(contrast) {
-        if (variable %in% contrast$comparison) return(contrast$comparison)
-        }))
-    })))
-    expected_values <- expected_values[!is.na(expected_values)]  # Remove NAs
-
-    actual_values <- unique(samplesheet[[variable]])
-    if (!all(actual_values %in% expected_values)) {
-        cat("Column:", variable, "\n")
-        cat("Expected values:", paste(expected_values, collapse = ", "), "\n")
-        cat("Actual values:", paste(actual_values, collapse = ", "), "\n")
-        stop("Mismatch in expected and actual values for column: ", variable)
-    } else {
-        cat("Column '", variable, "' has valid components.\n", sep = "")
-    }
-}
+        # Extract model components
+        model        <- contrasts_list[[model_name]]
+        base_formula <- as.formula(model$formula)  # Ensure formula object
+        variable     <- model$variable
+        contrast     <- model$contrast
+        blocking     <- model$blocking_factor  # Extract blocking factor
 
-# Validate column names for recommended characters
-cat("\n--- Checking Column Names for Recommended Characters ---\n")
-invalid_columns <- grep("[^a-zA-Z0-9_]", colnames(samplesheet), value = TRUE)
+        # Check if blocking factor is already in the formula
+        if (!is.null(blocking)) {
+            # Get terms from the formula
+            formula_terms <- all.vars(base_formula)
 
-if (length(invalid_columns) > 0) {
-    stop("The following column names contain non-recommended characters: ",
-        paste(invalid_columns, collapse = ", "))
-} else {
-    cat("All column names are valid.\n")
-}
+            # If blocking factor is not already in the formula, add it
+            if (!(blocking %in% formula_terms)) {
+                #cat("\nAdding blocking factor:", blocking, "to the model.\n")
+                updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking))
+            } else {
+                #cat("\nBlocking factor:", blocking, "is already in the formula. Skipping addition.\n")
+                updated_formula <- base_formula
+            }
+        } else {
+            updated_formula <- base_formula
+        }
 
-# --- Step 5: Validate column contents for recommended characters and NAs ---
-cat("\n--- Checking Column Contents for Recommended Characters and NAs ---\n")
-for (variable in variables) {
-    invalid_values <- grep("[^a-zA-Z0-9_]", samplesheet[[variable]], value = TRUE)
-    if (any(is.na(samplesheet[[variable]]))) {
-        stop("Column '", variable, "' contains missing values (NA).")
-    }
-    if (length(invalid_values) > 0) {
-        stop("Column '", variable, "' contains invalid values: ",
-            paste(invalid_values, collapse = ", "))
-    } else {
-        cat("Column '", variable, "' contents are valid.\n")
+        cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n")
+
+        # Build the design matrix
+        design_matrix <- model.matrix(updated_formula, data = colData)
+        cat("Design matrix:\n")
+        print(design_matrix)
+
+        # Check the rank of the design matrix
+        rank <- qr(design_matrix)$rank
+        expected_rank <- ncol(design_matrix)
+
+        if (rank == expected_rank) {
+            cat("The design matrix is full rank.\n")
+        } else {
+            cat("WARNING: The design matrix is NOT full rank!\n")
+        }
+
+        # Check if contrast variable exists in colData
+        if (variable %in% colnames(colData)) {
+            factor_levels <- levels(factor(colData[[variable]], levels = contrast))
+            if (all(contrast %in% factor_levels)) {
+                cat("Contrast levels are valid:", paste(contrast, collapse = " vs "), "\n")
+            } else {
+                cat("ERROR: Contrast levels", paste(contrast, collapse = " vs "),
+                    "do not exist in", variable, "\n")
+            }
+        } else {
+            cat("ERROR: Contrast variable", variable, "is not in colData.\n")
+        }
     }
 }
 
-cat("\nValidation completed successfully.\n")
+check_model_contrasts(contrasts_list, pheno_table)

From 9e42bdd8af30c12cb3909426fa51e04fcd17f7c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Wed, 18 Dec 2024 11:47:02 +0000
Subject: [PATCH 04/10] POC ready

---
 conf/modules.config                           |   9 +
 modules/local/validatemodel/environment.yml   |   8 +
 modules/local/validatemodel/main.nf           |  57 +++++
 .../resources/usr/bin}/validate_model.R       | 213 +++++++++++-------
 .../tests}/contrasts.yml                      |   0
 .../tests/contrasts_maxquant.yml              |  12 +
 .../local/validatemodel/tests/main.nf.test    |  39 ++++
 .../local/validatemodel/tests/nextflow.config |   6 +
 nextflow_schema.json                          |   2 +-
 workflows/differentialabundance.nf            |   8 +
 10 files changed, 270 insertions(+), 84 deletions(-)
 create mode 100644 modules/local/validatemodel/environment.yml
 create mode 100644 modules/local/validatemodel/main.nf
 rename modules/local/{validate_model/input_test => validatemodel/resources/usr/bin}/validate_model.R (60%)
 mode change 100644 => 100755
 rename modules/local/{validate_model/input_test => validatemodel/tests}/contrasts.yml (100%)
 create mode 100644 modules/local/validatemodel/tests/contrasts_maxquant.yml
 create mode 100644 modules/local/validatemodel/tests/main.nf.test
 create mode 100644 modules/local/validatemodel/tests/nextflow.config

diff --git a/conf/modules.config b/conf/modules.config
index f26c1f9b..34e23407 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -42,6 +42,15 @@ process {
         ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'"
     }
 
+    withName: VALIDATE_MODEL {
+        ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' }
+        publishDir = [
+            path: { "${params.outdir}/validate" },
+            mode: params.publish_dir_mode,
+        ]
+
+    }
+
     withName: AFFY_JUSTRMA_RAW {
         publishDir = [
             [
diff --git a/modules/local/validatemodel/environment.yml b/modules/local/validatemodel/environment.yml
new file mode 100644
index 00000000..9a8af4a9
--- /dev/null
+++ b/modules/local/validatemodel/environment.yml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::r-jsonlite
+  - conda-forge::r-optparse
+  - conda-forge::r-tidyverse
+  - conda-forge::r-yaml
diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf
new file mode 100644
index 00000000..b8cf31ce
--- /dev/null
+++ b/modules/local/validatemodel/main.nf
@@ -0,0 +1,57 @@
+process VALIDATE_MODEL {
+    tag "${models_yml}"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f5/f5ef116da0b04fc6359efa48f08b9acfc040a02d7a1ec62476c77b6db0fc1499/data' :
+        'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }"
+
+    input:
+    tuple path(samplesheet)
+    tuple val(meta), path(models_yml)
+
+    output:
+    tuple val(meta), path("pheno_table.csv"), emit: pheno_table
+    tuple val(meta), path("designs.json")   , emit: designs
+    tuple val(meta), path("warnings.json")  , emit: warnings, optional: true
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    validate_model.R \\
+        --yml "${models_yml}" \\
+        --samplesheet "${samplesheet}" \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p')
+        tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))")
+        optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))")
+        yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))")
+        jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))")
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch pheno_table.csv
+    touch designs.json
+    touch warnings.json
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p')
+        tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))")
+        optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))")
+        yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))")
+        jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))")
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R
old mode 100644
new mode 100755
similarity index 60%
rename from modules/local/validate_model/input_test/validate_model.R
rename to modules/local/validatemodel/resources/usr/bin/validate_model.R
index 90e73df6..bde457b1
--- a/modules/local/validate_model/input_test/validate_model.R
+++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R
@@ -2,41 +2,52 @@
 
 ## TODO: add make contrast arguments
 
-# Load required libraries
+# LOAD LIBRARIES ----------------------------------------------------
 suppressWarnings(suppressMessages({
     library(tidyverse)
     library(optparse)
     library(yaml)
-    #library(readr)
-    #library(dplyr)
+    library(jsonlite)
 }))
 
-# Define command-line arguments
+# PARSE ARGUMENTS ---------------------------------------------------
+## Generate list
 option_list <- list(
     make_option(c("-y", "--yml"), type = "character", default = NULL,
                 help = "Path to the models.yml file", metavar = "character"),
     make_option(c("-s", "--samplesheet"), type = "character", default = NULL,
                 help = "Path to the samplesheet CSV file", metavar = "character"),
-    make_option(c("-i", "--sample"), type = "character", default = "sample",
+    make_option(c("-i", "--sample_id_col"), type = "character", default = "sample",
                 help = "Column that contains sample identificators", metavar = "character")
 )
 
-# Parse command-line arguments
-opt_parser <- OptionParser(option_list = option_list)
+## Parse command-line arguments
+opt_parser <-
+    OptionParser(
+        option_list = option_list,
+        description = "Validate a sample sheet against a YML file for model designs.",
+        epilogue =
+        "
+        The process will look for variables and factors in the YML file and validate that they are present in the sample sheet.
+        Will also look for unwanted characters within columns and colnames.
+        Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not.
+        "
+    )
 opt <- parse_args(opt_parser)
 
-# Validate input arguments
-## Required arguments
-if (is.null(opt$models) || is.null(opt$samplesheet) ) {
-    stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE)
+## Validate input arguments
+### Required arguments
+if (is.null(opt$yml) || is.null(opt$samplesheet) ) {
+    stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE)
 }
 
-## Collect parameters
-path_yml <- "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #opt$models
-path_samplesheet <- "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #opt$samplesheet
-sample_column <- "sample" #opt$sample
+### Collect parameters (easier for dev and testing)
+path_yml         <- opt$yml             # "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #
+path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #
+sample_column    <- opt$sample_id_col   # "sample" #
 
-# Load models.yml file
+# LOAD FILES --------------------------------------------------------
+## Load models.yml file
 tryCatch({
     models <- read_yaml(path_yml)
     cat("Loaded YML file successfully.\n")
@@ -44,7 +55,7 @@ tryCatch({
     stop("Error loading YML file: ", e$message)
 })
 
-# Load samplesheet CSV file
+## Load samplesheet CSV file
 tryCatch({
     samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE)
     if ( !sample_column %in% colnames(samplesheet) ) {
@@ -55,47 +66,56 @@ tryCatch({
     stop("Error loading samplesheet CSV: ", e$message)
 })
 
+# PARSE FACTORS/LEVELS FROM YML FILE --------------------------------
 ## Collect all variables and factors from the yml file
-var <- list(); contrasts_list <- list(); for (FORMULA in models$models) {
+## * var_list:        collects all factors levels for each variable.
+## * contrast_list:   collects info for each contrast
 
-    ## Populate list
+### Initialize empty lists
+var <- list()
+contrasts_list <- list()
+
+## Iterate through models (formula)
+for (FORMULA in models$models) {
+
+    ## Iterate through contrasts for each model (formula)
     for (CONTRAST in FORMULA$contrasts) {
-        ## Get column name: first comparison's element
+        ### Get column name: first comparison's element
         name <- CONTRAST$comparison[1]
 
-        ## Get factors
+        ### Get factors (rest of components)
         variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
 
-        ## Populate contrasts_list for model validation
-        contrasts_list[[ CONTRAST$id ]] <- list(
+        ### Populate contrasts_list for later model validation
+        contrasts_list[[ CONTRAST$id ]] <- list(                ## Adds the ID as main identifier
             "formula" = FORMULA$formula,
             "variable" = name,
             "contrast" = variables,
-            "blocking_factors" = CONTRAST$blocking_factors
+            "blocking_factors" = CONTRAST$blocking_factors      ## This list can be later extended for "make_contrast" option is required
         )
 
-        ## If the column is already present in the list, add more components to it
+        ### Start populating the variables_list
+        #### If the column is already present in the list, add more components to it
         if (name %in% names(var) ) {
-            variables <- unique( c(var[[ name ]], CONTRAST$comparison[2:3] ))
-            var[[ name ]] <- variables
-            cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "")
-        ## Default to new variable
+            variables <- unique( c(var[[ name ]], variables )) ## Combine previous levels with the new ones without duplicating them
+            var[[ name ]] <- variables                         ## assign the values to the list
+            #cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "")
+        #### Default to new element
         } else {
-            cat("Detected variable:", name, "\n")
-            var[[ length(var) + 1 ]] <- variables
-            names(var)[length(var) ] <- name
+            #cat("Detected variable:", name, "\n")
+            var[[ length(var) + 1 ]] <- variables              ## Asign levels to a new entry
+            names(var)[length(var) ] <- name                   ## Specify the variable name
         }
 
         ## Get blocking factors
         blocking <- c()
-        if ( !is.null(CONTRAST$blocking_factors) ) {
-            cat("Blocking factors detected for variable", name, "\n")
+        if ( !is.null(CONTRAST$blocking_factors) ) { ## Check if they were defined in the yml
+            #cat("Blocking factors detected for variable", name, "\n")
             blocking <- CONTRAST$blocking_factors
-            cat(paste(blocking, collapse = ""), "\n")
         }
 
-        ## Add blockig variables
-        if ( "blocking_factors" %in% names(var) ) {
+        ## Add blocking variables
+        if ( "blocking_factors" %in% names(var) ) { ## Check if the category exists from previous iterations
             var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking ))
         } else {
             var[[ "blocking_factors" ]] <- unique( blocking )
@@ -103,18 +123,14 @@ var <- list(); contrasts_list <- list(); for (FORMULA in models$models) {
     }
 }
 
-## Print explicit message
+## Print explicit messages
 for (INDEX in 1:length(var)) {
-    cat("Detected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\n", sep = "")
+    cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "")
 }
 
-
-## Function to validate the samplesheet against the yml file
-validate_model <- function(sample_column, variables, samplesheet) { # variables: list; samplesheet: df
-
-    #variables   <- var
-    #samplesheet <- samplesheet
-    undesired_chars <- "[^a-zA-Z0-9_]"
+# VALIDATE SAMPLESHEET BASED ON YML FILE ----------------------------
+## Define function to compare YML and samplesheet
+validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df
 
     #######################################################
     ##
@@ -144,7 +160,10 @@ validate_model <- function(sample_column, variables, samplesheet) { # variables:
     # Initialize vector to report continuos variables
     continuous <- c()
 
-    ## Check samplesheet names for invalid characters
+    ## Do not allow special characters
+    undesired_chars <- "[^a-zA-Z0-9_.]"
+
+    ## Check samplesheet names for invalid colnames
     df_colnames <- names(samplesheet)
     true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars))
 
@@ -247,57 +266,80 @@ validate_model <- function(sample_column, variables, samplesheet) { # variables:
         }
     }
 
+    ## Report if any variable was imported as continuous
     if ( !is.null(continuous)) {
         warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" )))
     }
 
+    ## Report ERRORS and stop
     if (length(errors) > 0 ) {
         stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = ""))
     }
 
-    ## Generate validated samplesheet
+    ## Generate validated phenotypic table
     tryCatch({
+        ## Get desired columns, starting with the one containing sample identificators
         selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors)
         pheno_table      <- samplesheet %>% dplyr::select(all_of(selected_columns))
+
     }, error = function(e) {
-        stop("Error generating validated samplesheet: ", e$message)
+        stop("Error generating validated phenotypic table: ", e$message)
     })
 
-    return(list( 'pheno_table' = pheno_table, 'errors' = errors, 'warnings' = warnings) )
+    ## Return list
+    return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) )
 
 }
 
 ## Collect all outputs from function: errors must be empty, warnings can have messages
-phenotable_errors_warnings <- validate_model(sample_column, var, samplesheet)
+phenotable_warnings <- validate_model(sample_column, var, samplesheet)
 
 ## Cat warnings messages
-cat(phenotable_errors_warnings[[ 3 ]], "\n")
+cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n")
 
-## Get validated sample sheet
-pheno_table <- phenotable_errors_warnings[[ 1 ]]
+## Get validated pheno table
+pheno_table <- phenotable_warnings[[1]]
 
-## CHECK THAT THE MODELS ARE FULL RANKED
+## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED
 check_model_contrasts <- function(contrasts_list, colData) {
+
+    #######################################################
+    ##
+    ## Function that takes a list with models + variables, and a phenotypic table
+    ##
+    ## The function evaluates that:
+    ## * The formulas are usable
+    ## * The models generated are full ranked
+    ##
+    ## Finally, the function returns
+    ## * An list with all designs
+    ## * full ranked: true/false
+    ##
+    ## If the error vector contains values, the script will end with a non-zero status.
+    ##
+    #######################################################
+
+    ### Initialize list for models descriptors
+    design_list <- list()
+    ### Iterate over each model
     for (model_name in names(contrasts_list)) {
 
-        # Extract model components
+        ## Extract model components
         model        <- contrasts_list[[model_name]]
-        base_formula <- as.formula(model$formula)  # Ensure formula object
+        base_formula <- as.formula(model$formula)
         variable     <- model$variable
         contrast     <- model$contrast
-        blocking     <- model$blocking_factor  # Extract blocking factor
+        blocking     <- model$blocking_factor
 
-        # Check if blocking factor is already in the formula
+        ## Check if blocking factor is already in the formula
         if (!is.null(blocking)) {
-            # Get terms from the formula
+            ## Get terms from the formula
             formula_terms <- all.vars(base_formula)
 
-            # If blocking factor is not already in the formula, add it
+            ## If blocking factor is not already in the formula, add it
             if (!(blocking %in% formula_terms)) {
-                #cat("\nAdding blocking factor:", blocking, "to the model.\n")
                 updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking))
             } else {
-                #cat("\nBlocking factor:", blocking, "is already in the formula. Skipping addition.\n")
                 updated_formula <- base_formula
             }
         } else {
@@ -308,32 +350,37 @@ check_model_contrasts <- function(contrasts_list, colData) {
 
         # Build the design matrix
         design_matrix <- model.matrix(updated_formula, data = colData)
-        cat("Design matrix:\n")
         print(design_matrix)
 
         # Check the rank of the design matrix
         rank <- qr(design_matrix)$rank
         expected_rank <- ncol(design_matrix)
 
-        if (rank == expected_rank) {
-            cat("The design matrix is full rank.\n")
-        } else {
-            cat("WARNING: The design matrix is NOT full rank!\n")
-        }
-
-        # Check if contrast variable exists in colData
-        if (variable %in% colnames(colData)) {
-            factor_levels <- levels(factor(colData[[variable]], levels = contrast))
-            if (all(contrast %in% factor_levels)) {
-                cat("Contrast levels are valid:", paste(contrast, collapse = " vs "), "\n")
-            } else {
-                cat("ERROR: Contrast levels", paste(contrast, collapse = " vs "),
-                    "do not exist in", variable, "\n")
-            }
-        } else {
-            cat("ERROR: Contrast variable", variable, "is not in colData.\n")
-        }
+        ## Add results to list
+        design_list[[ length(design_list) + 1 ]] <- list(
+            "formula"       = deparse(updated_formula),
+            "design_matrix" = design_matrix,
+            "full_rank"     = rank == expected_rank
+        )
+        names(design_list)[ length(design_list) ]  <- model_name
     }
+
+    return(design_list)
+}
+
+design_results <- check_model_contrasts(contrasts_list, pheno_table)
+
+# EXPORT DATA -------------------------------------------------------
+## Export pheno table
+write_csv(pheno_table, "pheno_table.csv")
+
+## Export designs to JSON
+jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE)
+
+## Export warnings
+if ( !is.null(phenotable_warnings[[ 2 ]]) ) {
+    jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE)
 }
 
-check_model_contrasts(contrasts_list, pheno_table)
+## Export RData
+save.image("models.RData")
diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml
similarity index 100%
rename from modules/local/validate_model/input_test/contrasts.yml
rename to modules/local/validatemodel/tests/contrasts.yml
diff --git a/modules/local/validatemodel/tests/contrasts_maxquant.yml b/modules/local/validatemodel/tests/contrasts_maxquant.yml
new file mode 100644
index 00000000..05ee8e5e
--- /dev/null
+++ b/modules/local/validatemodel/tests/contrasts_maxquant.yml
@@ -0,0 +1,12 @@
+models:
+  - formula: "~ treatment"
+    contrasts:
+      - id: "treatment_mCherry_hND6"
+        comparison: ["treatment", "mCherry", "hND6"]
+
+      - id: "treatment_mCherry_hND6_sample_number"
+        comparison: ["treatment", "mCherry", "hND6"]
+        blocking_factors: ["sample_number"]
+
+      - id: "treatment234"
+        comparison: ["treatment", "mCherry", "hND6"]
diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test
new file mode 100644
index 00000000..d3f2101c
--- /dev/null
+++ b/modules/local/validatemodel/tests/main.nf.test
@@ -0,0 +1,39 @@
+nextflow_process {
+
+    name "Test Process VALIDATE_MODEL"
+    script "../main.nf"
+    process "VALIDATE_MODEL"
+    config "./nextflow.config"
+
+    tag "modules"
+    tag "models"
+    tag "yml"
+    tag "samplesheet"
+
+    test("Samplesheet - YML file - RNASEQ TEST") {
+
+        when {
+            params {
+                observations_id_col = 'sample'
+            }
+            process {
+                """
+                // Samplesheet
+                input[0] =
+                    Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv', checkIfExists: true)
+
+                // Models_yml sheet
+                input[1] =
+                    Channel.fromPath('${moduleDir}/tests/contrasts.yml', checkIfExists: true)
+                        .map{ [ [id: "test_yml"], it ] }
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success }
+            )
+        }
+    }
+}
diff --git a/modules/local/validatemodel/tests/nextflow.config b/modules/local/validatemodel/tests/nextflow.config
new file mode 100644
index 00000000..52a943d5
--- /dev/null
+++ b/modules/local/validatemodel/tests/nextflow.config
@@ -0,0 +1,6 @@
+nextflow.enable.moduleBinaries = true
+process {
+    withName: 'VALIDATE_MODEL' {
+        ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' }
+    }
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 70af98c8..6a51601d 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -42,7 +42,7 @@
                     "type": "string",
                     "description": "A CSV file describing sample contrasts",
                     "help_text": "This file is used to define groups of samples from 'input' to compare.  It must contain at least the columns 'variable', 'reference', 'target' and 'blocking', where 'variable' is a column in the input sample sheet, 'reference' and 'target' are values in that column, and blocking is a colon-separated list of additional 'blocking' variables (can be an empty string)",
-                    "pattern": "^\\S+\\.(csv|tsv)$",
+                    "pattern": "^\\S+\\.(csv|tsv|yml|yaml)$",
                     "format": "file-path",
                     "mimetype": "text/csv",
                     "fa_icon": "fas fa-adjust"
diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf
index e49bebda..e70270b1 100644
--- a/workflows/differentialabundance.nf
+++ b/workflows/differentialabundance.nf
@@ -132,6 +132,7 @@ include { PROTEUS_READPROTEINGROUPS as PROTEUS              } from '../modules/n
 include { GEOQUERY_GETGEO                                   } from '../modules/nf-core/geoquery/getgeo/main'
 include { ZIP as MAKE_REPORT_BUNDLE                         } from '../modules/nf-core/zip/main'
 include { softwareVersionsToYAML                            } from '../subworkflows/nf-core/utils_nfcore_pipeline'
+include { VALIDATE_MODEL                                    } from '../modules/local/validatemodel/main'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -147,6 +148,13 @@ workflow DIFFERENTIALABUNDANCE {
     // Channel for the contrasts file
     ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]])
 
+
+    // Run module to validate models
+    VALIDATE_MODEL(
+        ch_input,
+        ch_contrasts_file
+    )
+
     // If we have affy array data in the form of CEL files we'll be deriving
     // matrix and annotation from them
 

From c2f666fa27987448c0277e834cde53fac7e0f233 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Wed, 18 Dec 2024 14:25:34 +0000
Subject: [PATCH 05/10] change function structure

---
 .../resources/usr/bin/validate_model.R        | 135 ++++++++++++------
 1 file changed, 91 insertions(+), 44 deletions(-)

diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R
index bde457b1..f7470ff1 100755
--- a/modules/local/validatemodel/resources/usr/bin/validate_model.R
+++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R
@@ -42,8 +42,8 @@ if (is.null(opt$yml) || is.null(opt$samplesheet) ) {
 }
 
 ### Collect parameters (easier for dev and testing)
-path_yml         <- opt$yml             # "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #
-path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #
+path_yml         <- opt$yml             # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" #
+path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" #
 sample_column    <- opt$sample_id_col   # "sample" #
 
 # LOAD FILES --------------------------------------------------------
@@ -71,58 +71,99 @@ tryCatch({
 ## * var_list:        collects all factors levels for each variable.
 ## * contrast_list:   collects info for each contrast
 
-### Initialize empty lists
-var <- list()
-contrasts_list <- list()
+process_models <- function(models) {
+    # Initialize lists to store the output
+    contrasts_list <- list()
+    var <- list()
 
-## Iterate through models (formula)
-for (FORMULA in models$models) {
-
-    ## Iterate through contrasts for each model (formula)
-    for (CONTRAST in FORMULA$contrasts) {
-        ### Get column name: first comparison's element
-        name <- CONTRAST$comparison[1]
+    # Ensure models$models is not null or empty
+    if (is.null(models$models) || length(models$models) == 0) {
+        stop("models$models is null or empty. Please provide valid input.")
+    }
 
-        ### Get factors (rest of components)
-        variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
+    # Temporary storage for gathering contrasts per variable
+    temp_var <- list()
+    blocking_vars <- c()
 
-        ### Populate contrasts_list for later model validation
-        contrasts_list[[ CONTRAST$id ]] <- list(                ## Adds the ID as main identifier
-            "formula" = FORMULA$formula,
-            "variable" = name,
-            "contrast" = variables,
-            "blocking_factors" = CONTRAST$blocking_factors      ## This list can be later extended for "make_contrast" option is required
-        )
+    # Iterate through models (formula)
+    for (FORMULA in models$models) {
 
-        ### Start populating the variables_list
-        #### If the column is already present in the list, add more components to it
-        if (name %in% names(var) ) {
-            variables <- unique( c(var[[ name ]], variables )) ## Combine previous levels with the new ones without duplicating them
-            var[[ name ]] <- variables                         ## assign the values to the list
-            #cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "")
-        #### Default to new element
-        } else {
-            #cat("Detected variable:", name, "\n")
-            var[[ length(var) + 1 ]] <- variables              ## Asign levels to a new entry
-            names(var)[length(var) ] <- name                   ## Specify the variable name
+        # Ensure FORMULA$contrasts is not null or empty
+        if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) {
+            stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.")
         }
 
-        ## Get blocking factors
-        blocking <- c()
-        if ( !is.null(CONTRAST$blocking_factors) ) { ## Check if they were defined in the yml
-            #cat("Blocking factors detected for variable", name, "\n")
-            blocking <- CONTRAST$blocking_factors
+        # Iterate through contrasts for each model (formula)
+        for (CONTRAST in FORMULA$contrasts) {
+
+            # Validate CONTRAST$id
+            if (is.null(CONTRAST$id)) {
+                stop("Missing CONTRAST$id detected. Unable to proceed.")
+            }
+
+            if (CONTRAST$id %in% names(contrasts_list)) {
+                stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed."))
+            }
+
+            # Get column name: first comparison's element
+            name <- CONTRAST$comparison[1]
+
+            # Check if name is valid
+            if (is.null(name) || name == "") {
+                stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.")
+            }
+
+            # Get factors (rest of components)
+            variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
+
+            # Populate contrasts_list for later model validation
+            contrasts_list[[CONTRAST$id]] <- list(
+                "formula" = FORMULA$formula,
+                "variable" = name,
+                "contrast" = variables,
+                "blocking_factors" = CONTRAST$blocking_factors
+            )
+
+            # Gather contrasts per variable into temporary storage
+            if (name %in% names(temp_var)) {
+                temp_var[[name]] <- unique(c(temp_var[[name]], variables))
+            } else {
+                temp_var[[name]] <- variables
+            }
+
+            # Gather blocking factors
+            if (!is.null(CONTRAST$blocking_factors)) {
+                blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors))
+            }
         }
+    }
 
-        ## Add blocking variables
-        if ( "blocking_factors" %in% names(var) ) { ## Check if the category exists from previous iterations
-            var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking ))
+    # Consolidate gathered contrasts into `var`
+    for (name in names(temp_var)) {
+        if (name %in% names(var)) {
+            var[[name]] <- unique(c(var[[name]], temp_var[[name]]))
         } else {
-            var[[ "blocking_factors" ]] <- unique( blocking )
+            var[[name]] <- temp_var[[name]]
         }
     }
+
+    # Add blocking variables to `var`
+    var[[ "blocking_factors" ]] <- blocking_vars
+
+    # Return the populated lists
+    return(list(
+        contrasts_list = contrasts_list,
+        var = var
+    ))
 }
 
+## Process models
+models_lists   <- process_models(models)
+### Extract models contrasts
+contrasts_list <- models_lists[["contrasts_list"]]
+### Extract variables and levels
+var            <- models_lists[["var"]]
+
 ## Print explicit messages
 for (INDEX in 1:length(var)) {
     cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "")
@@ -221,7 +262,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col
             }
 
             ## Check that the column data does not contain undesidered characters
-            if ( sum( stringr::str_detect(samplesheet[[ VARIABLE ]], pattern = undesired_chars )) > 0 ) {
+            if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) {
                 errors <- c(errors,
                     paste0("Column ", VARIABLE, " contains undesired characters\n")
                 )
@@ -273,7 +314,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col
 
     ## Report ERRORS and stop
     if (length(errors) > 0 ) {
-        stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = ""))
+        stop(cat("\033[1;31mSome errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\033[0m\n", sep = ""))
     }
 
     ## Generate validated phenotypic table
@@ -350,7 +391,6 @@ check_model_contrasts <- function(contrasts_list, colData) {
 
         # Build the design matrix
         design_matrix <- model.matrix(updated_formula, data = colData)
-        print(design_matrix)
 
         # Check the rank of the design matrix
         rank <- qr(design_matrix)$rank
@@ -365,6 +405,13 @@ check_model_contrasts <- function(contrasts_list, colData) {
         names(design_list)[ length(design_list) ]  <- model_name
     }
 
+    for (DESIGN in names(design_list)) {
+        cat("Design ", DESIGN,
+            if( design_list[[ DESIGN ]]$full_rank ) { " is " } else { " is not "},
+            "full ranked\n", sep = ""
+        )
+    }
+
     return(design_list)
 }
 

From 6614f4d3927d29b612b0de245033be8e489eca33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Wed, 18 Dec 2024 16:36:57 +0000
Subject: [PATCH 06/10] fix bug in module

---
 modules/local/validatemodel/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf
index b8cf31ce..c7962219 100644
--- a/modules/local/validatemodel/main.nf
+++ b/modules/local/validatemodel/main.nf
@@ -8,7 +8,7 @@ process VALIDATE_MODEL {
         'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }"
 
     input:
-    tuple path(samplesheet)
+    path samplesheet
     tuple val(meta), path(models_yml)
 
     output:

From d9d18c5c4724d3377e2179f67ee98ea019faffbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Thu, 19 Dec 2024 17:03:21 +0000
Subject: [PATCH 07/10] update yml structure input limited to contrasts only

---
 .../resources/usr/bin/validate_model.R        | 125 +++--
 .../resources/usr/bin/validate_model.R.bkp    | 433 ++++++++++++++++++
 .../local/validatemodel/tests/contrasts.yml   |  18 +-
 .../tests/contrasts_maxquant.yml              |  12 -
 .../local/validatemodel/tests/main.nf.test    |   3 +-
 .../validatemodel/tests/main.nf.test.snap     |  57 +++
 6 files changed, 555 insertions(+), 93 deletions(-)
 create mode 100755 modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp
 delete mode 100644 modules/local/validatemodel/tests/contrasts_maxquant.yml
 create mode 100644 modules/local/validatemodel/tests/main.nf.test.snap

diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R
index f7470ff1..62e86932 100755
--- a/modules/local/validatemodel/resources/usr/bin/validate_model.R
+++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R
@@ -43,7 +43,7 @@ if (is.null(opt$yml) || is.null(opt$samplesheet) ) {
 
 ### Collect parameters (easier for dev and testing)
 path_yml         <- opt$yml             # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" #
-path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" #
+path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #
 sample_column    <- opt$sample_id_col   # "sample" #
 
 # LOAD FILES --------------------------------------------------------
@@ -59,7 +59,7 @@ tryCatch({
 tryCatch({
     samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE)
     if ( !sample_column %in% colnames(samplesheet) ) {
-        stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") )
+        stop(paste0("The sample identificator column '", sample_column, "' is not present in the samplesheet file") )
     }
     cat("Loaded samplesheet successfully.\n")
 }, error = function(e) {
@@ -72,78 +72,68 @@ tryCatch({
 ## * contrast_list:   collects info for each contrast
 
 process_models <- function(models) {
+
     # Initialize lists to store the output
     contrasts_list <- list()
     var <- list()
 
     # Ensure models$models is not null or empty
-    if (is.null(models$models) || length(models$models) == 0) {
-        stop("models$models is null or empty. Please provide valid input.")
+    if (is.null(models$contrasts) || length(models$contrasts) == 0) {
+        stop("models$contrasts is null or empty. Please provide valid input.")
     }
 
     # Temporary storage for gathering contrasts per variable
     temp_var <- list()
     blocking_vars <- c()
 
-    # Iterate through models (formula)
-    for (FORMULA in models$models) {
+    for (CONTRAST in models$contrasts) {
 
-        # Ensure FORMULA$contrasts is not null or empty
-        if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) {
-            stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.")
+        # Validate CONTRAST$id
+        if (is.null(CONTRAST$id)) {
+            stop("Missing CONTRAST$id detected.")
         }
 
-        # Iterate through contrasts for each model (formula)
-        for (CONTRAST in FORMULA$contrasts) {
-
-            # Validate CONTRAST$id
-            if (is.null(CONTRAST$id)) {
-                stop("Missing CONTRAST$id detected. Unable to proceed.")
-            }
-
-            if (CONTRAST$id %in% names(contrasts_list)) {
-                stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed."))
-            }
+        if (CONTRAST$id %in% names(contrasts_list)) {
+            stop(paste0("Duplicate CONTRAST$id detected: ", CONTRAST$id, "."))
+        }
 
-            # Get column name: first comparison's element
-            name <- CONTRAST$comparison[1]
+        # Get column name: first comparison's element
+        variable <- CONTRAST$comparison[1]
 
-            # Check if name is valid
-            if (is.null(name) || name == "") {
-                stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.")
-            }
+        # Check if name is valid
+        if (is.null(variable) || variable == "") {
+            stop("Invalid `variable` defined in `CONTRAST$comparison[1]`.")
+        }
 
-            # Get factors (rest of components)
-            variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
+        # Get factors (rest of components)
+        levels <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
 
-            # Populate contrasts_list for later model validation
-            contrasts_list[[CONTRAST$id]] <- list(
-                "formula" = FORMULA$formula,
-                "variable" = name,
-                "contrast" = variables,
-                "blocking_factors" = CONTRAST$blocking_factors
-            )
+        # Populate contrasts_list for later model validation
+        contrasts_list[[CONTRAST$id]] <- list(
+            "variable" = variable,
+            "contrast" = levels,
+            "blocking_factors" = CONTRAST$blocking_factors
+        )
 
-            # Gather contrasts per variable into temporary storage
-            if (name %in% names(temp_var)) {
-                temp_var[[name]] <- unique(c(temp_var[[name]], variables))
-            } else {
-                temp_var[[name]] <- variables
-            }
+        # Gather contrasts per variable into temporary storage
+        if (variable %in% names(temp_var)) {
+            temp_var[[variable]] <- unique(c(temp_var[[variable]], levels))
+        } else {
+            temp_var[[variable]] <- levels
+        }
 
-            # Gather blocking factors
-            if (!is.null(CONTRAST$blocking_factors)) {
-                blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors))
-            }
+        # Gather blocking factors
+        if (!is.null(CONTRAST$blocking_factors)) {
+            blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors))
         }
     }
 
     # Consolidate gathered contrasts into `var`
-    for (name in names(temp_var)) {
-        if (name %in% names(var)) {
-            var[[name]] <- unique(c(var[[name]], temp_var[[name]]))
+    for (variable in names(temp_var)) {
+        if (variable %in% names(var)) {
+            var[[variable]] <- unique(c(var[[variable]], temp_var[[variable]]))
         } else {
-            var[[name]] <- temp_var[[name]]
+            var[[variable]] <- temp_var[[variable]]
         }
     }
 
@@ -160,9 +150,9 @@ process_models <- function(models) {
 ## Process models
 models_lists   <- process_models(models)
 ### Extract models contrasts
-contrasts_list <- models_lists[["contrasts_list"]]
+contrasts_list <- models_lists[[ "contrasts_list" ]]
 ### Extract variables and levels
-var            <- models_lists[["var"]]
+var            <- models_lists[[ "var" ]]
 
 ## Print explicit messages
 for (INDEX in 1:length(var)) {
@@ -261,7 +251,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col
                 errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " ")))
             }
 
-            ## Check that the column data does not contain undesidered characters
+            ## Check that the column data does not contain undesired characters
             if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) {
                 errors <- c(errors,
                     paste0("Column ", VARIABLE, " contains undesired characters\n")
@@ -344,6 +334,8 @@ pheno_table <- phenotable_warnings[[1]]
 ## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED
 check_model_contrasts <- function(contrasts_list, colData) {
 
+    #contrasts_list <- contrasts_list
+    #colData        <- pheno_table
     #######################################################
     ##
     ## Function that takes a list with models + variables, and a phenotypic table
@@ -364,33 +356,26 @@ check_model_contrasts <- function(contrasts_list, colData) {
     design_list <- list()
     ### Iterate over each model
     for (model_name in names(contrasts_list)) {
+        #model_name <- names(contrasts_list[2])
 
         ## Extract model components
         model        <- contrasts_list[[model_name]]
-        base_formula <- as.formula(model$formula)
         variable     <- model$variable
         contrast     <- model$contrast
-        blocking     <- model$blocking_factor
+        blocking     <- model$blocking_factors
 
-        ## Check if blocking factor is already in the formula
-        if (!is.null(blocking)) {
-            ## Get terms from the formula
-            formula_terms <- all.vars(base_formula)
+        ## Create formula
+        ### Gather components
+        formula_terms <- c(variable, blocking)
 
-            ## If blocking factor is not already in the formula, add it
-            if (!(blocking %in% formula_terms)) {
-                updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking))
-            } else {
-                updated_formula <- base_formula
-            }
-        } else {
-            updated_formula <- base_formula
-        }
+        ### Convert into formula
+        # TODO: Add flexibility for interactive terms!
+        dynamic_formula <- as.formula(paste("~", paste(formula_terms, collapse = "+")))
 
-        cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n")
+        cat("\nChecking:", model_name, "\nFormula:", deparse(dynamic_formula), "\n")
 
         # Build the design matrix
-        design_matrix <- model.matrix(updated_formula, data = colData)
+        design_matrix <- model.matrix(dynamic_formula, data = colData)
 
         # Check the rank of the design matrix
         rank <- qr(design_matrix)$rank
@@ -398,7 +383,7 @@ check_model_contrasts <- function(contrasts_list, colData) {
 
         ## Add results to list
         design_list[[ length(design_list) + 1 ]] <- list(
-            "formula"       = deparse(updated_formula),
+            "formula"       = deparse(dynamic_formula),
             "design_matrix" = design_matrix,
             "full_rank"     = rank == expected_rank
         )
diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp b/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp
new file mode 100755
index 00000000..f7470ff1
--- /dev/null
+++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R.bkp
@@ -0,0 +1,433 @@
+#!/usr/bin/env Rscript
+
+## TODO: add make contrast arguments
+
+# LOAD LIBRARIES ----------------------------------------------------
+suppressWarnings(suppressMessages({
+    library(tidyverse)
+    library(optparse)
+    library(yaml)
+    library(jsonlite)
+}))
+
+# PARSE ARGUMENTS ---------------------------------------------------
+## Generate list
+option_list <- list(
+    make_option(c("-y", "--yml"), type = "character", default = NULL,
+                help = "Path to the models.yml file", metavar = "character"),
+    make_option(c("-s", "--samplesheet"), type = "character", default = NULL,
+                help = "Path to the samplesheet CSV file", metavar = "character"),
+    make_option(c("-i", "--sample_id_col"), type = "character", default = "sample",
+                help = "Column that contains sample identificators", metavar = "character")
+)
+
+## Parse command-line arguments
+opt_parser <-
+    OptionParser(
+        option_list = option_list,
+        description = "Validate a sample sheet against a YML file for model designs.",
+        epilogue =
+        "
+        The process will look for variables and factors in the YML file and validate that they are present in the sample sheet.
+        Will also look for unwanted characters within columns and colnames.
+        Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not.
+        "
+    )
+opt <- parse_args(opt_parser)
+
+## Validate input arguments
+### Required arguments
+if (is.null(opt$yml) || is.null(opt$samplesheet) ) {
+    stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE)
+}
+
+### Collect parameters (easier for dev and testing)
+path_yml         <- opt$yml             # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" #
+path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet_witherrors.csv" #
+sample_column    <- opt$sample_id_col   # "sample" #
+
+# LOAD FILES --------------------------------------------------------
+## Load models.yml file
+tryCatch({
+    models <- read_yaml(path_yml)
+    cat("Loaded YML file successfully.\n")
+}, error = function(e) {
+    stop("Error loading YML file: ", e$message)
+})
+
+## Load samplesheet CSV file
+tryCatch({
+    samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE)
+    if ( !sample_column %in% colnames(samplesheet) ) {
+        stop(paste0("Column ", sample_column, " not present in colnames(samplesheet)") )
+    }
+    cat("Loaded samplesheet successfully.\n")
+}, error = function(e) {
+    stop("Error loading samplesheet CSV: ", e$message)
+})
+
+# PARSE FACTORS/LEVELS FROM YML FILE --------------------------------
+## Collect all variables and factors from the yml file
+## * var_list:        collects all factors levels for each variable.
+## * contrast_list:   collects info for each contrast
+
+process_models <- function(models) {
+    # Initialize lists to store the output
+    contrasts_list <- list()
+    var <- list()
+
+    # Ensure models$models is not null or empty
+    if (is.null(models$models) || length(models$models) == 0) {
+        stop("models$models is null or empty. Please provide valid input.")
+    }
+
+    # Temporary storage for gathering contrasts per variable
+    temp_var <- list()
+    blocking_vars <- c()
+
+    # Iterate through models (formula)
+    for (FORMULA in models$models) {
+
+        # Ensure FORMULA$contrasts is not null or empty
+        if (is.null(FORMULA$contrasts) || length(FORMULA$contrasts) == 0) {
+            stop("FORMULA$contrasts is null or empty. Unable to proceed with this model.")
+        }
+
+        # Iterate through contrasts for each model (formula)
+        for (CONTRAST in FORMULA$contrasts) {
+
+            # Validate CONTRAST$id
+            if (is.null(CONTRAST$id)) {
+                stop("Missing CONTRAST$id detected. Unable to proceed.")
+            }
+
+            if (CONTRAST$id %in% names(contrasts_list)) {
+                stop(paste("Duplicate CONTRAST$id detected:", CONTRAST$id, ". Unable to proceed."))
+            }
+
+            # Get column name: first comparison's element
+            name <- CONTRAST$comparison[1]
+
+            # Check if name is valid
+            if (is.null(name) || name == "") {
+                stop("Invalid name detected in CONTRAST$comparison. Unable to proceed.")
+            }
+
+            # Get factors (rest of components)
+            variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)]
+
+            # Populate contrasts_list for later model validation
+            contrasts_list[[CONTRAST$id]] <- list(
+                "formula" = FORMULA$formula,
+                "variable" = name,
+                "contrast" = variables,
+                "blocking_factors" = CONTRAST$blocking_factors
+            )
+
+            # Gather contrasts per variable into temporary storage
+            if (name %in% names(temp_var)) {
+                temp_var[[name]] <- unique(c(temp_var[[name]], variables))
+            } else {
+                temp_var[[name]] <- variables
+            }
+
+            # Gather blocking factors
+            if (!is.null(CONTRAST$blocking_factors)) {
+                blocking_vars <- unique(c(blocking_vars, CONTRAST$blocking_factors))
+            }
+        }
+    }
+
+    # Consolidate gathered contrasts into `var`
+    for (name in names(temp_var)) {
+        if (name %in% names(var)) {
+            var[[name]] <- unique(c(var[[name]], temp_var[[name]]))
+        } else {
+            var[[name]] <- temp_var[[name]]
+        }
+    }
+
+    # Add blocking variables to `var`
+    var[[ "blocking_factors" ]] <- blocking_vars
+
+    # Return the populated lists
+    return(list(
+        contrasts_list = contrasts_list,
+        var = var
+    ))
+}
+
+## Process models
+models_lists   <- process_models(models)
+### Extract models contrasts
+contrasts_list <- models_lists[["contrasts_list"]]
+### Extract variables and levels
+var            <- models_lists[["var"]]
+
+## Print explicit messages
+for (INDEX in 1:length(var)) {
+    cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "")
+}
+
+# VALIDATE SAMPLESHEET BASED ON YML FILE ----------------------------
+## Define function to compare YML and samplesheet
+validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df
+
+    #######################################################
+    ##
+    ## Function that takes a list of variables (and expected values) from the models.yml file that must be present in the sample sheet.
+    ##
+    ## The function evaluates that:
+    ## * Every variable is present in the sample sheet.
+    ## * Every variable has valid name
+    ## * There are no NAs
+    ## * All expected values are present
+    ## * All samples contains a valid level.
+    ##
+    ## Finally, the function returns
+    ## * An error vector containing all errors found.
+    ## * A warnings vector
+    ##
+    ## If the error vector contains values, the script will end with a non-zero status.
+    ##
+    #######################################################
+
+    ## Initialize errors vector
+    errors <- c()
+
+    ## Initialize warning errors
+    warnings <- c()
+
+    # Initialize vector to report continuos variables
+    continuous <- c()
+
+    ## Do not allow special characters
+    undesired_chars <- "[^a-zA-Z0-9_.]"
+
+    ## Check samplesheet names for invalid colnames
+    df_colnames <- names(samplesheet)
+    true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars))
+
+    if ( sum(true_columns) > 0 ) {
+        errors <- c(errors,
+            paste0("The following columns contain undesired characters: ", paste(df_colnames[true_columns], collapse = " "))
+        )
+    }
+
+    ## Check that blocking variables exists and do not contain NAs, if they were specified
+    blocking_factors <- c()
+    if ( !is.null(variables$blocking_factors) ) {
+        blocking_factors <- variables$blocking_factors
+
+        for (VARIABLE in blocking_factors) {
+            ## Check that the column exists
+            if ( VARIABLE %in% colnames(samplesheet) ) {
+                ## Check if there are NAs
+                na_rows <- is.na( samplesheet[[ VARIABLE ]])
+                if ( sum(na_rows) > 0 ) {
+                    errors <- c(errors, paste0("Blocking factor ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " ")))
+                }
+            } else {
+                errors <- c(
+                    errors,
+                    paste0("Blocking factor ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files."))
+            }
+
+            ## Alert about continuous variables
+            if ( is.numeric(samplesheet[[ VARIABLE ]]) ) {
+                continuous <- c(continuous, VARIABLE)
+            } else {
+                paste(VARIABLE, "not continuous")
+            }
+        }
+    }
+
+    ## Check variables and levels specified
+    for (VARIABLE in names(variables)[ names(variables) != "blocking_factors" ] ) { ## Exclude blocking factors, they cannot be treated equally here
+
+        ## Check that the column exists
+        if ( VARIABLE %in% colnames(samplesheet) ) {
+
+            ## Alert about continuous variables
+            if ( is.numeric(samplesheet[[ VARIABLE ]]) ) {
+                continuous <- unique(c(continuous, VARIABLE))
+            } else {
+                paste(VARIABLE, "is not continuous")
+            }
+
+            ## Check if there are NAs
+            na_rows <- is.na( samplesheet[[ VARIABLE ]] )
+            if ( sum(na_rows) > 0 ) {
+                errors <- c(errors, paste0("Column ", VARIABLE, " contains NA/s in the following rows: ", paste(which(na_rows), collapse = " ")))
+            }
+
+            ## Check that the column data does not contain undesidered characters
+            if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) {
+                errors <- c(errors,
+                    paste0("Column ", VARIABLE, " contains undesired characters\n")
+                )
+            }
+
+            ## Extract the expected levels according to the YML file
+            expected_factors_levels <- c( rep(NA, length(variables[[ VARIABLE ]])) )
+
+            ## Assign names
+            names(expected_factors_levels) <- variables[[ VARIABLE ]]
+
+            ## Extract the levels from the sample sheet
+            obtained_factors_levels <- unique(samplesheet[[ VARIABLE ]])
+
+            ## Check that all expected levels for this variable are effectively present
+            for (LEVEL in names(expected_factors_levels) ) {
+                expected_factors_levels[ LEVEL ] <- LEVEL %in% obtained_factors_levels
+            }
+
+            if( !all(expected_factors_levels) ) {
+                errors <- c(
+                    errors,
+                    paste0(
+                        "Missing factor levels for variable '", VARIABLE, "'. ",
+                        "Present levels: ", paste( names( expected_factors_levels[ expected_factors_levels ]), collapse = " " ), ". ",
+                        "Missing levels: ", paste( names( expected_factors_levels[!expected_factors_levels ]), collapse = " " )
+                        )
+                    )
+            }
+
+            ## Check whether there are levels in the table that are not present in the models definitions
+            if (!all( obtained_factors_levels[!is.na(obtained_factors_levels)] %in% names(expected_factors_levels)) ) {
+                warnings <- c(warnings,
+                paste0( "The following labels are present in the samplesheet but are not part of the models definition and will be excluded: ", paste( obtained_factors_levels[!obtained_factors_levels %in% names(expected_factors_levels) ], collapse = " " ) ))
+            }
+
+        ## Report that a column is not even present in the table
+        } else {
+            errors <- c(
+                errors,
+                paste0("Column ", VARIABLE, " not present in sample sheet. Please check the compatibility between ", basename(path_yml), " and ", basename(path_samplesheet), " files."))
+        }
+    }
+
+    ## Report if any variable was imported as continuous
+    if ( !is.null(continuous)) {
+        warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" )))
+    }
+
+    ## Report ERRORS and stop
+    if (length(errors) > 0 ) {
+        stop(cat("\033[1;31mSome errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\033[0m\n", sep = ""))
+    }
+
+    ## Generate validated phenotypic table
+    tryCatch({
+        ## Get desired columns, starting with the one containing sample identificators
+        selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors)
+        pheno_table      <- samplesheet %>% dplyr::select(all_of(selected_columns))
+
+    }, error = function(e) {
+        stop("Error generating validated phenotypic table: ", e$message)
+    })
+
+    ## Return list
+    return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) )
+
+}
+
+## Collect all outputs from function: errors must be empty, warnings can have messages
+phenotable_warnings <- validate_model(sample_column, var, samplesheet)
+
+## Cat warnings messages
+cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n")
+
+## Get validated pheno table
+pheno_table <- phenotable_warnings[[1]]
+
+## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED
+check_model_contrasts <- function(contrasts_list, colData) {
+
+    #######################################################
+    ##
+    ## Function that takes a list with models + variables, and a phenotypic table
+    ##
+    ## The function evaluates that:
+    ## * The formulas are usable
+    ## * The models generated are full ranked
+    ##
+    ## Finally, the function returns
+    ## * An list with all designs
+    ## * full ranked: true/false
+    ##
+    ## If the error vector contains values, the script will end with a non-zero status.
+    ##
+    #######################################################
+
+    ### Initialize list for models descriptors
+    design_list <- list()
+    ### Iterate over each model
+    for (model_name in names(contrasts_list)) {
+
+        ## Extract model components
+        model        <- contrasts_list[[model_name]]
+        base_formula <- as.formula(model$formula)
+        variable     <- model$variable
+        contrast     <- model$contrast
+        blocking     <- model$blocking_factor
+
+        ## Check if blocking factor is already in the formula
+        if (!is.null(blocking)) {
+            ## Get terms from the formula
+            formula_terms <- all.vars(base_formula)
+
+            ## If blocking factor is not already in the formula, add it
+            if (!(blocking %in% formula_terms)) {
+                updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking))
+            } else {
+                updated_formula <- base_formula
+            }
+        } else {
+            updated_formula <- base_formula
+        }
+
+        cat("\nChecking:", model_name, "\nFormula:", deparse(updated_formula), "\n")
+
+        # Build the design matrix
+        design_matrix <- model.matrix(updated_formula, data = colData)
+
+        # Check the rank of the design matrix
+        rank <- qr(design_matrix)$rank
+        expected_rank <- ncol(design_matrix)
+
+        ## Add results to list
+        design_list[[ length(design_list) + 1 ]] <- list(
+            "formula"       = deparse(updated_formula),
+            "design_matrix" = design_matrix,
+            "full_rank"     = rank == expected_rank
+        )
+        names(design_list)[ length(design_list) ]  <- model_name
+    }
+
+    for (DESIGN in names(design_list)) {
+        cat("Design ", DESIGN,
+            if( design_list[[ DESIGN ]]$full_rank ) { " is " } else { " is not "},
+            "full ranked\n", sep = ""
+        )
+    }
+
+    return(design_list)
+}
+
+design_results <- check_model_contrasts(contrasts_list, pheno_table)
+
+# EXPORT DATA -------------------------------------------------------
+## Export pheno table
+write_csv(pheno_table, "pheno_table.csv")
+
+## Export designs to JSON
+jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE)
+
+## Export warnings
+if ( !is.null(phenotable_warnings[[ 2 ]]) ) {
+    jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE)
+}
+
+## Export RData
+save.image("models.RData")
diff --git a/modules/local/validatemodel/tests/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml
index 05ee8e5e..5e893629 100644
--- a/modules/local/validatemodel/tests/contrasts.yml
+++ b/modules/local/validatemodel/tests/contrasts.yml
@@ -1,12 +1,10 @@
-models:
-  - formula: "~ treatment"
-    contrasts:
-      - id: "treatment_mCherry_hND6"
-        comparison: ["treatment", "mCherry", "hND6"]
+contrasts:
+  - id: "treatment_mCherry_hND6"
+    comparison: ["treatment", "mCherry", "hND6"]
 
-      - id: "treatment_mCherry_hND6_sample_number"
-        comparison: ["treatment", "mCherry", "hND6"]
-        blocking_factors: ["sample_number"]
+  - id: "treatment_mCherry_hND6_sample_number"
+    comparison: ["treatment", "mCherry", "hND6"]
+    blocking_factors: ["sample_number"]
 
-      - id: "treatment234"
-        comparison: ["treatment", "mCherry", "hND6"]
+  - id: "treatment234"
+    comparison: ["treatment", "mCherry", "hND6"]
diff --git a/modules/local/validatemodel/tests/contrasts_maxquant.yml b/modules/local/validatemodel/tests/contrasts_maxquant.yml
deleted file mode 100644
index 05ee8e5e..00000000
--- a/modules/local/validatemodel/tests/contrasts_maxquant.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-models:
-  - formula: "~ treatment"
-    contrasts:
-      - id: "treatment_mCherry_hND6"
-        comparison: ["treatment", "mCherry", "hND6"]
-
-      - id: "treatment_mCherry_hND6_sample_number"
-        comparison: ["treatment", "mCherry", "hND6"]
-        blocking_factors: ["sample_number"]
-
-      - id: "treatment234"
-        comparison: ["treatment", "mCherry", "hND6"]
diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test
index d3f2101c..7bc7f316 100644
--- a/modules/local/validatemodel/tests/main.nf.test
+++ b/modules/local/validatemodel/tests/main.nf.test
@@ -32,7 +32,8 @@ nextflow_process {
 
         then {
             assertAll(
-                { assert process.success }
+                { assert process.success },
+                { assert snapshot(process.out).match() }
             )
         }
     }
diff --git a/modules/local/validatemodel/tests/main.nf.test.snap b/modules/local/validatemodel/tests/main.nf.test.snap
new file mode 100644
index 00000000..24f19d55
--- /dev/null
+++ b/modules/local/validatemodel/tests/main.nf.test.snap
@@ -0,0 +1,57 @@
+{
+    "Samplesheet - YML file - RNASEQ TEST": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test_yml"
+                        },
+                        "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403"
+                    ]
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test_yml"
+                        },
+                        "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0"
+                    ]
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950"
+                ],
+                "designs": [
+                    [
+                        {
+                            "id": "test_yml"
+                        },
+                        "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0"
+                    ]
+                ],
+                "pheno_table": [
+                    [
+                        {
+                            "id": "test_yml"
+                        },
+                        "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950"
+                ],
+                "warnings": [
+                    
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.3"
+        },
+        "timestamp": "2024-12-19T16:59:52.146249557"
+    }
+}
\ No newline at end of file

From 6dafd957c9698ddfb4c9d02fca754d9d4ea0af1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Fri, 20 Dec 2024 20:18:02 +0000
Subject: [PATCH 08/10] Add condition to execute the module and update snaps

---
 conf/modules.config                           |  8 ++++++-
 modules/local/validatemodel/main.nf           |  4 ++--
 .../resources/usr/bin/validate_model.R        | 22 ++++++++++++++++---
 nextflow.config                               |  3 +++
 tests/test.nf.test.snap                       | 17 +++++++++-----
 tests/test_maxquant.nf.test.snap              | 17 +++++++++-----
 workflows/differentialabundance.nf            | 14 +++++++-----
 7 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 6a1f1cb3..a0230c78 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -43,7 +43,13 @@ process {
     }
 
     withName: VALIDATE_MODEL {
-        ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' }
+        ext.args = {
+            params.observations_name_col ?
+            "--sample_id_col ${params.observations_name_col}" :
+            params.observations_id_col ?
+            "--sample_id_col ${params.observations_id_col}" :
+            ''
+        }
         publishDir = [
             path: { "${params.outdir}/validate" },
             mode: params.publish_dir_mode,
diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf
index c7962219..afbb664b 100644
--- a/modules/local/validatemodel/main.nf
+++ b/modules/local/validatemodel/main.nf
@@ -8,8 +8,8 @@ process VALIDATE_MODEL {
         'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }"
 
     input:
-    path samplesheet
-    tuple val(meta), path(models_yml)
+    tuple val(meta) , path(samplesheet)
+    tuple val(meta2), path(models_yml)
 
     output:
     tuple val(meta), path("pheno_table.csv"), emit: pheno_table
diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R
index 62e86932..f02df2ae 100755
--- a/modules/local/validatemodel/resources/usr/bin/validate_model.R
+++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R
@@ -49,15 +49,31 @@ sample_column    <- opt$sample_id_col   # "sample" #
 # LOAD FILES --------------------------------------------------------
 ## Load models.yml file
 tryCatch({
-    models <- read_yaml(path_yml)
+    if (!file.exists(path_yml)) {
+        stop(sprintf("File '%s' does not exist.", path_yml), call. = FALSE)
+    }
+    if (!grepl("\\.(yaml|yml)$", path_yml, ignore.case = TRUE)) {
+        stop(sprintf("File '%s' is not a YAML file (expected .yaml or .yml extension).", path_yml), call. = FALSE)
+    }
+    models <- yaml::read_yaml(path_yml)
     cat("Loaded YML file successfully.\n")
 }, error = function(e) {
-    stop("Error loading YML file: ", e$message)
+    stop(sprintf("Error loading YML file '%s': %s", path_yml, e$message), call. = FALSE)
 })
 
 ## Load samplesheet CSV file
 tryCatch({
-    samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE)
+    # Set the separator based on the file extension
+    sep <- if (str_detect(path_samplesheet, "\\.(csv|CSV)$")) {
+        ","
+    } else if (str_detect(path_samplesheet, "\\.(tsv|TSV)$")) {
+        "\t"
+    } else {
+        stop("Unsupported file extension. Please provide a .csv or .tsv file.")
+    }
+
+    # Read the file using the determined separator
+    samplesheet <- read_delim(path_samplesheet, delim = sep, show_col_types = FALSE)
     if ( !sample_column %in% colnames(samplesheet) ) {
         stop(paste0("The sample identificator column '", sample_column, "' is not present in the samplesheet file") )
     }
diff --git a/nextflow.config b/nextflow.config
index 230100ac..2ebb7960 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -377,6 +377,9 @@ set -C # No clobber - prevent output redirection from overwriting files.
 // Disable process selector warnings by default. Use debug profile to enable warnings.
 nextflow.enable.configProcessNamesValidation = false
 
+// Enable loading modules bin files
+nextflow.enable.moduleBinaries = true
+
 def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
 timeline {
     enabled = true
diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap
index 37f61c5e..bc141b95 100644
--- a/tests/test.nf.test.snap
+++ b/tests/test.nf.test.snap
@@ -190,7 +190,7 @@
     },
     "Test profile with yaml contrasts": {
         "content": [
-            21,
+            22,
             {
                 "DESEQ2_DIFFERENTIAL": {
                     "r-base": "4.1.3",
@@ -321,7 +321,11 @@
                 "tables/differential/treatment_mCherry_hND6_sample_number.deseq2.results_filtered.tsv",
                 "tables/processed_abundance",
                 "tables/processed_abundance/all.normalised_counts.tsv",
-                "tables/processed_abundance/all.vst.tsv"
+                "tables/processed_abundance/all.vst.tsv",
+                "validate",
+                "validate/designs.json",
+                "validate/pheno_table.csv",
+                "validate/versions.yml"
             ],
             [
                 "treatment_mCherry_hND6_.dds.rld.rds:md5,dfe51910e230ae9cc2bc9fa5651666ba",
@@ -368,13 +372,16 @@
                 "treatment_mCherry_hND6_sample_number.deseq2.results.tsv:md5,b41cbaacd04a25b45295f3f987ff7500",
                 "treatment_mCherry_hND6_sample_number.deseq2.results_filtered.tsv:md5,98fa3ec0992859d2a524ecce1259f0ed",
                 "all.normalised_counts.tsv:md5,1d7ad0c02b483f2eff1a5b357a74d011",
-                "all.vst.tsv:md5,a08d06d3c3218619b7bd85beead467bd"
+                "all.vst.tsv:md5,a08d06d3c3218619b7bd85beead467bd",
+                "designs.json:md5,088d536fdff430dc12f4f8b0395a9636",
+                "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403",
+                "versions.yml:md5,592a22e359ab942c2d315ef0c6ed3fb1"
             ]
         ],
         "meta": {
-            "nf-test": "0.9.0",
+            "nf-test": "0.9.2",
             "nextflow": "24.10.3"
         },
-        "timestamp": "2024-12-20T13:41:10.4647836"
+        "timestamp": "2024-12-20T19:25:36.541448998"
     }
 }
\ No newline at end of file
diff --git a/tests/test_maxquant.nf.test.snap b/tests/test_maxquant.nf.test.snap
index 619ba2b7..9da3a787 100644
--- a/tests/test_maxquant.nf.test.snap
+++ b/tests/test_maxquant.nf.test.snap
@@ -177,7 +177,7 @@
     },
     "Test maxquant profile with yaml contrasts": {
         "content": [
-            20,
+            21,
             {
                 "LIMMA_DIFFERENTIAL": {
                     "r-base": "4.3.3",
@@ -289,7 +289,11 @@
                 "tables/proteus/Celltype/raw_proteingroups_tab.tsv",
                 "tables/proteus/fakeBatch",
                 "tables/proteus/fakeBatch/normalizeMedian.normalized_proteingroups_tab.tsv",
-                "tables/proteus/fakeBatch/raw_proteingroups_tab.tsv"
+                "tables/proteus/fakeBatch/raw_proteingroups_tab.tsv",
+                "validate",
+                "validate/designs.json",
+                "validate/pheno_table.csv",
+                "validate/versions.yml"
             ],
             [
                 "fakebatch_fakeBatch_b1_b2.MArrayLM.limma.rds:md5,f321bf1e4fd44827810df9a9c38776d6",
@@ -342,13 +346,16 @@
                 "normalizeMedian.normalized_proteingroups_tab.tsv:md5,154fcd8d23409981b897d153e7b7e34e",
                 "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a",
                 "normalizeMedian.normalized_proteingroups_tab.tsv:md5,154fcd8d23409981b897d153e7b7e34e",
-                "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a"
+                "raw_proteingroups_tab.tsv:md5,33a8791f84c676ea1aea4842231acb6a",
+                "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006",
+                "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a",
+                "versions.yml:md5,592a22e359ab942c2d315ef0c6ed3fb1"
             ]
         ],
         "meta": {
-            "nf-test": "0.9.0",
+            "nf-test": "0.9.2",
             "nextflow": "24.10.3"
         },
-        "timestamp": "2024-12-20T13:45:34.62418232"
+        "timestamp": "2024-12-20T20:09:08.805617631"
     }
 }
\ No newline at end of file
diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf
index fad64752..3bad756b 100644
--- a/workflows/differentialabundance.nf
+++ b/workflows/differentialabundance.nf
@@ -150,11 +150,15 @@ workflow DIFFERENTIALABUNDANCE {
     ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]])
 
 
-    // Run module to validate models
-    VALIDATE_MODEL(
-        ch_input,
-        ch_contrasts_file
-    )
+    // Run module to validate models from yml file
+    if ( params.contrasts.endsWith(".yaml") || params.contrasts.endsWith(".yml") ) {
+
+        VALIDATE_MODEL (
+            ch_input,
+            ch_contrasts_file
+        )
+
+    }
 
     // If we have affy array data in the form of CEL files we'll be deriving
     // matrix and annotation from them

From 17e1e378b9012f3bcc7db0127e8939f1d65c0a90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Sat, 21 Dec 2024 15:41:25 +0000
Subject: [PATCH 09/10] address PR comments

---
 .../resources/usr/bin/validate_model.R        | 15 ++--
 .../local/validatemodel/tests/contrasts.yml   | 10 ---
 .../local/validatemodel/tests/main.nf.test    | 36 +++++++++-
 .../validatemodel/tests/main.nf.test.snap     | 71 ++++++++++++++++---
 .../local/validatemodel/tests/nextflow.config |  2 +-
 tests/test.nf.test.snap                       | 11 ++-
 workflows/differentialabundance.nf            |  2 +
 7 files changed, 114 insertions(+), 33 deletions(-)
 delete mode 100644 modules/local/validatemodel/tests/contrasts.yml

diff --git a/modules/local/validatemodel/resources/usr/bin/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R
index f02df2ae..05dbf087 100755
--- a/modules/local/validatemodel/resources/usr/bin/validate_model.R
+++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R
@@ -42,9 +42,9 @@ if (is.null(opt$yml) || is.null(opt$samplesheet) ) {
 }
 
 ### Collect parameters (easier for dev and testing)
-path_yml         <- opt$yml             # "/workspace/differentialabundance/modules/local/validatemodel/tests/contrasts.yml" #
-path_samplesheet <- opt$samplesheet     # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #
-sample_column    <- opt$sample_id_col   # "sample" #
+path_yml         <- opt$yml
+path_samplesheet <- opt$samplesheet
+sample_column    <- opt$sample_id_col
 
 # LOAD FILES --------------------------------------------------------
 ## Load models.yml file
@@ -208,11 +208,11 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col
     continuous <- c()
 
     ## Do not allow special characters
-    undesired_chars <- "[^a-zA-Z0-9_.]"
+    undesired_chars <- regex("[^a-zA-Z0-9_.]")
 
     ## Check samplesheet names for invalid colnames
     df_colnames <- names(samplesheet)
-    true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars))
+    true_columns <- stringr::str_detect(df_colnames, pattern = undesired_chars)
 
     if ( sum(true_columns) > 0 ) {
         errors <- c(errors,
@@ -268,7 +268,7 @@ validate_model <- function(sample_column, variables, samplesheet) { # sample_col
             }
 
             ## Check that the column data does not contain undesired characters
-            if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = regex(undesired_chars) ), na.rm = TRUE) > 0 ) {
+            if ( sum( stringr::str_detect( samplesheet[[ VARIABLE ]], pattern = undesired_chars ), na.rm = TRUE) > 0 ) {
                 errors <- c(errors,
                     paste0("Column ", VARIABLE, " contains undesired characters\n")
                 )
@@ -350,8 +350,6 @@ pheno_table <- phenotable_warnings[[1]]
 ## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED
 check_model_contrasts <- function(contrasts_list, colData) {
 
-    #contrasts_list <- contrasts_list
-    #colData        <- pheno_table
     #######################################################
     ##
     ## Function that takes a list with models + variables, and a phenotypic table
@@ -372,7 +370,6 @@ check_model_contrasts <- function(contrasts_list, colData) {
     design_list <- list()
     ### Iterate over each model
     for (model_name in names(contrasts_list)) {
-        #model_name <- names(contrasts_list[2])
 
         ## Extract model components
         model        <- contrasts_list[[model_name]]
diff --git a/modules/local/validatemodel/tests/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml
deleted file mode 100644
index 5e893629..00000000
--- a/modules/local/validatemodel/tests/contrasts.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-contrasts:
-  - id: "treatment_mCherry_hND6"
-    comparison: ["treatment", "mCherry", "hND6"]
-
-  - id: "treatment_mCherry_hND6_sample_number"
-    comparison: ["treatment", "mCherry", "hND6"]
-    blocking_factors: ["sample_number"]
-
-  - id: "treatment234"
-    comparison: ["treatment", "mCherry", "hND6"]
diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test
index 7bc7f316..e35f5de9 100644
--- a/modules/local/validatemodel/tests/main.nf.test
+++ b/modules/local/validatemodel/tests/main.nf.test
@@ -10,21 +10,22 @@ nextflow_process {
     tag "yml"
     tag "samplesheet"
 
-    test("Samplesheet - YML file - RNASEQ TEST") {
+    test("Samplesheet - YML file - TEST") {
 
         when {
             params {
-                observations_id_col = 'sample'
+                module_args = "--sample_id_col 'sample'"
             }
             process {
                 """
                 // Samplesheet
                 input[0] =
                     Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv', checkIfExists: true)
+                        .map{ [ [id: "test_samplesheet"], it ] }
 
                 // Models_yml sheet
                 input[1] =
-                    Channel.fromPath('${moduleDir}/tests/contrasts.yml', checkIfExists: true)
+                    Channel.fromPath("https://github.com/nf-core/test-datasets/raw/refs/heads/differentialabundance/testdata/SRP254919.contrasts.yaml", checkIfExists: true)
                         .map{ [ [id: "test_yml"], it ] }
                 """
             }
@@ -37,4 +38,33 @@ nextflow_process {
             )
         }
     }
+
+    test("Samplesheet - YML file - MaxQuant TEST") {
+
+        when {
+            params {
+                module_args = "--sample_id_col 'Name'"
+            }
+            process {
+                """
+                // Samplesheet
+                input[0] =
+                    Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv', checkIfExists: true)
+                        .map{ [ [id: "maxquant_samplesheet"], it ] }
+
+                // Models_yml sheet
+                input[1] =
+                    Channel.fromPath("https://github.com/nf-core/test-datasets/raw/refs/heads/differentialabundance/testdata/MaxQuant_contrasts.yaml", checkIfExists: true)
+                        .map{ [ [id: "maxquant_yml"], it ] }
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
 }
diff --git a/modules/local/validatemodel/tests/main.nf.test.snap b/modules/local/validatemodel/tests/main.nf.test.snap
index 24f19d55..24d37912 100644
--- a/modules/local/validatemodel/tests/main.nf.test.snap
+++ b/modules/local/validatemodel/tests/main.nf.test.snap
@@ -1,11 +1,66 @@
 {
-    "Samplesheet - YML file - RNASEQ TEST": {
+    "Samplesheet - YML file - MaxQuant TEST": {
         "content": [
             {
                 "0": [
                     [
                         {
-                            "id": "test_yml"
+                            "id": "maxquant_samplesheet"
+                        },
+                        "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a"
+                    ]
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "maxquant_samplesheet"
+                        },
+                        "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006"
+                    ]
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950"
+                ],
+                "designs": [
+                    [
+                        {
+                            "id": "maxquant_samplesheet"
+                        },
+                        "designs.json:md5,4ef011dc27b01e863483eb06f0bc4006"
+                    ]
+                ],
+                "pheno_table": [
+                    [
+                        {
+                            "id": "maxquant_samplesheet"
+                        },
+                        "pheno_table.csv:md5,e42e42e88a3f71959fd0750dfe93417a"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,bf359bcdd07f0bcc0a2c494ce2823950"
+                ],
+                "warnings": [
+                    
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.3"
+        },
+        "timestamp": "2024-12-21T15:37:37.285885311"
+    },
+    "Samplesheet - YML file - TEST": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test_samplesheet"
                         },
                         "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403"
                     ]
@@ -13,9 +68,9 @@
                 "1": [
                     [
                         {
-                            "id": "test_yml"
+                            "id": "test_samplesheet"
                         },
-                        "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0"
+                        "designs.json:md5,088d536fdff430dc12f4f8b0395a9636"
                     ]
                 ],
                 "2": [
@@ -27,15 +82,15 @@
                 "designs": [
                     [
                         {
-                            "id": "test_yml"
+                            "id": "test_samplesheet"
                         },
-                        "designs.json:md5,eb5b1a01da5d16b578f50d9bd7853ca0"
+                        "designs.json:md5,088d536fdff430dc12f4f8b0395a9636"
                     ]
                 ],
                 "pheno_table": [
                     [
                         {
-                            "id": "test_yml"
+                            "id": "test_samplesheet"
                         },
                         "pheno_table.csv:md5,4969fce19ad1ea5b3cf124f4dcc6a403"
                     ]
@@ -52,6 +107,6 @@
             "nf-test": "0.9.2",
             "nextflow": "24.10.3"
         },
-        "timestamp": "2024-12-19T16:59:52.146249557"
+        "timestamp": "2024-12-21T15:37:15.759211744"
     }
 }
\ No newline at end of file
diff --git a/modules/local/validatemodel/tests/nextflow.config b/modules/local/validatemodel/tests/nextflow.config
index 52a943d5..32115f62 100644
--- a/modules/local/validatemodel/tests/nextflow.config
+++ b/modules/local/validatemodel/tests/nextflow.config
@@ -1,6 +1,6 @@
 nextflow.enable.moduleBinaries = true
 process {
     withName: 'VALIDATE_MODEL' {
-        ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' }
+        ext.args = params.module_args
     }
 }
diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap
index 142e1497..777b30c1 100644
--- a/tests/test.nf.test.snap
+++ b/tests/test.nf.test.snap
@@ -215,6 +215,13 @@
                 "PLOT_EXPLORATORY": {
                     "r-shinyngs": "2.0.0"
                 },
+                "VALIDATE_MODEL": {
+                    "r-base": "4.4.2",
+                    "tidyverse": "2.0.0",
+                    "optparse": "1.7.5",
+                    "yaml": "2.3.10",
+                    "jsonlite": "1.8.9"
+                },
                 "VALIDATOR": {
                     "r-base": "4.3.3",
                     "r-shinyngs": "2.0.0"
@@ -382,6 +389,6 @@
             "nf-test": "0.9.2",
             "nextflow": "24.10.3"
         },
-        "timestamp": "2024-12-20T19:25:36.541448998"
+        "timestamp": "2024-12-21T15:26:26.473702144"
     }
-}
+}
\ No newline at end of file
diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf
index ce527d66..707395dc 100644
--- a/workflows/differentialabundance.nf
+++ b/workflows/differentialabundance.nf
@@ -158,6 +158,8 @@ workflow DIFFERENTIALABUNDANCE {
             ch_contrasts_file
         )
 
+        ch_versions = ch_versions
+            .mix(VALIDATE_MODEL.out.versions)
     }
 
     // If we have affy array data in the form of CEL files we'll be deriving

From 6ed079319d408163cb0ef3e337db6d5b7ec9578c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alan=20M=C3=B6bbs?= <alanmmobbs@gmail.com>
Date: Sat, 21 Dec 2024 16:06:27 +0000
Subject: [PATCH 10/10] update snaps

---
 tests/test_maxquant.nf.test.snap | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test_maxquant.nf.test.snap b/tests/test_maxquant.nf.test.snap
index 9da3a787..3cc0edfa 100644
--- a/tests/test_maxquant.nf.test.snap
+++ b/tests/test_maxquant.nf.test.snap
@@ -196,6 +196,13 @@
                     "r-plotly": "4.10.2",
                     "bioconductor-limma": "3.54.0"
                 },
+                "VALIDATE_MODEL": {
+                    "r-base": "4.4.2",
+                    "tidyverse": "2.0.0",
+                    "optparse": "1.7.5",
+                    "yaml": "2.3.10",
+                    "jsonlite": "1.8.9"
+                },
                 "VALIDATOR": {
                     "r-base": "4.3.3",
                     "r-shinyngs": "2.0.0"
@@ -356,6 +363,6 @@
             "nf-test": "0.9.2",
             "nextflow": "24.10.3"
         },
-        "timestamp": "2024-12-20T20:09:08.805617631"
+        "timestamp": "2024-12-21T15:58:50.7683424"
     }
 }
\ No newline at end of file