From 9e42bdd8af30c12cb3909426fa51e04fcd17f7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alan=20M=C3=B6bbs?= Date: Wed, 18 Dec 2024 11:47:02 +0000 Subject: [PATCH] POC ready --- conf/modules.config | 9 + modules/local/validatemodel/environment.yml | 8 + modules/local/validatemodel/main.nf | 57 +++++ .../resources/usr/bin}/validate_model.R | 213 +++++++++++------- .../tests}/contrasts.yml | 0 .../tests/contrasts_maxquant.yml | 12 + .../local/validatemodel/tests/main.nf.test | 39 ++++ .../local/validatemodel/tests/nextflow.config | 6 + nextflow_schema.json | 2 +- workflows/differentialabundance.nf | 8 + 10 files changed, 270 insertions(+), 84 deletions(-) create mode 100644 modules/local/validatemodel/environment.yml create mode 100644 modules/local/validatemodel/main.nf rename modules/local/{validate_model/input_test => validatemodel/resources/usr/bin}/validate_model.R (60%) mode change 100644 => 100755 rename modules/local/{validate_model/input_test => validatemodel/tests}/contrasts.yml (100%) create mode 100644 modules/local/validatemodel/tests/contrasts_maxquant.yml create mode 100644 modules/local/validatemodel/tests/main.nf.test create mode 100644 modules/local/validatemodel/tests/nextflow.config diff --git a/conf/modules.config b/conf/modules.config index f26c1f9b..34e23407 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -42,6 +42,15 @@ process { ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'" } + withName: VALIDATE_MODEL { + ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' } + publishDir = [ + path: { "${params.outdir}/validate" }, + mode: params.publish_dir_mode, + ] + + } + withName: AFFY_JUSTRMA_RAW { publishDir = [ [ diff --git a/modules/local/validatemodel/environment.yml b/modules/local/validatemodel/environment.yml new file mode 100644 index 00000000..9a8af4a9 --- /dev/null +++ b/modules/local/validatemodel/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-jsonlite + - conda-forge::r-optparse + - conda-forge::r-tidyverse + - conda-forge::r-yaml diff --git a/modules/local/validatemodel/main.nf b/modules/local/validatemodel/main.nf new file mode 100644 index 00000000..b8cf31ce --- /dev/null +++ b/modules/local/validatemodel/main.nf @@ -0,0 +1,57 @@ +process VALIDATE_MODEL { + tag "${models_yml}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f5/f5ef116da0b04fc6359efa48f08b9acfc040a02d7a1ec62476c77b6db0fc1499/data' : + 'community.wave.seqera.io/library/r-jsonlite_r-optparse_r-tidyverse_r-yaml:18dc3fc2d990206d' }" + + input: + tuple path(samplesheet) + tuple val(meta), path(models_yml) + + output: + tuple val(meta), path("pheno_table.csv"), emit: pheno_table + tuple val(meta), path("designs.json") , emit: designs + tuple val(meta), path("warnings.json") , emit: warnings, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + validate_model.R \\ + --yml "${models_yml}" \\ + --samplesheet "${samplesheet}" \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p') + tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))") + yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + END_VERSIONS + """ + + stub: + """ + touch pheno_table.csv + touch designs.json + touch warnings.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + r-base: \$(Rscript -e 'R.Version()\$version.string' | sed -n 's|\\[1\\] "R version \\(.*\\) (.*|\\1|p') + tidyverse: \$(Rscript -e "cat(paste(packageVersion('tidyverse'), collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse'), collapse='.'))") + yaml: \$(Rscript -e "cat(paste(packageVersion('yaml'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + END_VERSIONS + """ + +} diff --git a/modules/local/validate_model/input_test/validate_model.R b/modules/local/validatemodel/resources/usr/bin/validate_model.R old mode 100644 new mode 100755 similarity index 60% rename from modules/local/validate_model/input_test/validate_model.R rename to modules/local/validatemodel/resources/usr/bin/validate_model.R index 90e73df6..bde457b1 --- a/modules/local/validate_model/input_test/validate_model.R +++ b/modules/local/validatemodel/resources/usr/bin/validate_model.R @@ -2,41 +2,52 @@ ## TODO: add make contrast arguments -# Load required libraries +# LOAD LIBRARIES ---------------------------------------------------- suppressWarnings(suppressMessages({ library(tidyverse) library(optparse) library(yaml) - #library(readr) - #library(dplyr) + library(jsonlite) })) -# Define command-line arguments +# PARSE ARGUMENTS --------------------------------------------------- +## Generate list option_list <- list( make_option(c("-y", "--yml"), type = "character", default = NULL, help = "Path to the models.yml file", metavar = "character"), make_option(c("-s", "--samplesheet"), type = "character", default = NULL, help = "Path to the samplesheet CSV file", metavar = "character"), - make_option(c("-i", "--sample"), type = "character", default = "sample", + make_option(c("-i", "--sample_id_col"), type = "character", default = "sample", help = "Column that contains sample identificators", metavar = "character") ) -# Parse command-line arguments -opt_parser <- OptionParser(option_list = option_list) +## Parse command-line arguments +opt_parser <- + OptionParser( + option_list = option_list, + description = "Validate a sample sheet against a YML file for model designs.", + epilogue = + " + The process will look for variables and factors in the YML file and validate that they are present in the sample sheet. + Will also look for unwanted characters within columns and colnames. + Finally, will evaluate all model designs based on the formula and contrasts to find whether the models are full ranked or not. + " + ) opt <- parse_args(opt_parser) -# Validate input arguments -## Required arguments -if (is.null(opt$models) || is.null(opt$samplesheet) ) { - stop("'--yml', '--samplesheet' and '--sample' arguments are required.", call. = FALSE) +## Validate input arguments +### Required arguments +if (is.null(opt$yml) || is.null(opt$samplesheet) ) { + stop("'--yml' and '--samplesheet' arguments are required.", call. = FALSE) } -## Collect parameters -path_yml <- "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" #opt$models -path_samplesheet <- "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" #opt$samplesheet -sample_column <- "sample" #opt$sample +### Collect parameters (easier for dev and testing) +path_yml <- opt$yml # "/workspace/differentialabundance/modules/local/validate_model/input_test/contrasts.yml" # +path_samplesheet <- opt$samplesheet # "/workspace/differentialabundance/results/SRP254919.samplesheet.csv" # +sample_column <- opt$sample_id_col # "sample" # -# Load models.yml file +# LOAD FILES -------------------------------------------------------- +## Load models.yml file tryCatch({ models <- read_yaml(path_yml) cat("Loaded YML file successfully.\n") @@ -44,7 +55,7 @@ tryCatch({ stop("Error loading YML file: ", e$message) }) -# Load samplesheet CSV file +## Load samplesheet CSV file tryCatch({ samplesheet <- readr::read_csv(path_samplesheet, show_col_types = FALSE) if ( !sample_column %in% colnames(samplesheet) ) { @@ -55,47 +66,56 @@ tryCatch({ stop("Error loading samplesheet CSV: ", e$message) }) +# PARSE FACTORS/LEVELS FROM YML FILE -------------------------------- ## Collect all variables and factors from the yml file -var <- list(); contrasts_list <- list(); for (FORMULA in models$models) { +## * var_list: collects all factors levels for each variable. +## * contrast_list: collects info for each contrast - ## Populate list +### Initialize empty lists +var <- list() +contrasts_list <- list() + +## Iterate through models (formula) +for (FORMULA in models$models) { + + ## Iterate through contrasts for each model (formula) for (CONTRAST in FORMULA$contrasts) { - ## Get column name: first comparison's element + ### Get column name: first comparison's element name <- CONTRAST$comparison[1] - ## Get factors + ### Get factors (rest of components) variables <- CONTRAST$comparison[2:length(CONTRAST$comparison)] - ## Populate contrasts_list for model validation - contrasts_list[[ CONTRAST$id ]] <- list( + ### Populate contrasts_list for later model validation + contrasts_list[[ CONTRAST$id ]] <- list( ## Adds the ID as main identifier "formula" = FORMULA$formula, "variable" = name, "contrast" = variables, - "blocking_factors" = CONTRAST$blocking_factors + "blocking_factors" = CONTRAST$blocking_factors ## This list can be later extended for "make_contrast" option is required ) - ## If the column is already present in the list, add more components to it + ### Start populating the variables_list + #### If the column is already present in the list, add more components to it if (name %in% names(var) ) { - variables <- unique( c(var[[ name ]], CONTRAST$comparison[2:3] )) - var[[ name ]] <- variables - cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "") - ## Default to new variable + variables <- unique( c(var[[ name ]], variables )) ## Combine previous levels with the new ones without duplicating them + var[[ name ]] <- variables ## assign the values to the list + #cat("Duplicated variable: ", name, ". Adding more factors to it.\n", sep = "") + #### Default to new element } else { - cat("Detected variable:", name, "\n") - var[[ length(var) + 1 ]] <- variables - names(var)[length(var) ] <- name + #cat("Detected variable:", name, "\n") + var[[ length(var) + 1 ]] <- variables ## Asign levels to a new entry + names(var)[length(var) ] <- name ## Specify the variable name } ## Get blocking factors blocking <- c() - if ( !is.null(CONTRAST$blocking_factors) ) { - cat("Blocking factors detected for variable", name, "\n") + if ( !is.null(CONTRAST$blocking_factors) ) { ## Check if they were defined in the yml + #cat("Blocking factors detected for variable", name, "\n") blocking <- CONTRAST$blocking_factors - cat(paste(blocking, collapse = ""), "\n") } - ## Add blockig variables - if ( "blocking_factors" %in% names(var) ) { + ## Add blocking variables + if ( "blocking_factors" %in% names(var) ) { ## Check if the category exists from previous iterations var[[ "blocking_factors" ]] <- unique( c( var[[ "blocking_factors" ]], blocking )) } else { var[[ "blocking_factors" ]] <- unique( blocking ) @@ -103,18 +123,14 @@ var <- list(); contrasts_list <- list(); for (FORMULA in models$models) { } } -## Print explicit message +## Print explicit messages for (INDEX in 1:length(var)) { - cat("Detected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\n", sep = "") + cat("\033[32mDetected '", names(var)[INDEX], "' variable with ", paste(var[[INDEX]], collapse = " "), " levels.\033[0m \n", sep = "") } - -## Function to validate the samplesheet against the yml file -validate_model <- function(sample_column, variables, samplesheet) { # variables: list; samplesheet: df - - #variables <- var - #samplesheet <- samplesheet - undesired_chars <- "[^a-zA-Z0-9_]" +# VALIDATE SAMPLESHEET BASED ON YML FILE ---------------------------- +## Define function to compare YML and samplesheet +validate_model <- function(sample_column, variables, samplesheet) { # sample_column: string; variables: list; samplesheet: df ####################################################### ## @@ -144,7 +160,10 @@ validate_model <- function(sample_column, variables, samplesheet) { # variables: # Initialize vector to report continuos variables continuous <- c() - ## Check samplesheet names for invalid characters + ## Do not allow special characters + undesired_chars <- "[^a-zA-Z0-9_.]" + + ## Check samplesheet names for invalid colnames df_colnames <- names(samplesheet) true_columns <- stringr::str_detect(df_colnames, pattern = regex(undesired_chars)) @@ -247,57 +266,80 @@ validate_model <- function(sample_column, variables, samplesheet) { # variables: } } + ## Report if any variable was imported as continuous if ( !is.null(continuous)) { warnings <- c(warnings, paste0("The following continuous variables were detected or coerced into numeric: ", paste(continuous, collapse = "" ))) } + ## Report ERRORS and stop if (length(errors) > 0 ) { stop(cat("Some errors where found while validating the samplesheet and models definitions:\n", paste(errors, collapse = "\n"), "\n", sep = "")) } - ## Generate validated samplesheet + ## Generate validated phenotypic table tryCatch({ + ## Get desired columns, starting with the one containing sample identificators selected_columns <- c( sample_column, names(variables)[ names(variables) != "blocking_factors" ], blocking_factors) pheno_table <- samplesheet %>% dplyr::select(all_of(selected_columns)) + }, error = function(e) { - stop("Error generating validated samplesheet: ", e$message) + stop("Error generating validated phenotypic table: ", e$message) }) - return(list( 'pheno_table' = pheno_table, 'errors' = errors, 'warnings' = warnings) ) + ## Return list + return(list( 'pheno_table' = pheno_table, 'warnings' = warnings) ) } ## Collect all outputs from function: errors must be empty, warnings can have messages -phenotable_errors_warnings <- validate_model(sample_column, var, samplesheet) +phenotable_warnings <- validate_model(sample_column, var, samplesheet) ## Cat warnings messages -cat(phenotable_errors_warnings[[ 3 ]], "\n") +cat("\033[1;33m", paste(phenotable_warnings[[ 2 ]], collapse = "\n"), "\033[0m\n") -## Get validated sample sheet -pheno_table <- phenotable_errors_warnings[[ 1 ]] +## Get validated pheno table +pheno_table <- phenotable_warnings[[1]] -## CHECK THAT THE MODELS ARE FULL RANKED +## FUNCTION TO CHECK THAT THE MODELS ARE FULL RANKED check_model_contrasts <- function(contrasts_list, colData) { + + ####################################################### + ## + ## Function that takes a list with models + variables, and a phenotypic table + ## + ## The function evaluates that: + ## * The formulas are usable + ## * The models generated are full ranked + ## + ## Finally, the function returns + ## * An list with all designs + ## * full ranked: true/false + ## + ## If the error vector contains values, the script will end with a non-zero status. + ## + ####################################################### + + ### Initialize list for models descriptors + design_list <- list() + ### Iterate over each model for (model_name in names(contrasts_list)) { - # Extract model components + ## Extract model components model <- contrasts_list[[model_name]] - base_formula <- as.formula(model$formula) # Ensure formula object + base_formula <- as.formula(model$formula) variable <- model$variable contrast <- model$contrast - blocking <- model$blocking_factor # Extract blocking factor + blocking <- model$blocking_factor - # Check if blocking factor is already in the formula + ## Check if blocking factor is already in the formula if (!is.null(blocking)) { - # Get terms from the formula + ## Get terms from the formula formula_terms <- all.vars(base_formula) - # If blocking factor is not already in the formula, add it + ## If blocking factor is not already in the formula, add it if (!(blocking %in% formula_terms)) { - #cat("\nAdding blocking factor:", blocking, "to the model.\n") updated_formula <- as.formula(paste(deparse(base_formula), "+", blocking)) } else { - #cat("\nBlocking factor:", blocking, "is already in the formula. Skipping addition.\n") updated_formula <- base_formula } } else { @@ -308,32 +350,37 @@ check_model_contrasts <- function(contrasts_list, colData) { # Build the design matrix design_matrix <- model.matrix(updated_formula, data = colData) - cat("Design matrix:\n") print(design_matrix) # Check the rank of the design matrix rank <- qr(design_matrix)$rank expected_rank <- ncol(design_matrix) - if (rank == expected_rank) { - cat("The design matrix is full rank.\n") - } else { - cat("WARNING: The design matrix is NOT full rank!\n") - } - - # Check if contrast variable exists in colData - if (variable %in% colnames(colData)) { - factor_levels <- levels(factor(colData[[variable]], levels = contrast)) - if (all(contrast %in% factor_levels)) { - cat("Contrast levels are valid:", paste(contrast, collapse = " vs "), "\n") - } else { - cat("ERROR: Contrast levels", paste(contrast, collapse = " vs "), - "do not exist in", variable, "\n") - } - } else { - cat("ERROR: Contrast variable", variable, "is not in colData.\n") - } + ## Add results to list + design_list[[ length(design_list) + 1 ]] <- list( + "formula" = deparse(updated_formula), + "design_matrix" = design_matrix, + "full_rank" = rank == expected_rank + ) + names(design_list)[ length(design_list) ] <- model_name } + + return(design_list) +} + +design_results <- check_model_contrasts(contrasts_list, pheno_table) + +# EXPORT DATA ------------------------------------------------------- +## Export pheno table +write_csv(pheno_table, "pheno_table.csv") + +## Export designs to JSON +jsonlite::write_json(design_results, path = "designs.json", pretty = TRUE, auto_unbox = TRUE) + +## Export warnings +if ( !is.null(phenotable_warnings[[ 2 ]]) ) { + jsonlite::write_json(phenotable_warnings[[ 2 ]], path = "warnings.json", pretty = TRUE) } -check_model_contrasts(contrasts_list, pheno_table) +## Export RData +save.image("models.RData") diff --git a/modules/local/validate_model/input_test/contrasts.yml b/modules/local/validatemodel/tests/contrasts.yml similarity index 100% rename from modules/local/validate_model/input_test/contrasts.yml rename to modules/local/validatemodel/tests/contrasts.yml diff --git a/modules/local/validatemodel/tests/contrasts_maxquant.yml b/modules/local/validatemodel/tests/contrasts_maxquant.yml new file mode 100644 index 00000000..05ee8e5e --- /dev/null +++ b/modules/local/validatemodel/tests/contrasts_maxquant.yml @@ -0,0 +1,12 @@ +models: + - formula: "~ treatment" + contrasts: + - id: "treatment_mCherry_hND6" + comparison: ["treatment", "mCherry", "hND6"] + + - id: "treatment_mCherry_hND6_sample_number" + comparison: ["treatment", "mCherry", "hND6"] + blocking_factors: ["sample_number"] + + - id: "treatment234" + comparison: ["treatment", "mCherry", "hND6"] diff --git a/modules/local/validatemodel/tests/main.nf.test b/modules/local/validatemodel/tests/main.nf.test new file mode 100644 index 00000000..d3f2101c --- /dev/null +++ b/modules/local/validatemodel/tests/main.nf.test @@ -0,0 +1,39 @@ +nextflow_process { + + name "Test Process VALIDATE_MODEL" + script "../main.nf" + process "VALIDATE_MODEL" + config "./nextflow.config" + + tag "modules" + tag "models" + tag "yml" + tag "samplesheet" + + test("Samplesheet - YML file - RNASEQ TEST") { + + when { + params { + observations_id_col = 'sample' + } + process { + """ + // Samplesheet + input[0] = + Channel.fromPath('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv', checkIfExists: true) + + // Models_yml sheet + input[1] = + Channel.fromPath('${moduleDir}/tests/contrasts.yml', checkIfExists: true) + .map{ [ [id: "test_yml"], it ] } + """ + } + } + + then { + assertAll( + { assert process.success } + ) + } + } +} diff --git a/modules/local/validatemodel/tests/nextflow.config b/modules/local/validatemodel/tests/nextflow.config new file mode 100644 index 00000000..52a943d5 --- /dev/null +++ b/modules/local/validatemodel/tests/nextflow.config @@ -0,0 +1,6 @@ +nextflow.enable.moduleBinaries = true +process { + withName: 'VALIDATE_MODEL' { + ext.args = { params.observations_id_col ? "--sample_id_col ${params.observations_id_col}" : '' } + } +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 70af98c8..6a51601d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,7 +42,7 @@ "type": "string", "description": "A CSV file describing sample contrasts", "help_text": "This file is used to define groups of samples from 'input' to compare. It must contain at least the columns 'variable', 'reference', 'target' and 'blocking', where 'variable' is a column in the input sample sheet, 'reference' and 'target' are values in that column, and blocking is a colon-separated list of additional 'blocking' variables (can be an empty string)", - "pattern": "^\\S+\\.(csv|tsv)$", + "pattern": "^\\S+\\.(csv|tsv|yml|yaml)$", "format": "file-path", "mimetype": "text/csv", "fa_icon": "fas fa-adjust" diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index e49bebda..e70270b1 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -132,6 +132,7 @@ include { PROTEUS_READPROTEINGROUPS as PROTEUS } from '../modules/n include { GEOQUERY_GETGEO } from '../modules/nf-core/geoquery/getgeo/main' include { ZIP as MAKE_REPORT_BUNDLE } from '../modules/nf-core/zip/main' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { VALIDATE_MODEL } from '../modules/local/validatemodel/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -147,6 +148,13 @@ workflow DIFFERENTIALABUNDANCE { // Channel for the contrasts file ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) + + // Run module to validate models + VALIDATE_MODEL( + ch_input, + ch_contrasts_file + ) + // If we have affy array data in the form of CEL files we'll be deriving // matrix and annotation from them