diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e1a5b997..ece2dda8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,7 @@ jobs: - "test" - "test_nogtf" - "test_affy" + - "test_maxquant" - "test_soft" steps: - name: Check out pipeline code diff --git a/CHANGELOG.md b/CHANGELOG.md index 79d0b282..582f2078 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [# 136](https://github.com/nf-core/differentialabundance/pull/136)] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO ([@azedinez](https://github.com/azedinez), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#137](https://github.com/nf-core/differentialabundance/pull/137)] - Add `--sizefactors_from_controls` and `--gene_id_col` for DESeq2 module to modules.config ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#145](https://github.com/nf-core/differentialabundance/pull/145)] - Template update for nf-core/tools v2.9 ([@nf-core-bot](https://github.com/nf-core-bot), review by [@pinin4fjords](https://github.com/pinin4fjords), [@WackerO](https://github.com/WackerO)) +- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - Add Maxquant analysis module ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) ### `Fixed` @@ -28,8 +29,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - [[#159](https://github.com/nf-core/differentialabundance/issues/159)] - CUSTOM/MATRIXFILTER module update ([@WackerO](https://github.com/WackerO), review by [@suzannejin](https://github.com/suzannejin)) -- [[#152](https://github.com/nf-core/differentialabundance/issues/152)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) +- [[#154](https://github.com/nf-core/differentialabundance/issues/154)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#151](https://github.com/nf-core/differentialabundance/issues/151)] - Module update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) +- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - RMARKDOWNNOTEBOOK env update, SHINYNGS and CUSTOM update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) ## v1.2.0 - 2023-04-19 diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 5148b1f3..d6b654b8 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -35,11 +35,18 @@ params: features_metadata_cols: NULL features_gtf_feature_type: NULL features_gtf_table_first_field: NULL + features_log2_assays: NULL raw_matrix: null # e.g. 0_salmon.merged.gene_counts.tsv normalised_matrix: null variance_stabilised_matrix: null # e.g. test_files/3_treatment-WT-P23H.vst.tsv contrasts_file: null # e.g. GSE156533.contrasts.csv differential_table: file.csv + proteus_measurecol_prefix: NULL + proteus_norm_function: NULL + proteus_plotsd_method: NULL + proteus_plotmv_loess: NULL + proteus_palette_name: NULL + proteus_round_digits: NULL affy_cel_files_archive: NULL affy_file_name_col: NULL affy_background: NULL @@ -235,26 +242,27 @@ names(assay_names) = assay_names assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) assay_data <- lapply(assay_files, function(x) { - mat <- read_matrix( - x, - sample_metadata = observations, - row.names = 1 + mat <- na.omit( + read_matrix( + x, + sample_metadata = observations, + row.names = 1 + ) ) colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] - - # Bit hacky, but ensure log - if (max(mat) > 20){ - log2(mat+1) - }else{ - mat - } + mat }) +if (!is.null(params$features_log2_assays)) { + # Remove brackets from assay list. TODO: Remove if this is added to cond_log2_transform_assays + features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', params$features_log2_assays)) + assay_data <- cond_log2_transform_assays(assay_data, features_log2_assays) +} + # Now we can rename the observations rows using the title field rownames(observations) <- observations[[params$observations_name_col]] # Run PCA early so we can understand how important each variable is - pca_datas <- lapply(names(assay_data), function(assay_type){ compilePCAData(assay_data[[assay_type]]) }) @@ -321,7 +329,6 @@ differential_results <- lapply(differential_files, function(diff_file){ } # Annotate differential tables if possible - if (! is.null(params$features)){ diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column) } @@ -594,7 +601,6 @@ for (assay_type in rev(names(assay_data))){ variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - p <- clusteringDendrogram( 2^assay_data[[assay_type]][variable_genes, ], observations[, iv, drop = FALSE], @@ -813,6 +819,11 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ # Methods +```{r, echo=FALSE, results='asis', eval=params$study_type == 'maxquant'} +cat(paste0("\n## Protein abundance import\n")) +make_params_table('importing maxquant output', 'proteus_', remove_pattern = TRUE) +``` + ## Filtering ```{r, echo=FALSE, results='asis'} diff --git a/conf/maxquant.config b/conf/maxquant.config new file mode 100644 index 00000000..c2f081ca --- /dev/null +++ b/conf/maxquant.config @@ -0,0 +1,45 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running MaxQuant proteomics analysis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines settings specific to MaxQuant proteomics analysis + + Use as follows: + nextflow run nf-core/differentialabundance -profile maxquant, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + + config_profile_name = 'MaxQuant profile' + config_profile_description = 'Settings for MaxQuant analysis' + + // Study + study_type = 'maxquant' + study_abundance_type = 'intensities' + + // Features + features_id_col = 'Majority protein IDs' + features_name_col = 'Majority protein IDs' + features_metadata_cols = 'Majority protein IDs' + features_type = 'protein' + + // Exploratory + exploratory_assay_names = "raw,normalised" + exploratory_final_assay = "normalised" + + // Differential options + differential_file_suffix = ".limma.results.tsv" + differential_fc_column = "logFC" + differential_pval_column = "P.Value" + differential_qval_column = "adj.P.Val" + differential_feature_id_column = "probe_id" + differential_feature_name_column = "Majority protein IDs" + + // Proteus options + proteus_measurecol_prefix = 'LFQ intensity ' + + // Shiny does not work for this datatype + shinyngs_build_app = false +} diff --git a/conf/modules.config b/conf/modules.config index d9e3fe38..37798f8d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,6 +100,43 @@ process { ].join(' ').trim() } } + withName: PROTEUS { + publishDir = [ + [ + path: { "${params.outdir}/tables/proteus/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/plots/proteus/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: '*.png' + ], + [ + path: { "${params.outdir}/other/proteus/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: '*.rds' + + ], + [ + path: { "${params.outdir}/other/proteus/" }, + mode: params.publish_dir_mode, + pattern: '*sessionInfo.log' + ] + ] + ext.args = { [ + "--contrast_variable \"${meta.id}\"", + "--sample_id_col \"${params.observations_id_col}\"", + "--protein_id_col \"${params.features_id_col}\"", + "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"", + "--norm_function $params.proteus_norm_function", + "--plotsd_method $params.proteus_plotsd_method", + "--plotmv_loess $params.proteus_plotmv_loess", + "--palette_name $params.proteus_palette_name", + "--round_digits $params.proteus_round_digits" + ].join(' ').trim() } + } + withName: GEOQUERY_GETGEO { publishDir = [ [ @@ -291,7 +328,8 @@ process { "--assay_names \"${params.exploratory_assay_names}\"", "--final_assay \"${params.exploratory_final_assay}\"", "--outlier_mad_threshold ${params.exploratory_mad_threshold}", - "--palette_name \"${params.exploratory_palette_name}\"" + "--palette_name \"${params.exploratory_palette_name}\"", + ( (params.study_type == 'maxquant') ? "--log2_assays ''" : (((params.features_log2_assays == null) ? '' : "--log2_assays \"$params.features_log2_assays\"".replace('[', '').replace(']', ''))) ) ].join(' ').trim() } } @@ -352,8 +390,8 @@ process { } withName: RMARKDOWNNOTEBOOK { - conda = "bioconda::r-shinyngs=1.8.1" - container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" } + conda = "bioconda::r-shinyngs=1.8.2" + container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" } publishDir = [ path: { "${params.outdir}/report" }, mode: params.publish_dir_mode, diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config new file mode 100644 index 00000000..89d31cca --- /dev/null +++ b/conf/test_maxquant.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple + pipeline test with MaxQuant Mass-spec data. + + Use as follows: + nextflow run nf-core/differentialabundance -profile test_maxquant, --outdir + +---------------------------------------------------------------------------------------- +*/ + +includeConfig 'maxquant.config' + +params { + study_name = 'PXD043349' + config_profile_name = 'MaxQuant test profile' + config_profile_description = 'MaxQuant test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv' + matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' + contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' + + // Observations + observations_id_col = 'Experiment' + observations_name_col = 'Name' + + // Exploratory + exploratory_main_variable = 'Celltype' +} diff --git a/docs/output.md b/docs/output.md index c8e4275e..f256c505 100644 --- a/docs/output.md +++ b/docs/output.md @@ -37,6 +37,11 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i - `[contrast]/png/volcano.png`: Volcano plots of -log(10) p value agains log(2) fold changes - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast. - `[contrast]/png/[gsea_plot_type].png` + - `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). + - `[contrast]/[norm_function].normalized_dendrogram.png`: A sample clustering dendrogram after normalization. + - `[contrast]/[norm_function].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level. + - `[contrast]/[norm_function].normalized_distributions.png`: A plot of sample distributions after normalization. + - `[contrast]/raw_distributions.png`: A plot of sample distributions without normalization. @@ -61,6 +66,9 @@ Most plots are included in the HTML report (see above), but are also included in - `OR [contrast_name].limma.results.tsv`: Results of Limma differential analyis (Affymetrix arrays) - `gsea/`: Directory containing tables of differential gene set analyis from GSEA (where enabled) - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast + - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). + - `[contrast]/[norm_function].normalized_proteingroups_tab.tsv`: Abundance table after normalization. + - `[contrast]/raw_proteingroups_tab.tsv`: Abundance table without normalization. diff --git a/docs/usage.md b/docs/usage.md index c9859472..dcdcd7e4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -11,10 +11,10 @@ Differential analysis is a common task in a variety of use cases. In essence, al With the above in mind, running this workflow requires: - a set of abundance values. This can be: - - (for RNA-seq): a matrix of quantifications with observations by column and features by row + - (for RNA-seq or MaxQuant proteomics measurements): a matrix of quantifications with observations by column and features by row - (for Affymetrix microarrays): a tar'd archive of CEL files - a description of the observations such as a sample sheet from RNA-seq analysis -- a description of the features, for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table. +- a description of the features, for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. Skip for MaxQuant. You can also supply your own table. - a specification of how the matrix should be split, and how the resulting groups should be compared ## Observations (samplesheet) input @@ -49,6 +49,14 @@ The file can be tab or comma separated. This is a numeric square matrix file, comma or tab-separated, with a column for every observation, and features corresponding to the supplied feature set. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. +### MaxQuant intensities + +```bash +--matrix '[path to matrix file]' +``` + +This is the proteinGroups.txt file produced by MaxQuant. It is a tab-separated matrix file with a column for every observation (plus additional columns for other types of measurements and information); each row contains these data for a set of proteins. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. The parameter `--proteus_measurecol_prefix` defines which prefix is used to extract those matrix columns which contain the measurements to be used. For example, the default `LFQ intensity ` will indicate that columns like LFQ intensity S1, LFQ intensity S2, LFQ intensity S3 etc. are used (do not forget trailing whitespace in this parameter, if required!). + ### Affymetrix microarrays ```bash @@ -109,7 +117,7 @@ The file can be tab or comma separated. --gtf '[path to gtf file]' ``` -This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix. +This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix. Skip for MaxQuant. ### Annotation package identifiers for Affymetrix arrays @@ -123,7 +131,7 @@ To override the above options, you may also supply your own features table as a --features '[path to features TSV]' ``` -By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. +By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. Please use this option for MaxQuant analysis, i.e. do not provide features. ## Shiny app generation @@ -197,7 +205,7 @@ The typical command for running the pipeline is as follows: ```bash nextflow run nf-core/differentialabundance \ - [--profile rnaseq OR -profile affy] \ + [-profile rnaseq OR -profile affy] \ --input samplesheet.csv \ --contrasts contrasts.csv \ [--matrix assay_matrix.tsv OR --affy_cel_files_archive cel_files.tar] \ diff --git a/modules.json b/modules.json index 2a3939fa..dcf9482f 100644 --- a/modules.json +++ b/modules.json @@ -17,7 +17,7 @@ }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", "installed_by": ["modules"] }, "custom/matrixfilter": { @@ -60,6 +60,11 @@ "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", "installed_by": ["modules"] }, + "proteus/readproteingroups": { + "branch": "master", + "git_sha": "685765c4a5e3423d20f74aa9c4405ef0b8c4748d", + "installed_by": ["modules"] + }, "rmarkdownnotebook": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", @@ -72,17 +77,17 @@ }, "shinyngs/staticdifferential": { "branch": "master", - "git_sha": "707c31e838cb77198e30d8eeb138728ce09a4dd2", + "git_sha": "022afb76b0fc7e304b0061a648f8f6cef03bba95", "installed_by": ["modules"] }, "shinyngs/staticexploratory": { "branch": "master", - "git_sha": "707c31e838cb77198e30d8eeb138728ce09a4dd2", + "git_sha": "022afb76b0fc7e304b0061a648f8f6cef03bba95", "installed_by": ["modules"] }, "shinyngs/validatefomcomponents": { "branch": "master", - "git_sha": "707c31e838cb77198e30d8eeb138728ce09a4dd2", + "git_sha": "022afb76b0fc7e304b0061a648f8f6cef03bba95", "installed_by": ["modules"] }, "untar": { diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc87273..c9d014b1 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/proteus/readproteingroups/main.nf b/modules/nf-core/proteus/readproteingroups/main.nf new file mode 100644 index 00000000..37126cfe --- /dev/null +++ b/modules/nf-core/proteus/readproteingroups/main.nf @@ -0,0 +1,30 @@ +process PROTEUS_READPROTEINGROUPS { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::r-base=4.2.1 bioconda::r-proteus-bartongroup=0.2.16 conda-forge::r-plotly=4.10.2 bioconda::bioconductor-limma=3.54.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0': + 'biocontainers/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0' }" + + input: + tuple val(meta), path(samplesheet), path(intensities) + + output: + tuple val(meta), path("*dendrogram.png") , emit: dendro_plot + tuple val(meta), path("*mean_variance_relationship.png") , emit: mean_var_plot + tuple val(meta), path("*raw_distributions.png") , emit: raw_dist_plot + tuple val(meta), path("*normalized_distributions.png") , emit: norm_dist_plot + tuple val(meta), path("*raw_proteingroups.rds") , emit: raw_rdata + tuple val(meta), path("*normalized_proteingroups.rds") , emit: norm_rdata + tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: raw_tab + tuple val(meta), path("*normalized_proteingroups_tab.tsv") , emit: norm_tab + tuple val(meta), path("*R_sessionInfo.log") , emit: session_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'proteus_readproteingroups.R' +} diff --git a/modules/nf-core/proteus/readproteingroups/meta.yml b/modules/nf-core/proteus/readproteingroups/meta.yml new file mode 100644 index 00000000..bed3dc68 --- /dev/null +++ b/modules/nf-core/proteus/readproteingroups/meta.yml @@ -0,0 +1,81 @@ +name: "proteus_readproteingroups" +description: reads a maxQuant proteinGroups file with Proteus +keywords: + - proteomics + - proteus + - readproteingroups +tools: + - "proteus": + description: "R package for analysing proteomics data" + homepage: "https://github.com/bartongroup/Proteus" + documentation: "https://rdrr.io/github/bartongroup/Proteus/" + tool_dev_url: "https://github.com/bartongroup/Proteus" + doi: "10.1101/416511" + licence: "['GPL v2']" + +input: + - meta: + type: map + description: | + Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', control:'saline', blocking:'' ] + - samplesheet: + type: file + description: | + CSV or TSV format sample sheet with sample metadata; check here for specifications: https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html + - intensities: + type: file + description: | + proteinGroups TXT file with protein intensities information from maxQuant; check here for specifications: https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html + - meta2: + type: map + description: | + Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', control:'saline', blocking:'' ] + - contrast_variable: + type: string + description: | + The column in the sample sheet that should be used to define groups for comparison + +output: + - dendro_plot: + type: file + description: | + PNG file; dendrogram of the normalized samples hierarchically clustered by their intensities + - mean_var_plot: + type: file + description: | + PNG file; plot of the log-intensity variance vs log-intensity mean of each condition in the normalized samples + - raw_dist_plot: + type: file + description: | + PNG file; plot of the intensity/ratio distributions of the raw samples + - norm_dist_plot: + type: file + description: | + PNG file; plot of the intensity/ratio distributions of the normalized samples + - raw_rdata: + type: file + description: | + RDS file of a proteinGroups object from Proteus, contains raw protein intensities and additional info + - norm_rdata: + type: file + description: | + RDS file of a proteinGroups object from Proteus, contains normalized protein intensities and additional info + - raw_tab: + type: file + description: | + TSV-format intensities table from Proteus, contains raw protein intensities + - norm_tab: + type: file + description: | + TSV-format intensities table from Proteus, contains normalized protein intensities + - session_info: + type: file + description: | + LOG file of the R sessionInfo from the module run + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@WackerO" diff --git a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R new file mode 100644 index 00000000..5806971d --- /dev/null +++ b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R @@ -0,0 +1,382 @@ +#!/usr/bin/env Rscript + +# Written by Oskar Wacker (https://github.com/WackerO) in +# collaboration with Stefan Czemmel (https://github.com/qbicStefanC) +# Script template by Jonathan Manning (https://github.com/pinin4fjords) + +# MIT License + +# Copyright (c) QBiC + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x) { + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z) { length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Passed to read.delim() +#' @param row.names Passed to read.delim() +#' +#' @return output Data frame + +read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.names = F) { + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) +} + +#' Round numeric dataframe columns to fixed decimal places by applying +#' formatting and converting back to numerics +#' +#' @param dataframe A data frame +#' @param columns Which columns to round (assumes all of them by default) +#' @param digits How many decimal places to round to? If -1, will return the unchanged input df +#' +#' @return output Data frame +round_dataframe_columns <- function(df, columns = NULL, digits = -1) { + if (digits == -1) { + return(df) # if -1, return df without rounding + } + + df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up + if (is.null(columns)) { + columns <- colnames(df) + } + df[,columns] <- round( + data.frame(df[, columns], check.names = FALSE), + digits = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} + +################################################ +################################################ +## PARSE PARAMETERS FROM NEXTFLOW ## +################################################ +################################################ + +# I've defined these in a single array like this so that we could go back to an +# optparse-driven method in future with module bin/ directories, rather than +# the template + +# Set defaults and classes + +opt <- list( + intensities_file = '$intensities', + sample_file = '$samplesheet', + contrast_variable = NULL, + protein_id_col = 'Majority protein IDs', + sample_id_col = 'sample', + measure_col_prefix = 'intensities', + norm_function = 'normalizeMedian', + plotsd_method = 'violin', + plotmv_loess = T, + palette_name = 'Set1', + round_digits = -1 +) +opt_types <- lapply(opt, class) + +# Apply parameter overrides + +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)) { + if (! ao %in% names(opt)) { + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])) { + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided + +required_opts <- c('contrast_variable') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] + +if (length(missing) > 0) { + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + +# Check file inputs are valid + +for (file_input in c('intensities_file', 'sample_file')) { + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + + if (! file.exists(opt[[file_input]])) { + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(limma) +library(plotly) +library(proteus) + +################################################ +################################################ +# READ IN INTENSITIES FILE AND SAMPLE METADATA # +################################################ +################################################ + +intensities.table <- + read_delim_flexible( + file = opt\$intensities_file, + check.names = FALSE + ) + +sample.sheet <- + read_delim_flexible( + file = opt\$sample_file, + check.names=FALSE + ) + +if (! opt\$protein_id_col %in% colnames(intensities.table)) { + stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the intensities table")) +} + +if (! opt\$sample_id_col %in% colnames(sample.sheet)) { + stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet")) +} + +# Add metadata columns that are necessary for proteus + +sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] +sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] + +# Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet + +measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) + +# Check that all samples specified in the input sheet are present in the intensities table + +missing_columns <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) +missing_columns <- missing_columns[!missing_columns %in% colnames(intensities.table)] +if (length(missing_columns) > 0) { + stop(paste( + length(missing_columns), + 'specified samples do not have a(n)', + opt\$measure_col_prefix, + 'column in intensities table. The following columns are missing:', + paste(missing_columns, collapse = ', ') + )) +} + +################################################ +################################################ +## Run Proteus processes and generate outputs ## +################################################ +################################################ + +# Replace proteus default ID column with user param and re-set the names of the resulting object (gsub sets the names to NULL) + +proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) +proteinGroups <- readProteinGroups( + file=opt\$intensities_file, + meta=sample.sheet, + measure.cols=measure.cols, + data.cols=proteinColumns +) + +# Define valid normalization functions + +valid_norm_functions <- list("normalizeMedian", "normalizeQuantiles") + +# Generate plots for requested normalization; also, save normalized protein groups for limma + +if (! (opt\$norm_function %in% valid_norm_functions)) { + stop(paste0("Invalid norm_function argument: ", opt\$norm_function, + ". Valid norm_functions are: ", paste(valid_norm_functions, collapse=", "), ".")) +} + +proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=opt\$norm_function))) # Proteus also accepts other norm.funs, e.g. from limma +proteinGroups.normalized\$tab <- log2(proteinGroups.normalized\$tab) + +png(paste(opt\$norm_function, 'normalized_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", opt\$norm_function, " in contrast ", opt\$contrast_variable), fill="condition", method=opt\$plotsd_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() + +png(paste(opt\$norm_function, 'normalized_mean_variance_relationship.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotMV(proteinGroups.normalized, with.loess=opt\$plotmv_loess) + + ggtitle(paste0("Sample mean variance relationship after applying\n", opt\$norm_function, " in contrast ", opt\$contrast_variable)) + + scale_fill_distiller(palette=opt\$palette_name) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() + +png(paste(opt\$norm_function, 'normalized_dendrogram.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotClustering(proteinGroups.normalized) + + ggtitle(paste0("Sample clustering after applying\n", opt\$norm_function, " in contrast ", opt\$contrast_variable)) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() + +# R object for other processes to use + +saveRDS(proteinGroups.normalized, file = paste(opt\$norm_function, 'normalized_proteingroups.rds', sep='.')) + +# Write normalized intensities matrix + +out_df <- data.frame( + round_dataframe_columns(proteinGroups.normalized\$tab, digits=opt\$round_digits), + check.names = FALSE +) +out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; save these to a separate column +out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position +write.table( + out_df, + file = paste(opt\$norm_function, 'normalized_proteingroups_tab', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) + +# Process and save raw table + +proteinGroups\$tab <- log2(proteinGroups\$tab) + +# Generate raw distribution plot + +png('raw_distributions.png', width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotSampleDistributions(proteinGroups, title=paste("Raw sample distributions in contrast", opt\$contrast_variable), fill="condition", method=opt\$plotsd_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() + +# R object for other processes to use + +saveRDS(proteinGroups, file = 'raw_proteingroups.rds') + + +# Write raw intensities matrix + +out_df <- data.frame( + round_dataframe_columns(proteinGroups\$tab, digits=opt\$round_digits), + check.names = FALSE + ) +out_df[[opt\$protein_id_col]] <- rownames(proteinGroups\$tab) # proteus saves the IDs as rownames; save these to a separate column +out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position + + +write.table( + out_df, + file = 'raw_proteingroups_tab.tsv', + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink("R_sessionInfo.log") +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +limma.version <- as.character(packageVersion('limma')) +plotly.version <- as.character(packageVersion('plotly')) +proteus.version <- as.character(packageVersion('proteus')) +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' r-proteus-bartongroup:', proteus.version), + paste(' r-plotly:', plotly.version), + paste(' bioconductor-limma:', limma.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/nf-core/shinyngs/staticdifferential/main.nf b/modules/nf-core/shinyngs/staticdifferential/main.nf index cdc40d02..d2bbbc8c 100644 --- a/modules/nf-core/shinyngs/staticdifferential/main.nf +++ b/modules/nf-core/shinyngs/staticdifferential/main.nf @@ -2,10 +2,10 @@ process SHINYNGS_STATICDIFFERENTIAL { tag "$meta.id" label 'process_single' - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(differential_result) // Differential info: contrast and differential stats diff --git a/modules/nf-core/shinyngs/staticexploratory/main.nf b/modules/nf-core/shinyngs/staticexploratory/main.nf index 2c351949..851325bf 100644 --- a/modules/nf-core/shinyngs/staticexploratory/main.nf +++ b/modules/nf-core/shinyngs/staticexploratory/main.nf @@ -2,10 +2,10 @@ process SHINYNGS_STATICEXPLORATORY { tag "$meta.id" label 'process_single' - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(sample), path(feature_meta), path(assay_files) diff --git a/modules/nf-core/shinyngs/validatefomcomponents/main.nf b/modules/nf-core/shinyngs/validatefomcomponents/main.nf index 98fb49e1..bdfb19af 100644 --- a/modules/nf-core/shinyngs/validatefomcomponents/main.nf +++ b/modules/nf-core/shinyngs/validatefomcomponents/main.nf @@ -2,10 +2,10 @@ process SHINYNGS_VALIDATEFOMCOMPONENTS { tag "$sample" label 'process_single' - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(sample), path(assay_files) diff --git a/nextflow.config b/nextflow.config index 705fee6c..0e07ee51 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,6 +40,7 @@ params { features_id_col = 'gene_id' features_name_col = 'gene_name' features_metadata_cols = 'gene_id,gene_name,gene_biotype' + features_log2_assays = null // GTF parsing options features_gtf_feature_type = 'transcript' @@ -49,7 +50,7 @@ params { affy_cel_files_archive = null affy_file_name_col = 'file' affy_background = true - affy_bgversion = 2 + affy_bgversion = 2 affy_destructive = false affy_cdfname = null affy_rm_mask = false @@ -57,6 +58,14 @@ params { affy_rm_extra = false affy_build_annotation = true + // Proteus-specific options + proteus_measurecol_prefix = 'LFQ intensity ' + proteus_norm_function = 'normalizeMedian' + proteus_plotsd_method = 'violin' + proteus_plotmv_loess = true + proteus_palette_name = 'Set1' + proteus_round_digits = -1 + // Filtering options filtering_min_samples = 1 filtering_min_abundance = 1 @@ -313,6 +322,7 @@ profiles { rnaseq { includeConfig 'conf/rnaseq.config' } soft {includeConfig 'conf/soft.config'} test_affy { includeConfig 'conf/test_affy.config' } + test_maxquant { includeConfig 'conf/test_maxquant.config' } test_soft {includeConfig 'conf/test_soft.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index 5f5915b6..e50f8e88 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,7 +24,7 @@ "default": "rnaseq", "description": "A string identifying the technology used to produce the data", "help_text": "Currently 'rnaseq' or 'affy_array' may be specified.", - "enum": ["rnaseq", "affy_array", "geo_soft_file"], + "enum": ["rnaseq", "affy_array", "maxquant", "geo_soft_file"], "fa_icon": "far fa-keyboard" }, "input": { @@ -185,6 +185,11 @@ "default": "gene_id", "description": "Where a GTF file is supplied, which field should go first in the converted output table", "fa_icon": "fas fa-fast-backward" + }, + "features_log2_assays": { + "type": "string", + "description": "Of which assays to compute the log2. Not necessary for maxquant data as this is controlled by the pipeline.", + "help_text": "Either comma-separated of assay positions, e.g. '[1,2,3]', or empty list '[]' to not log any assay. If not set, will guess which assays need to be logged (those with a maximum > 20)." } }, "required": ["features_id_col", "features_name_col", "features_type"], @@ -250,6 +255,50 @@ }, "fa_icon": "fas fa-table" }, + "proteus_input_options": { + "title": "Proteus input options", + "type": "object", + "description": "Options for processing of proteomics MaxQuant tables with the Proteus R package", + "default": "", + "properties": { + "proteus_measurecol_prefix": { + "type": "string", + "default": "LFQ intensity ", + "description": "Prefix of the column names of the MaxQuant proteingroups table in which the intensity values are saved; the prefix has to be followed by the sample names that are also found in the samplesheet. Default: 'LFQ intensity '; take care to also consider trailing whitespace between prefix and samplenames." + }, + "proteus_norm_function": { + "type": "string", + "default": "normalizeMedian", + "description": "Normalization function to use on the MaxQuant intensities.", + "help_text": "'normalizeMedian' or 'normalizeQuantiles'", + "enum": ["normalizeMedian", "normalizeQuantiles"] + }, + "proteus_plotsd_method": { + "type": "string", + "default": "violin", + "description": "Which method to use for plotting sample distributions of the MaxQuant intensities; one of 'violin', 'dist', 'box'.", + "help_text": "'violin', 'dist' or 'box'", + "enum": ["violin", "dist", "box"] + }, + "proteus_plotmv_loess": { + "type": "boolean", + "default": true, + "description": "Should a loess line be added to the plot of mean-variance relationship of the conditions? Default: true." + }, + "proteus_palette_name": { + "type": "string", + "default": "Set1", + "help_text": "Check the content of `RColorBrewer::brewer.pal.info` from an R terminal for valid palette names.", + "description": "Valid R palette name", + "fa_icon": "fas fa-palette" + }, + "proteus_round_digits": { + "type": "number", + "default": -1, + "description": "Number of decimals to round the MaxQuant intensities to; default: -1 (will not round)." + } + } + }, "filtering": { "title": "Filtering", "type": "object", @@ -1120,6 +1169,9 @@ { "$ref": "#/definitions/affy_input_options" }, + { + "$ref": "#/definitions/proteus_input_options" + }, { "$ref": "#/definitions/filtering" }, diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 90a57398..5f2ed24a 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -28,6 +28,23 @@ if (params.study_type == 'affy_array'){ } else { error("CEL files archive not specified!") } +} else if (params.study_type == 'maxquant') { + + // Should the user have enabled --shinyngs_build_app and/or --gsea_run, throw an error + if (params.shinyngs_build_app) { + // This can be removed once shinyngs has an inbuilt NA handler + error("Cannot currently build shinyngs app for maxquant data due to data sparsity; please set --shinyngs_build_app to false.") + } + if (params.gsea_run) { + error("Cannot run GSEA for maxquant data; please set --gsea_run to false.") + } + if (!params.matrix) { + error("Input matrix not specified!") + } + matrix_file = file(params.matrix, checkIfExists: true) + + // Make channel for proteus + proteus_in = Channel.of([ file(params.input), matrix_file ]) } else if (params.study_type == 'geo_soft_file'){ // To pull SOFT files from a GEO a GSE study identifer must be provided @@ -38,7 +55,7 @@ if (params.study_type == 'affy_array'){ error("Query GSE not specified or features metadata columns not specified") } } else { - // If this is not microarray data, and this an RNA-seq dataset, + // If this is not microarray data or maxquant output, and this an RNA-seq dataset, // then assume we're reading from a matrix if (params.study_type == "rnaseq" && params.matrix) { @@ -108,7 +125,9 @@ include { CUSTOM_TABULARTOGSEACLS } from '../modules/n include { RMARKDOWNNOTEBOOK } from '../modules/nf-core/rmarkdownnotebook/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_RAW } from '../modules/nf-core/affy/justrma/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_NORM } from '../modules/nf-core/affy/justrma/main' +include { PROTEUS_READPROTEINGROUPS as PROTEUS } from '../modules/nf-core/proteus/readproteingroups/main' include { GEOQUERY_GETGEO } from '../modules/nf-core/geoquery/getgeo/main' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -122,6 +141,8 @@ workflow DIFFERENTIALABUNDANCE { // Set up some basic variables ch_versions = Channel.empty() + // Channel for the contrasts file + ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) // If we have affy array data in the form of CEL files we'll be deriving // matrix and annotation from them @@ -157,8 +178,34 @@ workflow DIFFERENTIALABUNDANCE { ch_versions = ch_versions .mix(AFFY_JUSTRMA_RAW.out.versions) - } - else if(params.study_type == 'geo_soft_file'){ + } else if (params.study_type == 'maxquant'){ + + // We'll be running Proteus once per unique contrast variable to generate plots + // TODO: there should probably be a separate plotting module in proteus to simplify this + + ch_contrast_variables = ch_contrasts_file + .splitCsv ( header:true, sep:(params.contrasts.endsWith('tsv') ? '\t' : ',')) + .map{ it.tail().first() } + .map{ + tuple('id': it.variable) + } + .unique() // uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.) + + // Run proteus to import protein abundances + PROTEUS( + ch_contrast_variables.combine(proteus_in) + ) + + // Re-map the proteus output tables to the study ID as the tables are the same across contrasts, only one norm table will be necessary + ch_in_raw = PROTEUS.out.raw_tab + .first() + .map{ meta, matrix -> tuple(exp_meta, matrix) } + ch_in_norm = PROTEUS.out.norm_tab + .first() + .map{ meta, matrix -> tuple(exp_meta, matrix) } + + ch_versions = ch_versions.mix(PROTEUS.out.versions) + } else if(params.study_type == 'geo_soft_file'){ GEOQUERY_GETGEO(ch_querygse) ch_in_norm = GEOQUERY_GETGEO.out.expression @@ -204,19 +251,23 @@ workflow DIFFERENTIALABUNDANCE { } else{ - // Otherwise we can just use the matrix input - matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" - matrix_file.copyTo(matrix_as_anno_filename) - ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + // Otherwise we can just use the matrix input; save it to the workdir so that it does not + // just appear wherever the user runs the pipeline + matrix_as_anno_filename = "${workflow.workDir}/matrix_as_anno.${matrix_file.getExtension()}" + if (params.study_type == 'maxquant'){ + ch_features_matrix = ch_in_norm + } else { + ch_features_matrix = ch_in_raw + } + ch_features = ch_features_matrix + .map{ meta, matrix -> + matrix.copyTo(matrix_as_anno_filename) + [ meta, file(matrix_as_anno_filename) ] + } } - // Channel for the contrasts file - - ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) - // Check compatibility of FOM elements and contrasts - - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ ch_matrices_for_validation = ch_in_raw .join(ch_in_norm) .map{tuple(it[0], [it[1], it[2]])} @@ -237,12 +288,12 @@ workflow DIFFERENTIALABUNDANCE { // For Affy, we've validated multiple input matrices for raw and norm, // we'll separate them out again here - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ ch_validated_assays = VALIDATOR.out.assays .transpose() .branch { raw: it[1].name.contains('raw') - normalised: it[1].name.contains('normalised') + normalised: it[1].name =~ /normali[sz]ed/ } ch_raw = ch_validated_assays.raw ch_norm = ch_validated_assays.normalised @@ -287,7 +338,7 @@ workflow DIFFERENTIALABUNDANCE { .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() - if (params.study_type == 'affy_array' || params.study_type == 'geo_soft_file'){ + if (params.study_type == 'affy_array' || params.study_type == 'geo_soft_file' || params.study_type == 'maxquant'){ LIMMA_DIFFERENTIAL ( ch_contrasts, @@ -515,6 +566,9 @@ workflow DIFFERENTIALABUNDANCE { if (params.study_type == 'affy_array' || params.study_type == 'geo_soft_file'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } + if (params.study_type == 'maxquant'){ + params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|proteus|affy|limma|gsea).*/ + } ch_report_params = ch_report_input_files .map{