Merge branch 'devel' into feature_pseudobulk_plot_ALB

bcbio · Feb 3, 2025 · b108c1a · b108c1a
2 parents 48a0293 + 7ce927d
commit b108c1a
Show file tree

Hide file tree

Showing 31 changed files with 1,861 additions and 117 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main, master, devel]
   pull_request:
-    branches: [main, master]
+    branches: [main, master, devel]
 
 name: R-CMD-check
 

diff --git a/inst/templates/base/reports/example.Rmd b/inst/templates/base/reports/example.Rmd
@@ -14,8 +14,6 @@ output:
       toc_float:
          collapsed: true
          smooth_scroll: true
-editor_options: 
-  chunk_output_type: console
 params:
   project_file: ../information.R
 ---

diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd
@@ -14,8 +14,6 @@ output:
       toc_float:
          collapsed: true
          smooth_scroll: true
-editor_options: 
-  chunk_output_type: console
 params:
   # Fill this file with the right paths to nfcore output
   # params_file: params_qc_nf-core-example.R # example data

diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd
@@ -14,8 +14,6 @@ output:
       toc_float:
          collapsed: true
          smooth_scroll: true
-editor_options: 
-  chunk_output_type: console
 params:
   # Fill this file with the right paths to nfcore output
   # .qs file name for saving DiffBind Counts object

diff --git a/inst/templates/methylation/QC/QC.Rmd b/inst/templates/methylation/QC/QC.Rmd
@@ -14,8 +14,6 @@ output:
       toc_float:
          collapsed: true
          smooth_scroll: true
-editor_options: 
-  chunk_output_type: console
 params:
   params_file: ../information.R
   meta_fn: ../meta/methylation_mucci_hbc04926.csv

diff --git a/inst/templates/rnaseq/00_libs/FA.R b/inst/templates/rnaseq/00_libs/FA.R
@@ -3,19 +3,18 @@ library(clusterProfiler)
 source <- "https://github.com/bcbio/resources/raw/refs/heads/main/gene_sets/gene_sets/20240904"
 get_databases_v2=function(sps="human"){
   gmt.files=list(human=c("h.all.v2024.1.Hs.entrez.gmt",
-                         "c5.go.v2024.1.Hs.entrez.gmt",
+                         #"c5.go.v2024.1.Hs.entrez.gmt",
                          "c5.go.mf.v2024.1.Hs.entrez.gmt",
                          "c5.go.cc.v2024.1.Hs.entrez.gmt",
                          "c5.go.bp.v2024.1.Hs.entrez.gmt",
                          "c2.cp.reactome.v2024.1.Hs.entrez.gmt",
                          "c2.cp.kegg_legacy.v2024.1.Hs.entrez.gmt"),
                  mouse=c("mh.all.v2024.1.Mm.entrez.gmt",
-                         "m5.go.v2024.1.Mm.entrez.gmt",
+                         #"m5.go.v2024.1.Mm.entrez.gmt",
                          "m5.go.mf.v2024.1.Mm.entrez.gmt",
                          "m5.go.cc.v2024.1.Mm.entrez.gmt",
                          "m5.go.bp.v2024.1.Mm.entrez.gmt",
-                         "m2.cp.reactome.v2024.1.Mm.entrez.gmt",
-                         "m2.cp.kegg_legacy.v2024.1.Mm.entrez.gmt"))
+                         "m2.cp.reactome.v2024.1.Mm.entrez.gmt"))
   all_in_life=lapply(gmt.files[[sps]], function(gmt){
     df=read.gmt(file.path(source,sps,gmt))
     names(df)=c("gs_name", "entrez_gene")

diff --git a/inst/templates/rnaseq/00_libs/load_data.R b/inst/templates/rnaseq/00_libs/load_data.R
@@ -1,7 +1,10 @@
 library(tidyverse)
 library(SummarizedExperiment)
 library(janitor)
-load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){
+load_metrics <- function(se=se_object, multiqc=multiqc_data_dir,
+                         gtf=gtf_fn,
+                         counts=counts,
+                         single_end=FALSE){
 
   # bcbio input
   if (!is.na(se_object)){
@@ -32,18 +35,27 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){
       summarize(total_reads = sum(fastqc_raw_total_sequences))
 
     # This renames to user-friendly names the metrics columns
+    if (single_end){
+      metrics <- metrics %>%
+        dplyr::filter(!is.na(fastqc_raw_total_sequences))
+    }else{
+      metrics <- metrics %>%
+        dplyr::filter(is.na(fastqc_raw_total_sequences))
+    }
     metrics <- metrics %>%
-      dplyr::filter(is.na(fastqc_raw_total_sequences)) %>%
       remove_empty(which = 'cols') %>%
       full_join(total_reads) %>%
       mutate(mapped_reads = samtools_reads_mapped) %>%
-      mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>%
-      mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>%
-      mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>%
+      rowwise() %>%
+      mutate(exonic_rate = exonic/(exonic + intronic + intergenic)) %>%
+      mutate(intronic_rate = intronic/(exonic + intronic + intergenic)) %>%
+      mutate(intergenic_rate = intergenic/(exonic + intronic + intergenic)) %>%
       mutate(x5_3_bias = qualimap_5_3_bias)
 
     # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case
     gtf <- NULL
+    biotype <- NULL
+
     if (genome =="other"){
       gtf <- gtf_fn
     }else{
@@ -59,22 +71,21 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){
                          package="bcbioR")
     }
     if (is.null(gtf)) {
-      print("No genome provided! Please add it at the top of this Rmd")
-    }
-
-    gtf=rtracklayer::import(gtf)
-
-
-    one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
-    another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
-    biotype=NULL
-    if(length(one)==1){
-      biotype=one
-    }else if(length(another)==1){
-      biotype=another
+      warning("No genome provided! Please add it at the top of this Rmd")
     }else{
-      warning("No gene biotype founded")
+      gtf=rtracklayer::import(gtf)
+
+      one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
+      another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
+      if(length(one)==1){
+        biotype=one
+      }else if(length(another)==1){
+        biotype=another
+      }else{
+        warning("No gene biotype founded")
+      }
     }
+
     metrics$sample <- make.names(metrics$sample)
     if (!is.null(biotype)){
       annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)]

diff --git a/...plates/rnaseq/01_quality_assesment/QC.Rmd → ...lates/rnaseq/01_quality_assessment/QC.Rmd b/...plates/rnaseq/01_quality_assesment/QC.Rmd → ...lates/rnaseq/01_quality_assessment/QC.Rmd
@@ -14,17 +14,16 @@ output:
       toc_float:
          collapsed: true
          smooth_scroll: true
-editor_options: 
-  chunk_output_type: console
 params:
   # Fill this file with the right paths to nfcore output
   # Put hg38, mm10, mm39, or other
-  # params_file: params_qc_nf-core-example.R # example data
-  params_file: params_qc_nf-core-example.R
+  # params_file: params_qc-example.R # example data
+  params_file: params_qc-example.R
   genome: hg38
+  single_end: false
+  factor_of_interest: sample_type
   project_file: ../information.R
   functions_file: ../00_libs/load_data.R
-  factor_of_interest: sample_type
 ---
 
 Template developed with materials from https://hbctraining.github.io/main/.
@@ -47,6 +46,11 @@ This code is in this ![](https://img.shields.io/badge/status-stable-green) revis
 #    this is used to color plots, it needs to be part of the metadata
 factor_of_interest=params$factor_of_interest
 genome=params$genome
+<<<<<<< HEAD
+singleend=params$singleend
+=======
+single_end=params$single_end
+>>>>>>> c37fcd42e9e4ba276293a9bc591e6298affd14e5
 # 2. Set input files in this file
 source(params$params_file)
 # 3. If you set up this file, project information will be printed below and
@@ -122,9 +126,11 @@ coldata$sample=row.names(coldata)
 counts <- load_counts(counts_fn)
 counts <- counts[,colnames(counts) %in% coldata$sample]
 
-metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% 
+metrics <- load_metrics(se_object, multiqc_data_dir,
+                        gtf_fn, counts, single_end) %>% 
   left_join(coldata, by = c('sample')) %>% 
   as.data.frame()
+metrics <- subset(metrics, metrics$sample %in% coldata$sample)
 # TODO: change order as needed
 order <- unique(metrics[["sample"]])
 rownames(metrics) <- metrics$sample
@@ -160,15 +166,15 @@ Here, we want to see consistency and a minimum of 20 million reads (the grey lin
 ```{r plot_total_reads}
 metrics %>%
   ggplot(aes(x = factor(sample, level = order),
-             y = total_reads, 
+             y = total_reads/1000000, 
              fill = .data[[factor_of_interest]])) +
   geom_bar(stat = "identity") +
   coord_flip() +
-  scale_y_continuous(name = "million reads") +
+  scale_y_continuous(name = "Millions of reads") +
   scale_x_discrete(limits = rev) +
   xlab("") + 
   ggtitle("Total reads") +
-  geom_hline(yintercept=20000000, color = "grey", linewidth=2)
+  geom_hline(yintercept=20, color = "grey", linewidth=2)
 
 metrics %>%
     ggplot(aes(x = .data[[factor_of_interest]],
@@ -198,6 +204,7 @@ metrics %>%
                color = .data[[factor_of_interest]])) +
     geom_point(alpha = 0.5, size=4) +
     coord_flip() +
+    ylab("Mapped Reads %") +
     scale_x_discrete(limits = rev) +
     ylim(0, 100) +
   ggtitle("Mapping rate") + xlab("") +
@@ -253,13 +260,14 @@ This plot shows how complex the samples are. We expect samples with more reads t
 
 ```{r plot_gene_saturation}
 metrics %>% 
-  ggplot(aes(x = total_reads, 
+  ggplot(aes(x = total_reads/1000000, 
              y = n_genes,
              color = .data[[factor_of_interest]])) +
   geom_point(alpha = 0.5, size=4) +
   scale_x_log10() +
   ggtitle("Gene saturation") +
-  ylab("Number of genes")
+  ylab("Number of genes") +
+  xlab("Millions of reads")
 ```
 
 ## Exonic mapping rate
@@ -343,7 +351,7 @@ metrics %>%
 
 There should be little bias, i.e. the values should be close to 1, or at least consistent among samples
 
-```{r plot_53_bias}
+```{r plot_53_bias, eval=!all(is.na((metrics['r_and_t_rna_rate'])))}
 metrics %>%
   ggplot(aes(x = factor(sample, level = order),
              y = x5_3_bias, 
@@ -410,19 +418,6 @@ pca2 + scale_color_grafify(palette = "kelly")
 
 ```
 
-# Covariates analysis
-
-When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. 
-
-```{r covariate-plot,fig.height=12, fig.width=10}
-## Remove non-useful columns output by nf-core
-coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))])
-
-# Remove missing data
-coldat_2 <- na.omit(coldat_2)
-degCovariates(vst, metadata = coldat_2)
-```
-
 ## Hierarchical clustering
 
 Inter-correlation analysis (ICA) is another way to look at how well samples
@@ -453,6 +448,35 @@ p <- pheatmap(vst_cor,
 p
 ```
 
+# Covariates analysis
+
+When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. 
+
+```{r covariate-plot,fig.height=12, fig.width=10}
+## Remove non-useful columns output by nf-core
+coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))])
+
+# Remove missing data
+coldat_2 <- na.omit(coldat_2)
+degCovariates(vst, metadata = coldat_2)
+```
+
+# Conclusions
+
+
+
+# Methods
+
+RNA-seq counts were generated by the nf-core rnaseq pipeline [version] using Salmon (Patro et al. 2017). Downstream analyses were performed using `r version$version.string`. Counts were imported into R using DESeq2 version `r packageVersion("DESeq2")` (Love, Huber, and Anders 2014). Gene annotations were obtained from Ensembl. Plots were generated by ggplot2 (Wickham 2016). Heatmaps were generated by pheatmap (Kolde 2019).
+
+## R package references
+
+```{r citations}
+citation("DESeq2")
+citation("ggplot2")
+citation("pheatmap")
+```
+
 # R session
 
 List and version of tools used for the QC report generation.

diff --git a/.../01_quality_assesment/params_qc-example.R → ...01_quality_assessment/params_qc-example.R b/.../01_quality_assesment/params_qc-example.R → ...01_quality_assessment/params_qc-example.R
@@ -4,7 +4,6 @@
 # Example data
 coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv'
 counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv')
-se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds')
 # This folder is in the output directory inside multiqc folder
 multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/'
 # This file is inside the genome folder in the output directory

diff --git a/...s/rnaseq/01_quality_assesment/params_qc.R → .../rnaseq/01_quality_assessment/params_qc.R b/...s/rnaseq/01_quality_assesment/params_qc.R → .../rnaseq/01_quality_assessment/params_qc.R
@@ -2,11 +2,12 @@
 
 # Your data
 # This is the file used to run nf-core or compatible to that
-coldata_fn='/Path/to/metadata/meta.csv'
+coldata_fn <- '/Path/to/metadata/meta.csv'
 # This file is inside star_salmon/ folder
-counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv'
+counts_fn <- '/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv'
 # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder
-multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data'
+multiqc_data_dir <- '/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data'
 # This file is inside the genome folder in the output directory, use this only for non-model organism
 # gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf'
-se_object = NA
+gtf_fn <- NULL
+se_object <-  NA
diff --git a/...naseq/01_quality_assesment/run_markdown.R → ...aseq/01_quality_assessment/run_markdown.R b/...naseq/01_quality_assesment/run_markdown.R → ...aseq/01_quality_assessment/run_markdown.R