Skip to content

Commit

Permalink
Merge branch 'devel' into feature_pseudobulk_plot_ALB
Browse files Browse the repository at this point in the history
  • Loading branch information
lpantano committed Feb 3, 2025
2 parents 48a0293 + 7ce927d commit b108c1a
Show file tree
Hide file tree
Showing 31 changed files with 1,861 additions and 117 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
branches: [main, master, devel]
pull_request:
branches: [main, master]
branches: [main, master, devel]

name: R-CMD-check

Expand Down
2 changes: 0 additions & 2 deletions inst/templates/base/reports/example.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ output:
toc_float:
collapsed: true
smooth_scroll: true
editor_options:
chunk_output_type: console
params:
project_file: ../information.R
---
Expand Down
2 changes: 0 additions & 2 deletions inst/templates/chipseq/QC/QC.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ output:
toc_float:
collapsed: true
smooth_scroll: true
editor_options:
chunk_output_type: console
params:
# Fill this file with the right paths to nfcore output
# params_file: params_qc_nf-core-example.R # example data
Expand Down
2 changes: 0 additions & 2 deletions inst/templates/chipseq/diffbind/diffbind.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ output:
toc_float:
collapsed: true
smooth_scroll: true
editor_options:
chunk_output_type: console
params:
# Fill this file with the right paths to nfcore output
# .qs file name for saving DiffBind Counts object
Expand Down
2 changes: 0 additions & 2 deletions inst/templates/methylation/QC/QC.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ output:
toc_float:
collapsed: true
smooth_scroll: true
editor_options:
chunk_output_type: console
params:
params_file: ../information.R
meta_fn: ../meta/methylation_mucci_hbc04926.csv
Expand Down
7 changes: 3 additions & 4 deletions inst/templates/rnaseq/00_libs/FA.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@ library(clusterProfiler)
source <- "https://github.com/bcbio/resources/raw/refs/heads/main/gene_sets/gene_sets/20240904"
get_databases_v2=function(sps="human"){
gmt.files=list(human=c("h.all.v2024.1.Hs.entrez.gmt",
"c5.go.v2024.1.Hs.entrez.gmt",
#"c5.go.v2024.1.Hs.entrez.gmt",
"c5.go.mf.v2024.1.Hs.entrez.gmt",
"c5.go.cc.v2024.1.Hs.entrez.gmt",
"c5.go.bp.v2024.1.Hs.entrez.gmt",
"c2.cp.reactome.v2024.1.Hs.entrez.gmt",
"c2.cp.kegg_legacy.v2024.1.Hs.entrez.gmt"),
mouse=c("mh.all.v2024.1.Mm.entrez.gmt",
"m5.go.v2024.1.Mm.entrez.gmt",
#"m5.go.v2024.1.Mm.entrez.gmt",
"m5.go.mf.v2024.1.Mm.entrez.gmt",
"m5.go.cc.v2024.1.Mm.entrez.gmt",
"m5.go.bp.v2024.1.Mm.entrez.gmt",
"m2.cp.reactome.v2024.1.Mm.entrez.gmt",
"m2.cp.kegg_legacy.v2024.1.Mm.entrez.gmt"))
"m2.cp.reactome.v2024.1.Mm.entrez.gmt"))
all_in_life=lapply(gmt.files[[sps]], function(gmt){
df=read.gmt(file.path(source,sps,gmt))
names(df)=c("gs_name", "entrez_gene")
Expand Down
49 changes: 30 additions & 19 deletions inst/templates/rnaseq/00_libs/load_data.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
library(tidyverse)
library(SummarizedExperiment)
library(janitor)
load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){
load_metrics <- function(se=se_object, multiqc=multiqc_data_dir,
gtf=gtf_fn,
counts=counts,
single_end=FALSE){

# bcbio input
if (!is.na(se_object)){
Expand Down Expand Up @@ -32,18 +35,27 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){
summarize(total_reads = sum(fastqc_raw_total_sequences))

# This renames to user-friendly names the metrics columns
if (single_end){
metrics <- metrics %>%
dplyr::filter(!is.na(fastqc_raw_total_sequences))
}else{
metrics <- metrics %>%
dplyr::filter(is.na(fastqc_raw_total_sequences))
}
metrics <- metrics %>%
dplyr::filter(is.na(fastqc_raw_total_sequences)) %>%
remove_empty(which = 'cols') %>%
full_join(total_reads) %>%
mutate(mapped_reads = samtools_reads_mapped) %>%
mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>%
mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>%
mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>%
rowwise() %>%
mutate(exonic_rate = exonic/(exonic + intronic + intergenic)) %>%
mutate(intronic_rate = intronic/(exonic + intronic + intergenic)) %>%
mutate(intergenic_rate = intergenic/(exonic + intronic + intergenic)) %>%
mutate(x5_3_bias = qualimap_5_3_bias)

# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case
gtf <- NULL
biotype <- NULL

if (genome =="other"){
gtf <- gtf_fn
}else{
Expand All @@ -59,22 +71,21 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){
package="bcbioR")
}
if (is.null(gtf)) {
print("No genome provided! Please add it at the top of this Rmd")
}

gtf=rtracklayer::import(gtf)


one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
biotype=NULL
if(length(one)==1){
biotype=one
}else if(length(another)==1){
biotype=another
warning("No genome provided! Please add it at the top of this Rmd")
}else{
warning("No gene biotype founded")
gtf=rtracklayer::import(gtf)

one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
if(length(one)==1){
biotype=one
}else if(length(another)==1){
biotype=another
}else{
warning("No gene biotype founded")
}
}

metrics$sample <- make.names(metrics$sample)
if (!is.null(biotype)){
annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,16 @@ output:
toc_float:
collapsed: true
smooth_scroll: true
editor_options:
chunk_output_type: console
params:
# Fill this file with the right paths to nfcore output
# Put hg38, mm10, mm39, or other
# params_file: params_qc_nf-core-example.R # example data
params_file: params_qc_nf-core-example.R
# params_file: params_qc-example.R # example data
params_file: params_qc-example.R
genome: hg38
single_end: false
factor_of_interest: sample_type
project_file: ../information.R
functions_file: ../00_libs/load_data.R
factor_of_interest: sample_type
---

Template developed with materials from https://hbctraining.github.io/main/.
Expand All @@ -47,6 +46,11 @@ This code is in this ![](https://img.shields.io/badge/status-stable-green) revis
# this is used to color plots, it needs to be part of the metadata
factor_of_interest=params$factor_of_interest
genome=params$genome
<<<<<<< HEAD
singleend=params$singleend
=======
single_end=params$single_end
>>>>>>> c37fcd42e9e4ba276293a9bc591e6298affd14e5
# 2. Set input files in this file
source(params$params_file)
# 3. If you set up this file, project information will be printed below and
Expand Down Expand Up @@ -122,9 +126,11 @@ coldata$sample=row.names(coldata)
counts <- load_counts(counts_fn)
counts <- counts[,colnames(counts) %in% coldata$sample]
metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>%
metrics <- load_metrics(se_object, multiqc_data_dir,
gtf_fn, counts, single_end) %>%
left_join(coldata, by = c('sample')) %>%
as.data.frame()
metrics <- subset(metrics, metrics$sample %in% coldata$sample)
# TODO: change order as needed
order <- unique(metrics[["sample"]])
rownames(metrics) <- metrics$sample
Expand Down Expand Up @@ -160,15 +166,15 @@ Here, we want to see consistency and a minimum of 20 million reads (the grey lin
```{r plot_total_reads}
metrics %>%
ggplot(aes(x = factor(sample, level = order),
y = total_reads,
y = total_reads/1000000,
fill = .data[[factor_of_interest]])) +
geom_bar(stat = "identity") +
coord_flip() +
scale_y_continuous(name = "million reads") +
scale_y_continuous(name = "Millions of reads") +
scale_x_discrete(limits = rev) +
xlab("") +
ggtitle("Total reads") +
geom_hline(yintercept=20000000, color = "grey", linewidth=2)
geom_hline(yintercept=20, color = "grey", linewidth=2)
metrics %>%
ggplot(aes(x = .data[[factor_of_interest]],
Expand Down Expand Up @@ -198,6 +204,7 @@ metrics %>%
color = .data[[factor_of_interest]])) +
geom_point(alpha = 0.5, size=4) +
coord_flip() +
ylab("Mapped Reads %") +
scale_x_discrete(limits = rev) +
ylim(0, 100) +
ggtitle("Mapping rate") + xlab("") +
Expand Down Expand Up @@ -253,13 +260,14 @@ This plot shows how complex the samples are. We expect samples with more reads t

```{r plot_gene_saturation}
metrics %>%
ggplot(aes(x = total_reads,
ggplot(aes(x = total_reads/1000000,
y = n_genes,
color = .data[[factor_of_interest]])) +
geom_point(alpha = 0.5, size=4) +
scale_x_log10() +
ggtitle("Gene saturation") +
ylab("Number of genes")
ylab("Number of genes") +
xlab("Millions of reads")
```

## Exonic mapping rate
Expand Down Expand Up @@ -343,7 +351,7 @@ metrics %>%

There should be little bias, i.e. the values should be close to 1, or at least consistent among samples

```{r plot_53_bias}
```{r plot_53_bias, eval=!all(is.na((metrics['r_and_t_rna_rate'])))}
metrics %>%
ggplot(aes(x = factor(sample, level = order),
y = x5_3_bias,
Expand Down Expand Up @@ -410,19 +418,6 @@ pca2 + scale_color_grafify(palette = "kelly")
```

# Covariates analysis

When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor.

```{r covariate-plot,fig.height=12, fig.width=10}
## Remove non-useful columns output by nf-core
coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))])
# Remove missing data
coldat_2 <- na.omit(coldat_2)
degCovariates(vst, metadata = coldat_2)
```

## Hierarchical clustering

Inter-correlation analysis (ICA) is another way to look at how well samples
Expand Down Expand Up @@ -453,6 +448,35 @@ p <- pheatmap(vst_cor,
p
```

# Covariates analysis

When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor.

```{r covariate-plot,fig.height=12, fig.width=10}
## Remove non-useful columns output by nf-core
coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))])
# Remove missing data
coldat_2 <- na.omit(coldat_2)
degCovariates(vst, metadata = coldat_2)
```

# Conclusions



# Methods

RNA-seq counts were generated by the nf-core rnaseq pipeline [version] using Salmon (Patro et al. 2017). Downstream analyses were performed using `r version$version.string`. Counts were imported into R using DESeq2 version `r packageVersion("DESeq2")` (Love, Huber, and Anders 2014). Gene annotations were obtained from Ensembl. Plots were generated by ggplot2 (Wickham 2016). Heatmaps were generated by pheatmap (Kolde 2019).

## R package references

```{r citations}
citation("DESeq2")
citation("ggplot2")
citation("pheatmap")
```

# R session

List and version of tools used for the QC report generation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# Example data
coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv'
counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv')
se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds')
# This folder is in the output directory inside multiqc folder
multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/'
# This file is inside the genome folder in the output directory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

# Your data
# This is the file used to run nf-core or compatible to that
coldata_fn='/Path/to/metadata/meta.csv'
coldata_fn <- '/Path/to/metadata/meta.csv'
# This file is inside star_salmon/ folder
counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv'
counts_fn <- '/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv'
# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder
multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data'
multiqc_data_dir <- '/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data'
# This file is inside the genome folder in the output directory, use this only for non-model organism
# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf'
se_object = NA
gtf_fn <- NULL
se_object <- NA
Loading

0 comments on commit b108c1a

Please sign in to comment.