diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 45c8fce..5a54ec7 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, master, devel] pull_request: - branches: [main, master] + branches: [main, master, devel] name: R-CMD-check diff --git a/inst/templates/base/reports/example.Rmd b/inst/templates/base/reports/example.Rmd index 2a0d7f8..b72a0cb 100644 --- a/inst/templates/base/reports/example.Rmd +++ b/inst/templates/base/reports/example.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: project_file: ../information.R --- diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index 85efbb7..d2fff81 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: # Fill this file with the right paths to nfcore output # params_file: params_qc_nf-core-example.R # example data diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 03ab342..442e0c4 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: # Fill this file with the right paths to nfcore output # .qs file name for saving DiffBind Counts object diff --git a/inst/templates/methylation/QC/QC.Rmd b/inst/templates/methylation/QC/QC.Rmd index a3bb540..3bcac0c 100644 --- a/inst/templates/methylation/QC/QC.Rmd +++ b/inst/templates/methylation/QC/QC.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: params_file: ../information.R meta_fn: ../meta/methylation_mucci_hbc04926.csv diff --git a/inst/templates/rnaseq/00_libs/FA.R b/inst/templates/rnaseq/00_libs/FA.R index 7d09d0e..1bfafc6 100644 --- a/inst/templates/rnaseq/00_libs/FA.R +++ b/inst/templates/rnaseq/00_libs/FA.R @@ -3,19 +3,18 @@ library(clusterProfiler) source <- "https://github.com/bcbio/resources/raw/refs/heads/main/gene_sets/gene_sets/20240904" get_databases_v2=function(sps="human"){ gmt.files=list(human=c("h.all.v2024.1.Hs.entrez.gmt", - "c5.go.v2024.1.Hs.entrez.gmt", + #"c5.go.v2024.1.Hs.entrez.gmt", "c5.go.mf.v2024.1.Hs.entrez.gmt", "c5.go.cc.v2024.1.Hs.entrez.gmt", "c5.go.bp.v2024.1.Hs.entrez.gmt", "c2.cp.reactome.v2024.1.Hs.entrez.gmt", "c2.cp.kegg_legacy.v2024.1.Hs.entrez.gmt"), mouse=c("mh.all.v2024.1.Mm.entrez.gmt", - "m5.go.v2024.1.Mm.entrez.gmt", + #"m5.go.v2024.1.Mm.entrez.gmt", "m5.go.mf.v2024.1.Mm.entrez.gmt", "m5.go.cc.v2024.1.Mm.entrez.gmt", "m5.go.bp.v2024.1.Mm.entrez.gmt", - "m2.cp.reactome.v2024.1.Mm.entrez.gmt", - "m2.cp.kegg_legacy.v2024.1.Mm.entrez.gmt")) + "m2.cp.reactome.v2024.1.Mm.entrez.gmt")) all_in_life=lapply(gmt.files[[sps]], function(gmt){ df=read.gmt(file.path(source,sps,gmt)) names(df)=c("gs_name", "entrez_gene") diff --git a/inst/templates/rnaseq/00_libs/load_data.R b/inst/templates/rnaseq/00_libs/load_data.R index ca0a9b1..ad39dc2 100644 --- a/inst/templates/rnaseq/00_libs/load_data.R +++ b/inst/templates/rnaseq/00_libs/load_data.R @@ -1,7 +1,10 @@ library(tidyverse) library(SummarizedExperiment) library(janitor) -load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ +load_metrics <- function(se=se_object, multiqc=multiqc_data_dir, + gtf=gtf_fn, + counts=counts, + single_end=FALSE){ # bcbio input if (!is.na(se_object)){ @@ -32,18 +35,27 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ summarize(total_reads = sum(fastqc_raw_total_sequences)) # This renames to user-friendly names the metrics columns + if (single_end){ + metrics <- metrics %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) + }else{ + metrics <- metrics %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) + } metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% remove_empty(which = 'cols') %>% full_join(total_reads) %>% mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + rowwise() %>% + mutate(exonic_rate = exonic/(exonic + intronic + intergenic)) %>% + mutate(intronic_rate = intronic/(exonic + intronic + intergenic)) %>% + mutate(intergenic_rate = intergenic/(exonic + intronic + intergenic)) %>% mutate(x5_3_bias = qualimap_5_3_bias) # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case gtf <- NULL + biotype <- NULL + if (genome =="other"){ gtf <- gtf_fn }else{ @@ -59,22 +71,21 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ package="bcbioR") } if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") - } - - gtf=rtracklayer::import(gtf) - - - one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) - another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) - biotype=NULL - if(length(one)==1){ - biotype=one - }else if(length(another)==1){ - biotype=another + warning("No genome provided! Please add it at the top of this Rmd") }else{ - warning("No gene biotype founded") + gtf=rtracklayer::import(gtf) + + one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) + another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) + if(length(one)==1){ + biotype=one + }else if(length(another)==1){ + biotype=another + }else{ + warning("No gene biotype founded") + } } + metrics$sample <- make.names(metrics$sample) if (!is.null(biotype)){ annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] diff --git a/inst/templates/rnaseq/01_quality_assesment/QC.Rmd b/inst/templates/rnaseq/01_quality_assessment/QC.Rmd similarity index 91% rename from inst/templates/rnaseq/01_quality_assesment/QC.Rmd rename to inst/templates/rnaseq/01_quality_assessment/QC.Rmd index 0ea09d0..c767582 100644 --- a/inst/templates/rnaseq/01_quality_assesment/QC.Rmd +++ b/inst/templates/rnaseq/01_quality_assessment/QC.Rmd @@ -14,17 +14,16 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: # Fill this file with the right paths to nfcore output # Put hg38, mm10, mm39, or other - # params_file: params_qc_nf-core-example.R # example data - params_file: params_qc_nf-core-example.R + # params_file: params_qc-example.R # example data + params_file: params_qc-example.R genome: hg38 + single_end: false + factor_of_interest: sample_type project_file: ../information.R functions_file: ../00_libs/load_data.R - factor_of_interest: sample_type --- Template developed with materials from https://hbctraining.github.io/main/. @@ -47,6 +46,11 @@ This code is in this ![](https://img.shields.io/badge/status-stable-green) revis # this is used to color plots, it needs to be part of the metadata factor_of_interest=params$factor_of_interest genome=params$genome +<<<<<<< HEAD +singleend=params$singleend +======= +single_end=params$single_end +>>>>>>> c37fcd42e9e4ba276293a9bc591e6298affd14e5 # 2. Set input files in this file source(params$params_file) # 3. If you set up this file, project information will be printed below and @@ -122,9 +126,11 @@ coldata$sample=row.names(coldata) counts <- load_counts(counts_fn) counts <- counts[,colnames(counts) %in% coldata$sample] -metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% +metrics <- load_metrics(se_object, multiqc_data_dir, + gtf_fn, counts, single_end) %>% left_join(coldata, by = c('sample')) %>% as.data.frame() +metrics <- subset(metrics, metrics$sample %in% coldata$sample) # TODO: change order as needed order <- unique(metrics[["sample"]]) rownames(metrics) <- metrics$sample @@ -160,15 +166,15 @@ Here, we want to see consistency and a minimum of 20 million reads (the grey lin ```{r plot_total_reads} metrics %>% ggplot(aes(x = factor(sample, level = order), - y = total_reads, + y = total_reads/1000000, fill = .data[[factor_of_interest]])) + geom_bar(stat = "identity") + coord_flip() + - scale_y_continuous(name = "million reads") + + scale_y_continuous(name = "Millions of reads") + scale_x_discrete(limits = rev) + xlab("") + ggtitle("Total reads") + - geom_hline(yintercept=20000000, color = "grey", linewidth=2) + geom_hline(yintercept=20, color = "grey", linewidth=2) metrics %>% ggplot(aes(x = .data[[factor_of_interest]], @@ -198,6 +204,7 @@ metrics %>% color = .data[[factor_of_interest]])) + geom_point(alpha = 0.5, size=4) + coord_flip() + + ylab("Mapped Reads %") + scale_x_discrete(limits = rev) + ylim(0, 100) + ggtitle("Mapping rate") + xlab("") + @@ -253,13 +260,14 @@ This plot shows how complex the samples are. We expect samples with more reads t ```{r plot_gene_saturation} metrics %>% - ggplot(aes(x = total_reads, + ggplot(aes(x = total_reads/1000000, y = n_genes, color = .data[[factor_of_interest]])) + geom_point(alpha = 0.5, size=4) + scale_x_log10() + ggtitle("Gene saturation") + - ylab("Number of genes") + ylab("Number of genes") + + xlab("Millions of reads") ``` ## Exonic mapping rate @@ -343,7 +351,7 @@ metrics %>% There should be little bias, i.e. the values should be close to 1, or at least consistent among samples -```{r plot_53_bias} +```{r plot_53_bias, eval=!all(is.na((metrics['r_and_t_rna_rate'])))} metrics %>% ggplot(aes(x = factor(sample, level = order), y = x5_3_bias, @@ -410,19 +418,6 @@ pca2 + scale_color_grafify(palette = "kelly") ``` -# Covariates analysis - -When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. - -```{r covariate-plot,fig.height=12, fig.width=10} -## Remove non-useful columns output by nf-core -coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) - -# Remove missing data -coldat_2 <- na.omit(coldat_2) -degCovariates(vst, metadata = coldat_2) -``` - ## Hierarchical clustering Inter-correlation analysis (ICA) is another way to look at how well samples @@ -453,6 +448,35 @@ p <- pheatmap(vst_cor, p ``` +# Covariates analysis + +When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. + +```{r covariate-plot,fig.height=12, fig.width=10} +## Remove non-useful columns output by nf-core +coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) + +# Remove missing data +coldat_2 <- na.omit(coldat_2) +degCovariates(vst, metadata = coldat_2) +``` + +# Conclusions + + + +# Methods + +RNA-seq counts were generated by the nf-core rnaseq pipeline [version] using Salmon (Patro et al. 2017). Downstream analyses were performed using `r version$version.string`. Counts were imported into R using DESeq2 version `r packageVersion("DESeq2")` (Love, Huber, and Anders 2014). Gene annotations were obtained from Ensembl. Plots were generated by ggplot2 (Wickham 2016). Heatmaps were generated by pheatmap (Kolde 2019). + +## R package references + +```{r citations} +citation("DESeq2") +citation("ggplot2") +citation("pheatmap") +``` + # R session List and version of tools used for the QC report generation. diff --git a/inst/templates/rnaseq/01_quality_assesment/params_qc-example.R b/inst/templates/rnaseq/01_quality_assessment/params_qc-example.R similarity index 82% rename from inst/templates/rnaseq/01_quality_assesment/params_qc-example.R rename to inst/templates/rnaseq/01_quality_assessment/params_qc-example.R index b364967..4eac43f 100644 --- a/inst/templates/rnaseq/01_quality_assesment/params_qc-example.R +++ b/inst/templates/rnaseq/01_quality_assessment/params_qc-example.R @@ -4,7 +4,6 @@ # Example data coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') -se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') # This folder is in the output directory inside multiqc folder multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' # This file is inside the genome folder in the output directory diff --git a/inst/templates/rnaseq/01_quality_assesment/params_qc.R b/inst/templates/rnaseq/01_quality_assessment/params_qc.R similarity index 62% rename from inst/templates/rnaseq/01_quality_assesment/params_qc.R rename to inst/templates/rnaseq/01_quality_assessment/params_qc.R index 897b6b0..795c000 100644 --- a/inst/templates/rnaseq/01_quality_assesment/params_qc.R +++ b/inst/templates/rnaseq/01_quality_assessment/params_qc.R @@ -2,11 +2,12 @@ # Your data # This is the file used to run nf-core or compatible to that -coldata_fn='/Path/to/metadata/meta.csv' +coldata_fn <- '/Path/to/metadata/meta.csv' # This file is inside star_salmon/ folder -counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' +counts_fn <- '/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder -multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' +multiqc_data_dir <- '/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' # This file is inside the genome folder in the output directory, use this only for non-model organism # gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' -se_object = NA +gtf_fn <- NULL +se_object <- NA diff --git a/inst/templates/rnaseq/01_quality_assesment/run_markdown.R b/inst/templates/rnaseq/01_quality_assessment/run_markdown.R similarity index 100% rename from inst/templates/rnaseq/01_quality_assesment/run_markdown.R rename to inst/templates/rnaseq/01_quality_assessment/run_markdown.R diff --git a/inst/templates/rnaseq/02_differential_expression/DEG.Rmd b/inst/templates/rnaseq/02_differential_expression/DEG.Rmd index 6fd84b0..3441cef 100644 --- a/inst/templates/rnaseq/02_differential_expression/DEG.Rmd +++ b/inst/templates/rnaseq/02_differential_expression/DEG.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: inline params: ## Combatseq and ruv can both be false or ONLY ONE can be true # numerator: tumor @@ -57,8 +55,15 @@ source(params$project_file) map(list.files(params$functions_file,pattern = "*.R$",full.names = T), source) %>% invisible() # IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) genome=params$genome -library(org.Hs.eg.db) -ann.org=org.Hs.eg.db +if(grepl(genome, "hg|human", ignore.case = TRUE)) { + library(org.Hs.eg.db) + ann.org=org.Hs.eg.db +} else if(grepl(genome, "mm|mouse", ignore.case = TRUE)) { + library(org.Mm.eg.db) + ann.org=org.Mm.eg.db +} else { + warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.") +} column=params$column # use contrast to set up multiple comparisons @@ -145,6 +150,7 @@ metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% left_join(coldata, by = c('sample')) %>% as.data.frame() rownames(metrics) <- metrics$sample +metrics <- subset(metrics, metrics$sample %in% coldata$sample) # if the names don't match in order or string check files names and coldata information counts = counts[,rownames(metrics)] coldata = coldata[rownames(metrics),] @@ -161,9 +167,9 @@ stopifnot(all(names(counts) == rownames(metrics))) - Aim: `r aim` ```{r load_counts_data} -rdata = AnnotationDbi::select(ann.org, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% - dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% - distinct(gene_id, .keep_all = TRUE) +rdata = AnnotationDbi::select(ann.org, rownames(counts), columns=c('SYMBOL', 'ENTREZID'), keytype = 'ENSEMBL') %>% +dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL, entrez= ENTREZID) %>% +distinct(gene_id, .keep_all = TRUE) ``` @@ -455,7 +461,11 @@ This volcano plot shows the genes that are significantly up- and down-regulated for (contrast in names(de_list)){ cat("### ", contrast, "\n\n") res <- de_list[[contrast]][["all"]] - res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) +res_mod <- res + +## The below line of code will change anything with an absolute(lfc) above 5 to a 5. This will help remove outliers to better frame the graph if wanted. +# res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) + show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) p1=EnhancedVolcano(res_mod, lab= res_mod$gene_name, @@ -465,11 +475,9 @@ for (contrast in names(de_list)){ x = 'lfc', y = 'padj', title=contrast, - col=as.vector(colors[c("darkgrey", - "lightblue", - "purple", - "purple")]), - subtitle = "", drawConnectors = T, max.overlaps = Inf) + col=col=c("darkgrey", "lightblue", "plum1", "purple"), + # xlim = c(-5,5), ## set the x-axis to only show some points + subtitle = "", drawConnectors = T, max.overlaps = Inf ) print(p1) cat("\n\n") } @@ -553,7 +561,16 @@ for (contrast in names(de_list)){ From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. ```{r} -all_in_life=get_databases_v2(sps="human") +if(grepl(genome, "hg|human", ignore.case = TRUE)) { + all_in_life=get_databases_v2(sps="human") + run_FA=TRUE +} else if(grepl(genome, "mm|mouse", ignore.case = TRUE)) { + all_in_life=get_databases_v2(sps="mouse") + run_FA=TRUE +} else { + warning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") + run_FA=FALSE +} ``` # Pathway Analysis- GSEA @@ -564,7 +581,7 @@ Gene Set Enrichment Analysis (GSEA) is a computational method used to determine - Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge. - Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes. -```{r, warning=F, message=F} +```{r, warning=F, message=F, eval=run_FA} fa_gsea_list=lapply(de_list,function(contrast){ res=contrast[["all"]] @@ -580,7 +597,7 @@ fa_gsea_list=lapply(de_list,function(contrast){ }) ``` -```{r, results='asis'} +```{r, results='asis', eval=run_FA} # NOTE DT::datatables doesn't work with tabset and for loops # You can use the following code to print dynamically or call manually sanitize_datatable() # multiple times @@ -602,7 +619,7 @@ Over-Representation Analysis (ORA) is a statistical method used to determine whe - Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied. - Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets. -```{r, warning=F, message=F} +```{r, warning=F, message=F, eval=run_FA} fa_list=lapply(de_list,function(contrast){ res=contrast[["all"]] @@ -631,7 +648,7 @@ fa_list=lapply(de_list,function(contrast){ ## All significant genes {.tabset} -```{r, results='asis'} +```{r, results='asis', eval=run_FA} # NOTE DT::datatables doesn't work with tabset and for loops # You can use the following code to print dynamically or call manually sanitize_datatable() # multiple times @@ -648,7 +665,7 @@ tagList(dt_list) ## Down-regulated genes {.tabset} -```{r, results='asis'} +```{r, results='asis', eval=run_FA} dt_list=list() for (contrast in names(de_list)){ res_sig=fa_list[[contrast]][["down"]] %>% filter(padj < 0.05) @@ -662,7 +679,7 @@ tagList(dt_list) ## Up-regulated genes {.tabset} -```{r, results='asis'} +```{r, results='asis', eval=run_FA} dt_list=list() for (contrast in names(de_list)){ res_sig=fa_list[[contrast]][["up"]] %>% filter(padj < 0.05) @@ -682,12 +699,15 @@ if (!is.null(subset_value) & !is.null(subset_value)){ } else { filenames = "full" } + +filename = paste0(filenames) +name_expression_fn=file.path( + basedir, + str_interp("${filename}_expression.csv")) +write_csv(counts_norm, name_expression_fn) + for (contrast in names(contrasts)){ - filename = paste0(filenames, "_", contrast) - name_expression_fn=file.path( - basedir, - str_interp("${filename}_expression.csv")) - + name_deg_fn=file.path( basedir, str_interp("${filename}_deg.csv")) @@ -708,7 +728,6 @@ for (contrast in names(contrasts)){ pathways_for_writing <- fa_list[[contrast]][["all"]] %>% mutate(comparison = contrast) - write_csv(counts_norm, name_expression_fn) write_csv(res_for_writing, name_deg_fn) write_csv(pathways_for_writing, name_pathways_fn) } diff --git a/inst/templates/rnaseq/03_comparative/Intersections.Rmd b/inst/templates/rnaseq/03_comparative/Intersections.Rmd index 02f01d4..a06a737 100644 --- a/inst/templates/rnaseq/03_comparative/Intersections.Rmd +++ b/inst/templates/rnaseq/03_comparative/Intersections.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: project_file: ../information.R --- diff --git a/inst/templates/rnaseq/03_comparative/Pair-wise-comparison-analysis.Rmd b/inst/templates/rnaseq/03_comparative/Pair-wise-comparison-analysis.Rmd index d61b6a8..ca9f84a 100644 --- a/inst/templates/rnaseq/03_comparative/Pair-wise-comparison-analysis.Rmd +++ b/inst/templates/rnaseq/03_comparative/Pair-wise-comparison-analysis.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: project_file: ../information.R --- diff --git a/inst/templates/rnaseq/03_functional/GSVA.Rmd b/inst/templates/rnaseq/03_functional/GSVA.Rmd index d43db2f..18d723b 100644 --- a/inst/templates/rnaseq/03_functional/GSVA.Rmd +++ b/inst/templates/rnaseq/03_functional/GSVA.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: inline params: # set column name and contrasts to be factors of interest column: "sample_type" diff --git a/inst/templates/rnaseq/03_functional/Immune-deconvolution.Rmd b/inst/templates/rnaseq/03_functional/Immune-deconvolution.Rmd index 61b3596..1bf619e 100644 --- a/inst/templates/rnaseq/03_functional/Immune-deconvolution.Rmd +++ b/inst/templates/rnaseq/03_functional/Immune-deconvolution.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: inline params: # information about project: project name, PI, analyst, experiment, aim project_file: ../information.R diff --git a/inst/templates/rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd b/inst/templates/rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd index c0f105f..e92d3ec 100644 --- a/inst/templates/rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd +++ b/inst/templates/rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: inline params: # provide path to metadata file with species information, input and reference file locations, column names params_file: params_nonmodel_org_pathways.R @@ -395,4 +393,4 @@ List and version of tools used for the FA report generation. ```{r session_info} sessionInfo() -``` \ No newline at end of file +``` diff --git a/inst/templates/rnaseq/04_gene_patterns/WGCNA.Rmd b/inst/templates/rnaseq/04_gene_patterns/WGCNA.Rmd new file mode 100644 index 0000000..34c026c --- /dev/null +++ b/inst/templates/rnaseq/04_gene_patterns/WGCNA.Rmd @@ -0,0 +1,361 @@ +--- +title: "Weighted Gene Co-Expression Network Analysis (WGCNA)" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: inline +params: + minModuleSize: 30 + maxBlockSize: 4000 + mergeCutHeight: 0.25 + column: "sample_type" + project_file: ../information.R + params_file: params_de-example.R + functions_file: ../libs +--- + +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +if (compareVersion(R.version$minor,"3.1")<0) warning("We recommend >= R4.3.1") +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +source(params$project_file) +source(params$params_file) +map(list.files(params$functions_file,pattern = "*.R$",full.names = T),source) %>% invisible() +column=params$column +contrasts=params$contrasts +subset_column=params$subset_column +subset_value=params$subset_value +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(WGCNA) +library(tidyverse) +library(DESeq2) +library(magrittr) +library(rtracklayer) +library(flashClust) +library(gridExtra) +library(DT) +library(AnnotationDbi) +library(bcbioR) + +library(knitr) +library(ggprism) + + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +```{r load_data} + +coldata <- load_coldata(coldata_fn, column, + subset_column, subset_value) +#coldata[[contrasts[[1]][1]]] = relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3]) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +# It's easier for future items if the metadata(genotype) is already set up as a factor +#coldata$sample_type <- as.factor(coldata$sample_type) + +``` + +# WGCNA + +WGCNA carries out a correlation network construction. Networks are visual representations of interactions between `nodes` in a system. The nodes in WGCNA are individual genes. So, it helps to visualize patterns and relationships between gene expression profiles to identify genes associated with measured traits as well as identifying genes that are consistently co-expressed and could be contributing to similar molecular pathways. It also accounts for inter-individual variation in gene expression and alleviates issues associated with multiple testing. + +We will use DESeq2 to transform our RNA-seq count data before running WGCNA. Removing the low count genes before the WGCNA is recommended and can help improve the WGCNA results. + +For this analysis we are keeping genes with **10 or more reads in total and expressed in at least 3 samples.** + +For the data normalization we will use variance stabilizing transformation as recommended by the WGCNA's authors and transpose the dataset to prepare for WGCNA analysis. + + +# Data + +```{r show_coldata} +coldata %>% sanitize_datatable() +``` + +```{r normalize_data} +dds <- DESeqDataSetFromMatrix(counts, + colData = coldata, + design = ~ 1) + +## Filtering lowly expressed genes +# We are filtering out genes with fewer than 10 raw counts in total and are present in fewer than 3 samples. +keep <- rowSums(counts(dds)>=10) >=4 +dds <- dds[keep, ] + +# Retrive the normalized data from the DESeqDataSet +dds_norm <- vst(dds) +normalized_counts <- assay(dds_norm) %>% + t() # Transpose this data +#At this point you can look into the data and remove any outlier samples, as those outlier samples can affect the WGCNA results. However, our dataset as analysed earlier doesnot have obvious outliers. + +``` + +## Parameter settings for WGCNA +WGCNA creates a weighted network to define which genes are near each other. The measure of adjacency used is based on the correlation matrix, which requires the definition of a threshold value, which in turn depends on a "power" parameter that defines the exponent used when transforming the correlation values. + +The choice of power parameter will affect the number of modules identified. WGCNA has a function called pickSoftThreshold() to help determine the soft power threshold. + +```{r, message=FALSE, warning=FALSE,fig.align='center'} + +# determine parameters for WGCNA +sft <- pickSoftThreshold(normalized_counts, + dataIsExpr = TRUE, + corFnc = cor, + networkType = "signed" +) +# We have to calculate a measure of the model fit, the signed R2, and make that a new variable. + +sft_df <- data.frame(sft$fitIndices) %>% + dplyr::mutate(model_fit = -sign(slope) * SFT.R.sq) + + +ggplot(sft_df, aes(x = Power, y = model_fit, label = Power)) + + geom_point() + + geom_text(nudge_y = 0.1) + + geom_hline(yintercept = 0.80, col = "red") + + ylim(c(min(sft_df$model_fit), 1.1)) + + xlab("Soft Threshold (power)") + + ylab("Scale Free Topology Model Fit, signed R^2") + + ggtitle("Scale independence") + + theme_classic() +``` + +It is recommend to use a power that has an signed R2 above 0.80, otherwise the results may be too noisy to draw any meanings. In case of multiple power values with signed R2 above 0.80, we need to pick the one at an inflection point or where the R2 value seem to have reached their saturation. You want a power that gives a big enough R2 but is not excessively large. + +## One-step blockwise module detection: + +We will use the blockwiseModules() with power threshold of 20 to find co-expressed genes. It will construct modules with group of genes that are highly correlated. + +```{r, message=FALSE, warning=FALSE} +# picking a soft threshold power near the curve of the plot +picked_power = 20 ## pick a power here based on the above plot and instruction +temp_cor <- cor +cor <- WGCNA::cor +bwnet <- blockwiseModules(normalized_counts, + + # == Adjacency Function == + power = picked_power, + networkType = "signed", # there is option to run unsigned networktype as well. + + # == Set Tree and Block parameters == + deepSplit = 2, + pamRespectsDendro = F, + # detectCutHeight = 0.75, + minModuleSize = params$minModuleSize, + maxBlockSize = params$maxBlockSize, + + # == Module Adjustments == + reassignThreshold = 0, + mergeCutHeight = params$mergeCutHeight, + + # == TOM == Archive the run results in TOM file (saves time) + # saveTOMs = T, + # saveTOMFileBase = "ER", + + # == Output Options + numericLabels = T, + verbose = F) + +# Write down main WGCNA results object to file +readr::write_rds(bwnet, + file = "WGCNA_results.RDS") +``` + +## Eigengene module dataframe + +A module eigengene is the standardized gene expression profile for a given module. An eigengene is the gene whose expression is representative of the majority of the genes within a module. + +```{r, message=FALSE, warning=FALSE} +module_eigengens <- bwnet$MEs +datatable(module_eigengens, options = list(pageLength =10, scrollX=TRUE)) + +``` + +## Dendogram for the modules + +```{r, fig.align='center', fig.width=12, message=FALSE, warning=FALSE} +# Convert labels to colors for plotting +mergedColors =labels2colors(bwnet$colors) +# Plot the dendrogram and the module colors underneath +plotDendroAndColors(bwnet$dendrograms[[1]], + mergedColors[bwnet$blockGenes[[1]]], + "Module colors", + dendroLabels = FALSE, + hang = 0.03, + addGuide = TRUE, + guideHang = 0.05) + +``` + +### Relating modules (Clusters) to geneIds. +GeneIds with their module color and number are given on the table below. + +```{r, message=FALSE, warning=FALSE} +# Relating modules (Clusters) to genotypes +module_df <- data.frame( + gene_id = names(bwnet$colors), + module_no = bwnet$colors, + colors = labels2colors(bwnet$colors) + +) +datatable(module_df, rownames = FALSE) +# lets write out the file lising the genes and their modules +write_delim(module_df, file = "List_of_genes_with_modules.txt", delim = "\t") +``` + +## Relating modules with genotypes + +WGCNA calcualtes and Eigengene (hypothetical central gene) for each module which makes it easier to associate sample groups with module cluster. + +```{r, message=FALSE, warning=FALSE} +# Get Module Eigengens per cluster +MEs0 <- moduleEigengenes(normalized_counts, mergedColors)$eigengenes + +# Reorder modules so similar modules are next to each other +MEs0 <- orderMEs(MEs0) +module_order = names(MEs0) %>% gsub("ME","", .) + +# Add sample names +MEs0$samples = row.names(MEs0) + +# tidy the data +mME = MEs0 %>% + pivot_longer(-samples) %>% + mutate( + name = gsub("ME", "", name), + name = factor(name, levels = module_order) + ) +``` + +```{r, fig.align='center', fig.height=8, message=FALSE, warning=FALSE} +mME %>% ggplot(., aes(x=samples, y=name, fill=value)) + + geom_tile() + + theme_bw() + + scale_fill_gradient2( + low = "blue", + high = "red3", + mid = "white", + midpoint = 0, + limit = c(-1,1)) + + theme(axis.text.x = element_text(angle=90)) + + labs(title = "Module-trait Relationships", y = "Modules", fill="corr") + +``` + +On the heatmap above, samples are on the x-axis and different modules on y-axis. +We can visually inspect this heatmap to understand how different modules are correlated to different sample groups. + +We can also use another approach to extract only the modules with biggest differences and then correlate those modules for conditions. + +## Modules with biggest differences across treatments +We will create a design matrix using `sample_type` in our metadata and run linear model on each module. + +```{r, message=FALSE, warning=FALSE} +# to confirm our eigengens relate to our metadata from deseq object run the line below +#all.equal(colnames(dds), rownames(module_eigengens)) + +# Create a design matrix from the genotype variable +des_mat <- model.matrix(~ coldata$sample_type) + + +# lmFit() needs a transposed version of the matrix +fit <- limma::lmFit(t(module_eigengens), design = des_mat) + +# Apply empirical Bayes to smooth standard errors +fit <- limma::eBayes(fit) + +# Apply multiple testing correction and obtain stats +stats_df <- limma::topTable(fit, number = ncol(module_eigengens)) %>% + tibble::rownames_to_column("module") + +datatable(stats_df, options = list(pageLength =10, scrollX=TRUE)) +``` + +Modules with highest padj values are the ones have bigger differences between sample_groups. Look for these modules in the heatmap above. + +# Conclusions + + + +# Methods + +RNA-seq counts were generated by the nf-core rnaseq pipeline [version] using Salmon (Patro et al. 2017). Downstream analyses were performed using `r version$version.string`. Counts were imported into R using DESeq2 version `r packageVersion("DESeq2")` (Love, Huber, and Anders 2014). Gene annotations were obtained from Ensembl. Plots were generated by ggplot2 (Wickham 2016). Heatmaps were generated by pheatmap (Kolde 2019). + +## R package references + +```{r citations} +citation("WGCNA") +citation("DESeq2") +citation("ggplot2") +``` + +## R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/04_gene_patterns/params_de-example.R b/inst/templates/rnaseq/04_gene_patterns/params_de-example.R new file mode 100644 index 0000000..cc75ad2 --- /dev/null +++ b/inst/templates/rnaseq/04_gene_patterns/params_de-example.R @@ -0,0 +1,18 @@ +# project params +date = "YYYYMMDD" +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# Example data +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' +se_object = NA diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md index c984029..f844175 100644 --- a/inst/templates/rnaseq/README.md +++ b/inst/templates/rnaseq/README.md @@ -15,15 +15,15 @@ We recommend to use the `samplesheet.csv` used with nf-core as metadata file, wh - `params*example.R` are parameters pointing to test data to be used as an example to test the reports. - `run_markdown.R` is an example of code to run the Rmd with different parameters. -### Quality assesment +### Quality assessment -`00_quality_assesment/QC.Rmd` is a template report that needs `params_qc.R` for `nf-core/rnaseq` outputs. +`01_quality_assessment/QC.Rmd` is a template report that needs `params_qc.R` for `nf-core/rnaseq` outputs. Follow instruction in the R and Rmd scripts to render it. ### Differential expression -`01_differential_expression/DEG.Rmd` is a template for comparison between two groups. `params_de.R` has the information for the input files to load. You need to point to `nf-core/rnaseq` output files. +`02_differential_expression/DEG.Rmd` is a template for comparison between two groups. `params_de.R` has the information for the input files to load. You need to point to `nf-core/rnaseq` output files. On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: @@ -34,8 +34,18 @@ On the `YAML` header file of the `Rmd` you can specify some parameters or just s - Pathway analysis: Over-Representation Analysis and Gene-Set-Enrichment Analysis - Tables -### Complementary template reports +### Comparative analysis -- `03_functional/GSVA.Rmd` shows an example on how to use [GSVA package](https://bioconductor.org/packages/release/bioc/html/GSVA.html) for estimating variation of gene set enrichment through the samples of a expression data set - `03_comparative/Pair-wise-comparison-analysis.Rmd` shows an exmaple on how to compare two differential expression analysis from the `DEG.Rmd` template. - `03_comparative/Intersections.Rmd` shows an example on how to compare multiple differential expression analyses from `DEG.Rmd` and find intersections. + +### Functional analysis + +- `03_functional/GSVA.Rmd` shows an example on how to use [GSVA package](https://bioconductor.org/packages/release/bioc/html/GSVA.html) for estimating variation of gene set enrichment through the samples of a expression data set +- `03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd` shows an example on how to run Gene Ontology over-representation, KEGG over-representation, and KEGG gene set enrichment analysis (GSEA) for non-model organisms using data from Uniprot. `params_nonmodel_org_pathways.R` has the information for the input files to load. +- `03_functional/Immune-deconvolution.Rmd` shows an example on how to run immune cell type deconvolution. `params_immune_deconv.R` has the information for the input files to load. + +### Gene pattern analysis + +- `04_gene_patterns/WGCNA.Rmd` shows an example on how to use the [WGCNA](https://cran.r-project.org/web/packages/WGCNA/index.html) package to find gene modules in the gene expression data. + diff --git a/inst/templates/singlecell/01_quality_assessment/scATAC_QC.Rmd b/inst/templates/singlecell/01_quality_assessment/scATAC_QC.Rmd new file mode 100644 index 0000000..f4d6fd6 --- /dev/null +++ b/inst/templates/singlecell/01_quality_assessment/scATAC_QC.Rmd @@ -0,0 +1,367 @@ +--- +title: "Quality Control for scATAC-Seq" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +params: + max_PRF: !r Inf + min_PRF: 4200 + max_FRiP: !r Inf + min_FRiP: 40 + min_TSS: 2 + max_NS: 2 + max_blacklistratio: 0.02 + data_dir: !r file.path("data") + results_dir: !r file.path("results") + project_file: ../information.R + #params_file: params_qc.R +--- + + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +if (compareVersion(R.version$minor,"3.1")<0) warning("We recommend >= R4.3.1") +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + +```{r source_params, echo = F} +metadata_fn='' +se_object='' +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(Seurat) +library(Signac) +library(tidyverse) +library(ggplot2) +library(ggrepel) +library(cowplot) +library(kableExtra) +library(qs) +library(bcbioR) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r subchunkify, echo=FALSE, eval=FALSE} +#' Create sub-chunks for plots +#' +#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots +#' +#' @param pl a plot object +#' @param fig.height figure height +#' @param fig.width figure width +#' @param chunk_name name of the chunk +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +subchunkify = function(pl, + fig.height = 7, + fig.width = 5, + chunk_name = 'plot') { + pl_deparsed = paste0(deparse(function() { + pl + }), collapse = '') + + sub_chunk = paste0( + "```{r ", + chunk_name, + ", fig.height=", + fig.height, + ", fig.width=", + fig.width, + ", dpi=72", + ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", + "\n(", + pl_deparsed, + ")()", + "\n```" + ) + + cat(knitr::knit( + text = knitr::knit_expand(text = sub_chunk), + quiet = TRUE + )) +} +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + +```{r not_in} +`%!in%` = Negate(`%in%`) +``` + + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + + +# Samples and metadata + +```{r load_metadata} +meta_df=read_csv(metadata_fn) %>% mutate(sample = tolower(description)) %>% + dplyr::select(-description) + +ggplot(meta_df, aes(sample_type, fill = sample_type)) + + geom_bar() + ylab("") + xlab("") + + scale_fill_cb_friendly() +``` + + +```{r show-metadata} +se <- readRDS(se_object) #local + + +metrics <- metadata(se)$metrics %>% + full_join(meta_df , by = c("sample" = "sample")) + +meta_sm <- meta_df %>% + as.data.frame() %>% + column_to_rownames("sample") + +meta_sm %>% sanitize_datatable() + +``` + +```{r create_or_read_seurat_obj} +filename <- 'data/scATAC_seurat.rds' +if (!file.exists(filename)){ + +}else{ + seurat <- readRDS(filename) +} +``` + + +# ATAC-Seq {.tabset} + +We use some of the results from cellranger outputs and the peaks called using MACS2 for QCing the scATAC-Seq data. + +## Read counts per cell + +```{r warning=FALSE, message=FALSE, results='asis', fig.width=12} +VlnPlot(seurat, features = "nCount_MACS2peaks", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("Reads") + +cat('\n\n') +``` +## Detected peaks per cell + +```{r warning=FALSE, message=FALSE, fig.width=12} +VlnPlot(seurat, features = "nFeature_MACS2peaks", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("UMI") + +# VlnPlot(seurat, features = "nFeature_CellRangerPeaks", ncol = 1, pt.size = 0) + +# scale_fill_cb_friendly() + +# xlab("") + +# ylab("UMI") +``` + +## QC metrics {.tabset} + +### Total number of fragments in peaks {.tabset} + +This metric represents the total number of fragments (= reads) mapping within a region of the genome that is predicted to be accessible (= a peak). It's a measure of cellular sequencing depth / complexity. Cells with very few reads may need to be excluded due to low sequencing depth. Cells with extremely high levels may represent doublets, nuclei clumps, or other artefacts. + + +```{r results='asis', warning=FALSE, message=FALSE, fig.width=12, fig.height=8} +DefaultAssay(seurat) <- "MACS2peaks" + +cat('#### Histogram \n\n') +seurat@meta.data %>% + ggplot(aes(x = atac_peak_region_fragments)) + + geom_histogram() + + facet_wrap(~sample, scale = 'free') + + geom_vline(xintercept = params$min_PRF) +``` + + +```{r results='asis', warning=FALSE, message=FALSE, fig.width=12} +cat('\n#### Violin plot\n\n') +VlnPlot( + object = seurat, + features = 'atac_peak_region_fragments', + pt.size = 0 +) + ylab('Total number of fragments in peaks') + + NoLegend() + + geom_hline(yintercept = params$min_PRF) + xlab("") + + scale_fill_cb_friendly() + +cat('\n\n') +``` + +### Fraction of fragments in peaks + +It represents the fraction of all fragments that fall within ATAC-seq peaks. Cells with low values (i.e. <15-20%) often represent low-quality cells or technical artifacts that should be removed. Note that this value can be sensitive to the set of peaks used. + +```{r results='asis'} +VlnPlot( + object = seurat, + features = 'pct_reads_in_peaks', + pt.size = 0 +) + NoLegend() + xlab("") + + scale_fill_cb_friendly() + + geom_hline(yintercept = params$min_FRiP) + +cat('\n\n') +``` + + +### Transcriptional start site (TSS) enrichment score {.tabset} + +The ENCODE project has defined an ATAC-seq targeting score based on the ratio of fragments centered at the TSS to fragments in TSS-flanking regions (see https://www.encodeproject.org/data-standards/terms/). Poor ATAC-seq experiments typically will have a low TSS enrichment score. We can compute this metric for each cell with the TSSEnrichment() function, and the results are stored in metadata under the column name TSS.enrichment. + + +```{r results='asis'} +VlnPlot( + object = seurat, + features = 'TSS.enrichment', + pt.size = 0 +) + scale_fill_cb_friendly() + NoLegend() + xlab("") +cat('\n\n') +``` +The following tabs show the TSS enrichment score distribution for each sample. Cells with high-quality ATAC-seq data should show a clear peak in reads at the TSS, with a decay the further we get from it. + +Each plot is split between cells with a high or low global TSS enrichment score (cuffoff at `r params$min_TSS`), to double-check whether cells with lowest enrichment scores still follow the expected pattern or rather need to be excluded. + +```{r results='asis'} +seurat$TSS.group <- ifelse(seurat$TSS.enrichment > params$min_TSS, "High", "Low") +Idents(seurat) <- 'sample' +for (sample in levels(seurat$sample)){ + cat('####', sample, '\n\n') + p <- TSSPlot(subset(x = seurat, idents = sample), group.by = "TSS.group") + NoLegend() + print(p) + cat('\n\n') +} +``` + +### Nucleosome signal {.tabset} + +The histogram of DNA fragment sizes (determined from the paired-end sequencing reads) should exhibit a strong nucleosome banding pattern corresponding to the length of DNA wrapped around a single nucleosome, i.e peaks at approximately 100bp (nucleosome-free), and mono, di and tri nucleosome-bound peaks at 200, 400 and 600bp. We calculate this per single cell, and quantify the approximate ratio of mononucleosomal to nucleosome-free fragments (stored as nucleosome_signal). Cells with lower nucleosome signal have a higher ratio of nucleosome-free fragments. + +```{r warning = FALSE, results='asis'} +seurat$nucleosome_group <- ifelse(seurat$nucleosome_signal > 1, "high NS", "low NS") +for (sample in levels(seurat$sample)){ + cat('####', sample, '\n\n') + p <- FragmentHistogram(seurat, group.by = 'nucleosome_group', cells = colnames(seurat[, seurat$sample == sample])) + print(p) + cat('\n\n') +} +# FragmentHistogram(seurat, group.by = 'nucleosome_group', cells = colnames(seurat[, seurat$sample == 'Control'])) +``` + +```{r fig.width=12} +VlnPlot( + object = seurat, + features = 'nucleosome_signal', + pt.size = 0 +) + scale_fill_cb_friendly() + +NoLegend() + xlab("") +``` + + + + +### Blacklist ratio + +tIt's he ratio of reads in genomic blacklist regions. The [ENCODE project](https://www.encodeproject.org/) has provided a list of [blacklist regions](https://github.com/Boyle-Lab/Blacklist), representing reads which are often associated with artefactual signal. Cells with a high proportion of reads mapping to these areas (compared to reads mapping to peaks) often represent technical artifacts and should be removed. ENCODE blacklist regions for human (hg19 and GRCh38), mouse (mm10), Drosophila (dm3), and C. elegans (ce10) are included in the Signac package. **Peaks overlapping with the balcklist regions were removed in the analysis, so we don't show blacklist fraction here**. + +Line is drawn at `r params$max_blacklistratio`. + +```{r fig.width=12} +VlnPlot( + object = seurat, + features = 'blacklist_fraction', + pt.size = 0 +) + scale_fill_cb_friendly() + + NoLegend() + + geom_hline(yintercept = params$max_blacklistratio) + +``` + + +## Normalization, linear dimensional reduction, and clustering + +* Normalization: Signac performs term frequency-inverse document frequency (TF-IDF) normalization. This is a two-step normalization procedure, that both normalizes across cells to correct for differences in cellular sequencing depth, and across peaks to give higher values to more rare peaks. + +* Feature selection: The low dynamic range of scATAC-seq data makes it challenging to perform variable feature selection, as we do for scRNA-seq. Instead, we can choose to use only the top n% of features (peaks) for dimensional reduction, or remove features present in less than n cells with the FindTopFeatures() function. Here, we will use all features, though we note that we see very similar results when using only a subset of features (try setting min.cutoff to ‘q75’ to use the top 25% all peaks), with faster runtimes. Features used for dimensional reduction are automatically set as VariableFeatures() for the Seurat object by this function. + +* Dimension reduction: We next run singular value decomposition (SVD) on the TD-IDF matrix, using the features (peaks) selected above. This returns a reduced dimension representation of the object (for users who are more familiar with scRNA-seq, you can think of this as analogous to the output of PCA). + +The combined steps of TF-IDF followed by SVD are known as latent semantic indexing (LSI), and were first introduced for the analysis of scATAC-seq data by [Cusanovich et al. 2015.](https://www.science.org/doi/10.1126/science.aax6234) + +The first LSI component often captures sequencing depth (techni ccal variation) rather than biological variation. If this is the case, the component should be removed from downstream analysis. We can assess the correlation between each LSI component and sequencing depth using the DepthCor() function (see below). +Here we see there is a very strong correlation between the first LSI component and the total number of counts for the cell, so we will perform downstream steps without this component. + +```{r results='asis'} +DepthCor(seurat, assay = 'MACS2peaks') +cat('\n\n') +``` + +## UMAP plots + +```{r results='asis'} +DimPlot(object = seurat, group.by = 'sample', reduction = "umapATAC") + + scale_color_cb_friendly() + +cat('\n\n') +``` + + + + + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/01_quality_assessment/scRNA_QC.rmd similarity index 100% rename from inst/templates/singlecell/QC/QC.rmd rename to inst/templates/singlecell/01_quality_assessment/scRNA_QC.rmd diff --git a/inst/templates/singlecell/02_differential_expression/MAST_analysis.R b/inst/templates/singlecell/02_differential_expression/MAST_analysis.R new file mode 100644 index 0000000..7a21c49 --- /dev/null +++ b/inst/templates/singlecell/02_differential_expression/MAST_analysis.R @@ -0,0 +1,137 @@ +##### library loading ##### +library(Seurat) +library(apeglm) +library(SummarizedExperiment) +library(SingleCellExperiment) +library(MAST) +library(tidyverse) +library(data.table) +library(lme4) +library(R.utils) +library(glue) +library(optparse) +##### parameter parse ##### +options(stringsAsFactors = F) +option_list = list(make_option("--seurat_obj", default = "https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds"), + make_option("--resolution_column", default = "integrated_snn_res.0.4"), + make_option("--cluster_name", default = "2"), + make_option("--contrast", default = "age"), + make_option("--outputDir", default = "out") +) +args = parse_args(OptionParser(option_list = option_list)) + +invisible(list2env(args,environment())) +column <- contrast +system(glue("mkdir -p {outputDir}")) + + +message("[Preparing inputs for MAST modeling]") +##### Read in provided seurat ##### +if (isUrl(seurat_obj)){ + seurat <- readRDS(url(seurat_obj)) +}else{ + seurat <- readRDS(seurat_obj) +} + +message("Input seurat object: ",seurat_obj) +DefaultAssay(seurat) <- "RNA" +message("RNA is set as the default assay") +message("Column name of clustering to use: ",resolution_column) +Idents(object = seurat) <- resolution_column +message("Subset original seurat to be only cluster ",cluster_name," for faster computing!") +data_subset <- subset(x = seurat, idents = cluster_name) +existintLayers <- Layers(data_subset[["RNA"]]) +if(existintLayers!="counts"){ + print("Not only counts excisted as layers in this object") + print("Make sure your default slot is counts and it is raw counts") +} +##### Start from raw count for MAST ##### +message("Natural log of raw counts with pseudobulk 1 used for MAST modeling") +sce <- as.SingleCellExperiment(data_subset) +##### Log-Normalize Seurat for visualization later ##### +message("Total counts normalization and log1p transformation done to raw counts") +message("New layer Data added to the seurat object for visualization later!") +data_subset <- NormalizeData( + data_subset, + assay = "RNA", + normalization.method = "LogNormalize", + scale.factor = 10000, + margin = 1) +saveRDS(data_subset,glue("{outputDir}/processed_seurat.rds")) +##### Continue MAST input prep ##### +assay(sce, "log") = log(counts(sce) + 1) +# Scaling ngenes +cdr = colSums(assay(sce, "log")>0) +colData(sce)$cngeneson = scale(cdr) + +# Create new sce object (only 'log' count data) +sce.1 = SingleCellExperiment(assays = list(log = assay(sce, "log"))) +colData(sce.1) = colData(sce) + +#change to sca +sca = SceToSingleCellAssay(sce.1) + +message("Subset genes observed in at least 10% of cells") + +expressed_genes <- freq(sca) > 0.1 +sca_filtered <- sca[expressed_genes, ] + +cdr2 <- colSums(SummarizedExperiment::assay(sca_filtered)>0) + +SummarizedExperiment::colData(sca_filtered)$ngeneson <- scale(cdr2) +SummarizedExperiment::colData(sca_filtered)$orig.ident <- + factor(SummarizedExperiment::colData(sca_filtered)$orig.ident) +SummarizedExperiment::colData(sca_filtered)[[column]] <- + factor(SummarizedExperiment::colData(sca_filtered)[[column]]) + +##### MAST modeling ##### +message("[MAST modeling with supplied contrasts]") + +message("Note: this step is time-consuming!") + +comp_name <- levels(SummarizedExperiment::colData(sca_filtered)[[column]])[2] +lrt_name <- paste0(column, comp_name) +formula_touse <- as.formula(paste0("~ ngeneson + (1 | orig.ident) + ", column)) + +zlmCond <- suppressMessages(MAST::zlm(formula_touse, sca_filtered, method='glmer', + ebayes = F,strictConvergence = FALSE)) +summaryCond_column <- suppressMessages(MAST::summary(zlmCond,doLRT=lrt_name)) + +##### MAST outputs ##### +message("[Main MAST computation done, result outputs]") + +summary_cond_file = paste0(outputDir,"/MAST_RESULTS_",cluster_name,"_", column, ".rds") +saveRDS(summaryCond_column, file = summary_cond_file) + +message("Full MAST object saved to file ", summary_cond_file) + + +summaryDt_column <- summaryCond_column$datatable +fcHurdle_column <- merge(summaryDt_column[contrast == lrt_name & component == 'H', + .(primerid, `Pr(>Chisq)`)], + # This extracts hurdle p-values + summaryDt_column[contrast == lrt_name & component == 'logFC', + .(primerid, coef, ci.hi, ci.lo)], + # This extract LogFC data + by = 'primerid') +fcHurdle_column <- stats::na.omit(as.data.frame(fcHurdle_column)) +fcHurdle_column$fdr <- p.adjust(fcHurdle_column$`Pr(>Chisq)`, 'fdr') +to_save_column <- fcHurdle_column + +full_res_file = paste0(outputDir,"/FULL_MAST_RESULTS_",cluster_name,"_", column, ".csv") +write.table(to_save_column, file=full_res_file, row.names = FALSE, sep=",") + + +message("MAST summary results output to csv files") + + +fcHurdleSig_column <- merge(fcHurdle_column[fcHurdle_column$fdr < .05,], + as.data.table(mcols(sca_filtered)), + by = 'primerid') + +setorder(fcHurdleSig_column, fdr) + +sig_res_file = paste0(outputDir,"/SIG_MAST_RESULTS_padj<0.05_",cluster_name,"_", column, ".csv") +write.table(fcHurdleSig_column, file=sig_res_file, row.names = FALSE, sep=",") + +message("Significant MAST summary results output to csv files") diff --git a/inst/templates/singlecell/02_differential_expression/scRNA_MAST.Rmd b/inst/templates/singlecell/02_differential_expression/scRNA_MAST.Rmd new file mode 100644 index 0000000..3aa6a86 --- /dev/null +++ b/inst/templates/singlecell/02_differential_expression/scRNA_MAST.Rmd @@ -0,0 +1,391 @@ +--- +title: "scRNA MAST" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + eval: false + toc: true + toc_float: + collapsed: true + smooth_scroll: true +# editor_options: +# chunk_output_type: inline +params: + project_file: ../information.R + seurat_obj: "https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds" + column: "age" + contrasts: !r list(c("age", "YOUNG", "OLD")) + cluster_name: "2" + resolution_column: "integrated_snn_res.0.4" + +--- + +Template developed with materials from https://hbctraining.github.io/main/. + + +```{r, message=FALSE, warning=FALSE,eval=T} +library(tidyverse) +stopifnot(R.version$major>= 4) # requires R4 +if (compareVersion(R.version$minor,"3.1")<0){ + warning("We recommend >= R4.3.1")} +stopifnot(compareVersion(as.character(BiocManager::version()), "3.16")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) +``` + + +```{r setup, cache=FALSE,message=FALSE,echo=FALSE,eval=T} +# Load libraries' +library(Seurat) +library(tidyverse) +library(data.table) +library(ggprism) +library(knitr) +library(EnhancedVolcano) +library(pheatmap) +library(viridisLite) +library(viridis) +library(ggpubr) + + +ggplot2::theme_set(theme_prism(base_size = 12)) +scale_colour_discrete <- function(...) + scale_colour_manual(..., values = as.vector(grafify:::graf_palettes[["kelly"]])) + +set.seed(1454944673L) +opts_chunk[["set"]]( + audodep = TRUE, + cache = FALSE, + cache.lazy = FALSE, + error = TRUE, + echo = F, + eval=F, + fig.height = 6, + fig.retina = 2L, + fig.width = 6, + message = FALSE, + tidy = F, + warning = F +) + +invisible(list2env(params,environment())) +source(project_file) +grpmean <- function(grp,mat){ + if(ncol(mat)==length(grp)){ + mat <- t(mat) + }else{ + stopifnot(nrow(mat)==length(grp)) + } + mat = mat[names(grp),] + grpsum = rowsum(mat,grp,na.rm=T) + sorted_grpsize = table(grp)[rownames(grpsum)] + meangrp = grpsum/as.numeric(sorted_grpsize) + return(t(meangrp)) +} +colorize <- function(x, color) { + if (knitr::is_latex_output()) { + sprintf("\\textcolor{%s}{%s}", color, x) + } else if (knitr::is_html_output()) { + sprintf("%s", color, + x) + } else x +} +``` + + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + + + +# MAST analysis + +**Pre-processing** + +**Subset genes observed in at least 10% of cells** ... + +**Prep for MAST comparison** ... + +**MAST run with desired contrasts ** ... + +**Output results ** ... + +[Full MAST object saved] + +[MAST summary results output to csv files] + +[Significant MAST summary results output to csv files] + +## Differentially Expressed Genes Table + +```{r load in pre-computed outputs,eval=T,cache=F} +topn=16 +data_subset <- readRDS("out/processed_seurat.rds") +MAST_Result <- fread("out/FULL_MAST_RESULTS_2_age.csv") +top_DEG <- MAST_Result %>% + slice_min(order_by = fdr, n = topn, with_ties = F) %>% + pull(primerid) +geneMat <- GetAssayData(data_subset, slot = "data", assay = "RNA") +data_subset$contrast_grp <-paste0(data_subset@meta.data[[column]], + "_",data_subset$orig.ident) +cellGrp <- setNames(data_subset$contrast_grp,colnames(geneMat)) +grp_mean <- grpmean(cellGrp,geneMat) +``` + + +**[Option 1: numbers displayed in digits]** + +```{r,eval=T} +MAST_Result %>% + arrange(fdr) %>% + select(Gene = primerid,Estimate=coef,FDR = fdr) %>% + mutate(Estimate=sprintf(Estimate,fmt = '%.2f'), + FDR =sprintf(FDR,fmt = '%.4f')) %>% + DT::datatable() +``` + +**[Option 2: FDR displayed in scientific notation]** + +```{r,eval=T} +MAST_Result %>% + arrange(fdr) %>% + select(Gene = primerid,Estimate=coef,FDR = fdr) %>% + mutate(Estimate=sprintf(Estimate,fmt = '%.2f'), + FDR = formatC(FDR, format = "e", digits = 1)) %>% + DT::datatable() +``` + + +## Group Mean Heatmap {.tabset} + +```{r group mean heatmap,fig.height=6,fig.width=8,results='asis',eval=T} + +cat("### ", column, "\n\n") +grp_mean <- grp_mean[rowSums(grp_mean)>0,] +p1 <- pheatmap(grp_mean, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 10, + angle_col = 0) +print(p1) +cat("\n\n") +``` + + + +## Volcano plot and Top DEG {.tabset} + +This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. + +The points highlighted: + +- in `r colorize("grey", "darkgrey")` points are non-significant. +- in `r colorize("blue", "lightblue")` have a padj > 0.05 and a log2-fold change > 0.5. +- in `r colorize("plum", "plum1")` have a padj < 0.05 and a log2-fold change < 0.5. +- in `r colorize("purple", "purple")` are genes that have fdr < 0.05 and a log2-fold change > 0.5. + + +The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. + +```{r volcano_plot,fig.height=10,fig.width=13,results='asis',eval=T} +cat("### ", column, "\n\n") +yrange <- c(-0.2,max(-log10(MAST_Result$fdr))+2) +max_effect <- max(2,round(max(MAST_Result$coef))) +min_effect <- min(-2,round(min(MAST_Result$coef))) +xrange <- c(min_effect,max_effect) +p1 <- EnhancedVolcano(MAST_Result, + ylim=yrange, + xlim= xrange, + lab= MAST_Result$primerid, + selectLab = top_DEG, + # cut-offs + pCutoff = 0.05, + FCcutoff = 0.5, + # axis, title + x = 'coef', + y = 'fdr', + title="", + subtitle = "", + # label + drawConnectors = T, + labSize = 7, + labCol = 'black', + labFace = 'bold', + # color + colAlpha = 4/5, + # legend + legendPosition = 'right', + legendLabSize = 16, + legendIconSize = 5.0, + boxedLabels = TRUE, + max.overlaps = Inf, + col=c("darkgrey", "lightblue", "plum1", "purple")) +print(p1) +cat("\n\n") +``` + +## % of expression for Top DEGs {.tabset} + +```{r sc-heatmap seurat,fig.height=5,fig.width=10,eval=T} +DotPlot(data_subset, features = top_DEG, + group.by = "contrast_grp",assay = "RNA") + + RotatedAxis()+ + geom_point(aes(size=pct.exp), shape = 21, colour="black", stroke=0.5) + + scale_colour_viridis(option="magma") + + guides(size=guide_legend(override.aes=list(shape=21, colour="black", fill="white")))+ + labs(x="",y="") +``` + +## Plain sc-expression across contrast groups: top `r topn` DEGs {.tabset} + +```{r sc-expr for all DEG1,fig.height=5,fig.width=10,eval=T,results='asis'} +top_n_exp <- t(geneMat[top_DEG,]) %>% + as.data.frame() %>% + mutate(contrast_grp = cellGrp[colnames(geneMat)]) +for (g in top_DEG){ + cat("### ",g, "\n\n") + p1=ggplot(top_n_exp %>% dplyr::rename(val=g), aes(x = contrast_grp, y =val)) + + geom_boxplot(width=0.1,outliers = F)+ + geom_violin(position=position_dodge(1),alpha=0.5, + na.rm=TRUE,trim=FALSE)+ + ggbeeswarm::geom_quasirandom(na.rm=TRUE,dodge.width=0.5, + method='quasirandom')+ + theme_minimal() + + theme( + axis.text.x = element_text(size=rel(1.5),face="bold"), + plot.title = element_text(hjust = 0.5), + legend.position = "none" + )+ + labs(x="",y="log1p(Sum Normalized Counts)") + print(p1) + cat("\n\n") +} +``` + + + +## Colored sc-expression across contrast groups: top `r topn` DEGs {.tabset} + +```{r sc-expr for all DEG2,fig.height=5,fig.width=10,eval=T,results='asis'} +relSize=1.5 +for (g in top_DEG){ + cat("### ",g, "\n\n") + p1=ggplot(top_n_exp %>% dplyr::rename(val=g), + aes(x = contrast_grp,y =val,fill=contrast_grp))+ + scale_fill_viridis_d(option = "D")+ + geom_violin(alpha=0.5, position = position_dodge(width = .75),size=1,color=NA) + + geom_boxplot(notch = F, outlier.size = -1, color="black", + lwd=1, alpha = 0.7,show.legend = F,width=0.1)+ + ggbeeswarm::geom_quasirandom(shape = 21,size=2, dodge.width = .75, + color="black", + alpha=.8,show.legend = F)+ + theme_minimal() + + rremove("legend.title")+ + theme( + axis.line = element_line(colour = "black",size=1), + axis.ticks = element_line(size=1,color="black"), + axis.text = element_text(color="black"), + axis.ticks.length=unit(0.2,"cm"), + strip.text = element_text(size = rel(relSize)), + legend.position = c(0.92, 0.9))+ + font("xylab",size=rel(relSize))+ + font("xy",size=rel(relSize))+ + font("xy.text", size = rel(relSize)) + + font("legend.text",size = rel(relSize*0.7))+ + guides(fill = guide_legend(override.aes =list(alpha = 1,color="black")))+ + labs(x="",y="log1p(Sum Normalized Counts)") + print(p1) + cat("\n\n") +} +``` + + +## Dot styles: sc-expression across contrast groups {.tabset} + +```{r sc-expr for all DEG3,fig.height=5,fig.width=10,eval=T,results='asis'} +cat("### ","quasirandom", "\n\n") +p1=ggplot(top_n_exp, aes_string(x = "contrast_grp", y = "PERP")) + + geom_boxplot(width=0.1,outliers = F)+ + geom_violin(position=position_dodge(1),alpha=0.5, na.rm=TRUE,trim=FALSE)+ + ggbeeswarm::geom_quasirandom(na.rm=TRUE,dodge.width=0.5,method='quasirandom')+ + theme_minimal() + + theme( + axis.text.x = element_text(size=rel(1.5),face="bold"), + plot.title = element_text(hjust = 0.5), + legend.position = "none" + )+ + labs(x="",y="log1p(Sum Normalized Counts)") +print(p1) +cat("\n\n") +``` + + +```{r sc-expr plot2,fig.height=5,fig.width=10,eval=T,results='asis'} +cat("### ", "dotplot", "\n\n") +p1 <- ggplot(top_n_exp, aes_string(x = "contrast_grp", y = g)) + + geom_boxplot(width=0.1,outliers = F)+ + geom_violin(position=position_dodge(1),alpha=0.5, na.rm=TRUE,trim=FALSE)+ + geom_dotplot(binaxis="y",stackdir='center',binwidth = 0.01)+ + theme_minimal() + + theme( + axis.text.x = element_text(size=rel(1.5),face="bold"), + plot.title = element_text(hjust = 0.5), + legend.position = "none" + )+ + labs(x="",y="log1p(Sum Normalized Counts)") +print(p1) +cat("\n\n") +``` + +```{r sc-expr plot3,fig.height=5,fig.width=10,eval=T,results='asis'} +cat("### ","jitter" , "\n\n") +p1 <- ggplot(top_n_exp, aes_string(x = "contrast_grp", y = g)) + + geom_boxplot(width=0.1,outliers = F)+ + geom_violin(position=position_dodge(1),alpha=0.5, na.rm=TRUE,trim=FALSE)+ + geom_jitter(width = 0.2)+ + theme_minimal() + + theme( + axis.text.x = element_text(size=rel(1.5),face="bold"), + plot.title = element_text(hjust = 0.5), + legend.position = "none" + )+ + labs(x="",y="log1p(Sum Normalized Counts)") +print(p1) +cat("\n\n") +``` + + + +# Method description + +MAST is used to generate differential expressed genes among the supplied contrast groups by taking into account the number of genes activated in every cell, also other batch information (e.g. different samples in the suppliedd `seurat` object). + +Please read more about MAST in its [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0844-5) and [github repository](https://github.com/RGLab/MAST). + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` + diff --git a/inst/templates/singlecell/differential_expression/scRNA_pseudobulk.Rmd b/inst/templates/singlecell/02_differential_expression/scRNA_pseudobulk.Rmd similarity index 99% rename from inst/templates/singlecell/differential_expression/scRNA_pseudobulk.Rmd rename to inst/templates/singlecell/02_differential_expression/scRNA_pseudobulk.Rmd index 5dfac30..3848914 100644 --- a/inst/templates/singlecell/differential_expression/scRNA_pseudobulk.Rmd +++ b/inst/templates/singlecell/02_differential_expression/scRNA_pseudobulk.Rmd @@ -14,10 +14,7 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: - project_file: ../information.R seurat_obj: "https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds" column: "age" diff --git a/inst/templates/singlecell/Integration/norm_integration.rmd b/inst/templates/singlecell/03_Integration/norm_integration.rmd similarity index 99% rename from inst/templates/singlecell/Integration/norm_integration.rmd rename to inst/templates/singlecell/03_Integration/norm_integration.rmd index 0890acf..80b575c 100644 --- a/inst/templates/singlecell/Integration/norm_integration.rmd +++ b/inst/templates/singlecell/03_Integration/norm_integration.rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: inline params: project_file: ../information.R --- diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md index cdc9868..23d21bf 100644 --- a/inst/templates/singlecell/README.md +++ b/inst/templates/singlecell/README.md @@ -6,15 +6,23 @@ Make sure there is a valid project name, and modify `information.R` with the rig `pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. -# QC +# Quality Assessment -Currently we are working on deploying a shiny app to inspect the single cell object and find the best cut-offs for filtering. The Rmd that helps to visualize the before and after is `QC.Rmd`. +## scATAC + +The Rmd that helps to visualize ATAC metrics is `scATAC_QC.Rmd`. + +## scRNA + +Currently we are working on deploying a shiny app to inspect the single cell object and find the best cut-offs for filtering. The Rmd that helps to visualize the before and after is `scRNA_QC.Rmd`. # Integration `Integration/norm_integration.rmd` is a template with guidelines on how to work with multiple samples. It compares log2norm vs SCT, work with SCT by samples to remove batch biases better, provide options for integration between CCA and Harmony. As last step, it contains cell type clustering and visualization to help decide the best parameters. -# differential_expression +# Differential expression + +## pseudobulk approach `scRNA_pseudobulk.Rmd` is a template that performs pseudobulk differential expression analysis using DESeq2. It takes as input: @@ -25,3 +33,35 @@ Currently we are working on deploying a shiny app to inspect the single cell obj - the name/number of the cluster of interest The template aggregates single cell counts to the sample x cluster x factor_of_interest pseudobulk level, subsets to the cluster of interest, performs the desired statistical comparisons, and visualizes the results. + +## MAST - single cell approach + +`differential_expression/scRNA_MAST.Rmd` is a template to visualize differentially expressed genes (DEG) results generated from MAST analysis. Main visualizations include: + +- Group-level mean expression shown in heatmap; +- Volcano plots highlighting top DEGs; +- An adapted `seurat` Dotplot to show % of cells expressing top DEGs; +- An integrated Violin-Box-Scatter (VBS) plot displaying the normalized expression of top DEGs per single cell across contrast groups. + +We separately prepare the Rscript `differential_expression/MAST_analysis.R` to pre-compute the DEG results since MAST computation is relatively time-consuming. For our demo dataset, ~1000 cells and two group comparison, it took around 10-15 minutes. + +To run this Rscript, you should input below parameters: + +- `--seurat_obj`: the seurat object containing your scRNA-seq data, raw counts stored in the layer `counts` of assay `RNA` is required +- `--resolution_column`: the column name of your choice of clustering method + resolution chosen, for example `pca_res0.4`, this is only required if you want to subset your data +- `--cluster_name`: the cluster you want to subset for MAST analysis +- `--contrast`: the column name in your `seurat` cell metadata indicating the group you want to do contrast for +- `--outputDir`: the output directory you will find your intermediate results for running `scRNA_MAST.Rmd` Default: `out`. + +If none of the parameters are supplied, just run `Rscript differential_expression/MAST_analysis.R` will run it on our demo dataset. + +To obtain more informative console logs from MAST analysis, please consider running: + +`Rscript --no-save --no-restore --verbose differential_expression/MAST_analysis.R > MAST.Rout 2>&1` + +You will expect to have three main outputs in your specified output folder: + +- `processed_seurat.rds`: the processed seurat object containing log-normalized data +- `MAST_RESULTS*`: the MAST modeling object to allow for further follow-up analysis of your own choice +- `FULL_MAST_RESULTS_*`: full MAST differential expression analysis results in csv format +- `SIG_MAST_RESULTS_padj<0.05*`: significant DEGs using 0.05 as the threshold for FDR diff --git a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd index 6e49644..fb737f9 100644 --- a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd +++ b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: project_file: information.R seurat_fn: ../data/fDat_sn_RC.rds diff --git a/inst/templates/spatial/cosmx/QC/QC.Rmd b/inst/templates/spatial/cosmx/QC/QC.Rmd index a514031..c0dde78 100644 --- a/inst/templates/spatial/cosmx/QC/QC.Rmd +++ b/inst/templates/spatial/cosmx/QC/QC.Rmd @@ -14,8 +14,6 @@ output: toc_float: collapsed: true smooth_scroll: true -editor_options: - chunk_output_type: console params: project_file: ./reports/information.R seurat_fn: ./data/CTCL_PAT8_DEV3_LVL7.RDS diff --git a/inst/templates/spatial/visium/01_quality_assessment/qc.Rmd b/inst/templates/spatial/visium/01_quality_assessment/qc.Rmd new file mode 100644 index 0000000..da034ce --- /dev/null +++ b/inst/templates/spatial/visium/01_quality_assessment/qc.Rmd @@ -0,0 +1,388 @@ +--- +title: "Visium QC" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: false + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +params: + project_file: ../information.R + results_dir: ./results + path_data: ./data +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +if (compareVersion(R.version$minor,"3.1")<0) warning("We recommend >= R4.3.1") +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} +# Libraries +library(bcbioR) +library(Seurat) +library(tidyverse) +library(glue) +library(gridExtra) +library(scales) +library(ggprism) +library(grafify) +ggplot2::theme_set(theme_prism(base_size = 12)) +# https://grafify-vignettes.netlify.app/colour_palettes.html +# NOTE change colors here if you wish +scale_colour_discrete <- function(...) + scale_colour_manual(..., + values = as.vector(grafify:::graf_palettes[["kelly"]])) +scale_fill_discrete <- function(...) + scale_fill_manual(..., + values = as.vector(grafify:::graf_palettes[["kelly"]])) + +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +# Project details + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +The following samples were found in the data folder for this project: + +```{r samples} +samples <- list.files(path_data) +samples +``` + +## Visium Description + +This report is adapted from: https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/seurat/seurat_07_spatial.html + +Spatial transcriptomic data with the Visium platform is in many ways similar to scRNAseq data. The data is represented per spot on the slide, as we have spatial barcode bust no cellular barcodes. Each spot contains UMI counts for 5-20 cells instead of single cells, but is still quite sparse in the same way as scRNAseq data is, but with the additional information about spatial location in the tissue. + + +# 10X Sequencing Metrics {.tabset} + +Here, we take a quick look at metrics reported from spaceranger for each sample. + +```{r summary-metrics} +# Create a for loop to create one dataframe with 10X metrics +metrics <- data.frame() +for (sample in samples) { + m <- read.csv(glue("{path_data}/{sample}/outs/metrics_summary.csv")) + metrics <- rbind(metrics, m) +} +``` + +## Number of spots per slide + +```{r nSpots} +metrics %>% + ggplot(aes(x=Sample.ID, y=(Number.of.Spots.Under.Tissue), fill=Sample.ID)) + + geom_col() + + theme_classic() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), + axis.title = element_text(size=rel(1.5)), + plot.title = element_text(hjust = 0.5, face="bold")) + + ggtitle("Number of Spots") + + xlab("") + ylab("Number of spots") +``` + +## Number of sequencing reads + + +```{r nReads} +metrics %>% + ggplot(aes(x=Sample.ID, y=(Number.of.Reads/ 1000000), fill=Sample.ID)) + + geom_col() + + theme_classic() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), + axis.title = element_text(size=rel(1.5)), + plot.title = element_text(hjust = 0.5, face="bold")) + + ggtitle("Number of sequencing reads") + + xlab("") + ylab("Number of reads (in millions)") +``` + +## Mean reads per spot + +**The recommended value here is 50,000.** + +```{r reads_per_spot} +metrics %>% + ggplot(aes(x=Sample.ID, y=Mean.Reads.per.Spot, fill=Sample.ID)) + + geom_col() + + theme_classic() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), + axis.title = element_text(size=rel(1.5)), + plot.title = element_text(hjust = 0.5, face="bold")) + + ggtitle("Mean reads per spot") + + geom_hline(yintercept=50000, linetype=2) +``` + +## Fraction of reads in spots under tissue + +**The recommended value here is > 50%.** + +Low fraction reads in spots under the tissue indicate that many of the reads were not assigned to tissue covered spots. This could be caused by: + +* High levels of ambient RNA resulting from inefficient permeabilization +* The incorrect image or orientation was used +* Poor tissue detection + +```{r frac_reads} +metrics %>% + ggplot(aes(x=Sample.ID, y= (Fraction.Reads.in.Spots.Under.Tissue * 100), fill=Sample.ID)) + + geom_col() + + theme_classic() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), + axis.title = element_text(size=rel(1.5)), + plot.title = element_text(hjust = 0.5, face="bold")) + + xlab("") + ylab("Fraction of reads in spots") + + ggtitle("Fraction of reads") + + geom_hline(yintercept = 50, linetype =2) + + ylim(c(0, 100)) +``` + +# Setup for analysis + +Next, we will run quality control in a similar manner to scRNAseq data. Start by loading in each sample and then merged into one Seurat object. + +```{r create_seurat} +# Create seurat list +seurat_list <- list() +for (sample in samples) { + + # Load data to create a Seurat object + x <- Load10X_Spatial(glue("{path_data}/{sample}/outs/")) + x$orig.ident <- sample + + # Add individual seurat objects to list + seurat_list[[sample]] <- x +} + +# Merge together every seurat object in list +merged <- merge( + x = seurat_list[[samples[1]]], + y = seurat_list[samples[2: length(samples)]], + project = project, + add.cell.ids = samples +) + +merged +``` + +Next we make sure that all the relevant metadata for each cell is included in the seurat object by looking at the first few rows of the dataframe. Note that `nCounts` refers to the total number of UMIs for a spot while `nFeatures` tallies the number of genes that are expressed in the spot. + +```{r view_metadata} +# This is great spot to add any additional metadata you may have for the samples + +# Calculate novelty score +merged$log10GenesPerUMI <- log10(merged$nFeature_Spatial) / log10(merged$nCount_Spatial) + +merged@meta.data %>% head() +``` + +# Quality control per spot {.tabset} + +Similar to scRNAseq we use statistics on number of counts, number of features and percent mitochondria for quality control. However, the expected distributions for high-quality spots are different (compared to high-quality cells in scRNA-seq), since spots may contain zero, one, or multiple cells. + +## Number of UMIs detected per spot + +This number is really dependent on tissue type, RNA quality, and sequencing depth. For example, with one of the public Visium datasets using the mouse brain hippocampus region, we saw an average of 4.5k genes and 20k UMIs per spot. + +```{r nUMI} +merged@meta.data %>% + ggplot(aes(x=orig.ident, y = nCount_Spatial)) + + geom_violin(aes(fill=orig.ident), alpha = 0.5) + + geom_boxplot(width=0.1) + + geom_hline(yintercept = 10000, linetype = "dashed") + + scale_y_log10() + + theme_classic() + + ylab("density") + + theme(plot.title = element_text(hjust=0.5, face="bold")) + + ggtitle("Num UMIs per spot") +``` + +## Number of genes detected per spot + +The number of genes per spot will vary by sample, however in this dataset values are generally on the high end. _The dashed line is drawn at 4000 genes, which is a fairly high threshold._ + + +```{r nGenes, fig.width=9} +merged@meta.data %>% + ggplot(aes(x=orig.ident, y = nFeature_Spatial)) + + geom_violin(aes(fill=orig.ident), alpha = 0.5) + + geom_boxplot(width=0.1) + + geom_hline(yintercept = 4000, linetype = "dashed") + + scale_y_log10() + + theme_classic() + + ylab("density") + + theme(plot.title = element_text(hjust=0.5, face="bold")) + + ggtitle("Num Genes per Spot") +``` + +## Overall complexity of transcriptional profile per spot + +We can evaluate each spot in terms of how complex the RNA species are by using a measure called the novelty score. The novelty score is computed by taking the ratio of nGenes over nUMI. If there are many captured transcripts (high nUMI) and a low number of genes detected in a cell, this likely means that you only captured a low number of genes and simply sequenced transcripts from those lower number of genes over and over again. + +With scRNA-seq this is more easily interpreted for a single cell, but for spatial data this would give us complexity of the spot, which is across multiple cells. + +```{r complexity} +merged@meta.data %>% + ggplot(aes(x=log10GenesPerUMI, color=orig.ident, fill=orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + geom_vline(xintercept = 0.8) + + ggtitle("Novelty score") + + theme(plot.title = element_text(hjust=0.5, face="bold")) +``` + +# QC metrics visualized on slides {.tabset} + +Here, we can look at the distribution of UMI and gene counts on the individual tissue slide. + +```{r spatial-plot, results='asis', fig.width=7, fig.height=4} +# Iterate through each sample to make visualizations for each slide +for (sample in samples) { + cat("\n\n") + + # Print plots for nUMI and nGenes for each spot on the slide + cat("## Sample", sample, "\n") + p <- SpatialFeaturePlot(seurat_list[[sample]], + features = c("nCount_Spatial", "nFeature_Spatial"), + max.cutoff = 'q80') + print(p) + cat("\n") + } +``` + +# Normalization and Clustering + +Seurat recommends the use of SCTransform for spatial data. SCTransform normalizes the data, detects high-variance features, and stores the data in the SCT assay. We then perform clustering (at a default resolution of 0.8 with 30 PCs) and create UMAP coordinates. + +Let's look at the UMAP and clustering for each group to see if further integration is necessary. + +```{r sc_workflow} +# Making sure that the rownames of metadata are the same as what is stored in Cells() +rownames(merged@meta.data) <- Cells(merged) + +# Standard processing of counts matrix +# SCTransform, PCA, clusters, and run UMAP +merged <- SCTransform(merged, assay = "Spatial", verbose = TRUE) +merged <- RunPCA(merged , assay = "SCT", verbose = FALSE) +merged <- FindNeighbors(merged, reduction = "pca", dims = 1:30) +merged <- FindClusters(merged, verbose = FALSE, + resolution = c(0.2, 0.4, 0.6, 0.8, 1)) +merged <- RunUMAP(merged, reduction = "pca", dims = 1:30) +``` + +## UMAP distribution {.tabset} + +### Samples + +```{r umap_sample} +Idents(merged) <- "orig.ident" +DimPlot(merged, reduction = "umap") +``` + +### Samples split + +```{r umap_sample_split} +# Get UMAP coordinates +df <- FetchData(merged, c("umap_1", "umap_2", "orig.ident")) + +# Colors for each sample +palette_fill <- hue_pal()(length(samples)) +names(palette_fill) <- samples + +# Create plot list to arrange later +plot_list <- list() +for (sample in samples) { + + # Get cells of one sample + df_sample <- df[df$orig.ident == sample, ] + + # Plot color dots (for each sample) overlaid over grey + p <- ggplot() + + geom_point(data=df, + aes(x=umap_1, y=umap_2), color="lightgray", + alpha = 0.5, size=1, show.legend=FALSE) + + geom_point(data=df_sample, + aes(x=umap_1, y=umap_2), + color=palette_fill[sample]) + + theme_classic() + + ggtitle(sample) + + plot_list[[sample]] <- p +} + +grid.arrange(grobs = plot_list) +``` + + +### Clusters at resolution 0.8 + +```{r umap_clusters} +Idents(merged) <- "SCT_snn_res.0.8" +p <- DimPlot(merged, reduction = "umap") +LabelClusters(p, id ="ident", fontface = "bold", size = 6, + bg.colour = "white", bg.r = .2, force = 0) +``` + +# Clusters ontop slides + +```{r cluster_slides fig.height=15} +SpatialDimPlot(merged, label = TRUE, ncol=1) +``` + +# Save seurat object + +```{r save_seurat} +saveRDS(merged, file.path(params$results_dir, "01_qc.RDS")) +``` +# Methods + + +## Citation + +```{r citations} +citation("Seurat") +``` + +## Session Information + +```{r} +sessionInfo() +``` diff --git a/inst/templates/spatial/visium/information.R b/inst/templates/spatial/visium/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/spatial/visium/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core'