From 282c7302e487c95481ab7e288bef64cfd545d6a2 Mon Sep 17 00:00:00 2001
From: Alex Bartlett <albartlett@hsph.harvard.edu>
Date: Thu, 10 Oct 2024 18:35:26 +0000
Subject: [PATCH 1/3] add venn diagram to chipseq qc

---
 inst/templates/chipseq/QC/QC.Rmd              | 41 +++++++++++--------
 inst/templates/chipseq/QC/params_qc-example.R |  9 ++--
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd
index 4ba539c..7dbfa68 100644
--- a/inst/templates/chipseq/QC/QC.Rmd
+++ b/inst/templates/chipseq/QC/QC.Rmd
@@ -27,8 +27,8 @@ params:
 
 ```{r, cache = FALSE, message = FALSE, warning=FALSE}
 # This set up the working directory to this file so all files can be found
-# library(rstudioapi)
-# setwd(fs::path_dir(getSourceEditorContext()$path))
+library(rstudioapi)
+setwd(fs::path_dir(getSourceEditorContext()$path))
 ```
 
 
@@ -66,6 +66,8 @@ library(bcbioR)
 library(janitor)
 library(ChIPpeakAnno)
 library(UpSetR)
+
+colors=cb_friendly_cols(1:15)
 ggplot2::theme_set(theme_light(base_size = 14))
 opts_chunk[["set"]](
     cache = FALSE,
@@ -353,8 +355,7 @@ ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]]
 
 We examine the amount of overlap between peaks in replicates of the same experimental condition. 
 
-``` {r peak overlap, results = 'asis', fig.width = 8}
-
+``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6}
 for (current_sample_group in unique(peaks$sample_group)){
   cat("## ", current_sample_group, "\n")
 
@@ -374,19 +375,27 @@ for (current_sample_group in unique(peaks$sample_group)){
   # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5     peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge"      will add 1 to the overlapping counts
   overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge')
   
-  set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% 
-    as.data.frame() %>% 
-    mutate(group_number = row_number()) %>%
-    pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>%
-    filter(member > 0) %>%
-    group_by(Counts, group_number) %>% 
-    summarize(group = paste(sample, collapse = '&'))
+  n_samples <- length(names(overlaps$overlappingPeaks))
+
+  if (n_samples > 3){
+    set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% 
+      as.data.frame() %>% 
+      mutate(group_number = row_number()) %>%
+      pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>%
+      filter(member > 0) %>%
+      group_by(Counts, group_number) %>% 
+      summarize(group = paste(sample, collapse = '&'))
+    
+    set_counts_upset <- set_counts$Counts
+    names(set_counts_upset) <- set_counts$group
   
-  set_counts_upset <- set_counts$Counts
-  names(set_counts_upset) <- set_counts$group
-
-  p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5)
-  print(p)
+    p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5)
+    print(p)
+  } else{
+    venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks))
+    invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples], 
+                    NameOfPeaks = venn_sample_names)))
+  }
   
   cat('\n\n')
 
diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R
index eeb3ce5..7f2da63 100644
--- a/inst/templates/chipseq/QC/params_qc-example.R
+++ b/inst/templates/chipseq/QC/params_qc-example.R
@@ -2,11 +2,10 @@
 
 
 # Example data
-coldata_fn='~/Downloads/chipseq_peakanalysis_PRDM16.csv'
+coldata_fn='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv'
 # This folder is in the output directory inside multiqc folder
-multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_prdm16/multiqc/narrowPeak/multiqc_data/'
+multiqc_data_dir='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/multiqc/narrowPeak/multiqc_data/'
 # This folder is in the output director
-peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/'
+peaks_dir = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/'
 # This folder is in the output directory 
-# counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds'
-counts_fn = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/PRDM16/deseq2/PRDM16.consensus_peaks.rds'
+counts_fn = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds'

From a0ad6c92cd73c37eae481f987cc041cbe417e1cd Mon Sep 17 00:00:00 2001
From: Alex Bartlett <albartlett@hsph.harvard.edu>
Date: Thu, 10 Oct 2024 20:02:23 +0000
Subject: [PATCH 2/3] start edits to diffbind

---
 inst/templates/chipseq/diffbind/diffbind.Rmd | 73 ++++++++++++--------
 1 file changed, 46 insertions(+), 27 deletions(-)

diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd
index 6e7eb46..f71a20d 100644
--- a/inst/templates/chipseq/diffbind/diffbind.Rmd
+++ b/inst/templates/chipseq/diffbind/diffbind.Rmd
@@ -29,7 +29,7 @@ params:
   denominator: WT
   species: mouse
   counts_fn: diffbind_counts.csv
-  results_sig_anno_fn: diffbind_results_sig_anno.csv
+  results_sig_anno_fn: diffbind_results_anno.csv
 ---
 Template developed with materials from https://hbctraining.github.io/main/.
 
@@ -37,11 +37,12 @@ Template developed with materials from https://hbctraining.github.io/main/.
 # This set up the working directory to this file so all files can be found
 library(rstudioapi)
 setwd(fs::path_dir(getSourceEditorContext()$path))
+
 # NOTE: This code will check version, this is our recommendation, it may work
 #.      other versions
-stopifnot(R.version$major>= 4) # requires R4
-stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3
-stopifnot(compareVersion(BiocManager::version(), "3.18")>=0)
+# stopifnot(R.version$major>= 4) # requires R4
+# stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3
+# stopifnot(compareVersion(BiocManager::version(), "3.18")>=0)
 ```
 
 This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision.
@@ -107,10 +108,12 @@ if (params$species == 'mouse'){
   library(TxDb.Mmusculus.UCSC.mm10.knownGene)
   txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
   anno_db <- 'org.Mm.eg.db'
+  library(org.Mm.eg.db)
 } else if (params$species == human){
   library(TxDb.Hsapiens.UCSC.hg38.knownGene)
   txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
   anno_db <- 'org.Hs.eg.db'
+  library(org.Hs.eg.db)
 }
 
 
@@ -237,6 +240,10 @@ including estimation of size factors and dispersions, fitting and testing the
 model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR 
 is assigned to each candidate binding site indicating confidence that they are differentially bound.
 
+We use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) 
+to identify any gene features within 1000 bp of a differentially bound site. 
+
+
 ```{r DB analysis}
 diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator))
 results_obj <- dba.analyze(diffbind_norm, bGreylist = F)
@@ -244,11 +251,32 @@ results_obj <- dba.analyze(diffbind_norm, bGreylist = F)
 results_report <- dba.report(results_obj, th = 1) 
 results_report_sig <- dba.report(results_obj)
 
-
 results <- results_report %>% as.data.frame()
-results_sig <- results_report_sig %>% as.data.frame()
+
+```
+
+```{r annotate DB peaks}
+
+anno_data <- toGRanges(txdb, feature = 'gene')
+results_anno_batch <- annotatePeakInBatch(results_report, 
+                                              AnnotationData = anno_data,
+                                              output = 'overlapping',
+                                              maxgap = 1000)
+
+results_anno_batch_df <- results_anno_batch %>% as.data.frame()
+
+entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, 
+                                          "ENTREZID", columns = 'SYMBOL') %>%
+  filter(!is.na(ENTREZID)) %>% distinct()
+
+results_anno_batch_df <- results_anno_batch_df %>% 
+  left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL))
+
+write_csv(results_anno_batch_df, params$results_sig_anno_fn)
+
 ```
 
+
 ## MA plot
 
 This plot can help to:
@@ -257,11 +285,10 @@ This plot can help to:
 - Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset.
 
 ```{r MA plot}
-results_for_ma <- results%>%
+results_for_ma <- results_anno_batch_df%>%
   mutate(peak = paste(seqnames, start, end, sep = '_')) %>%
   mutate(t = 0) %>% 
   dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t)
-rownames(results_for_ma) <- results_for_ma$peak
 degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. ')))
 
 ```
@@ -270,7 +297,9 @@ degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denomi
 
 ```{r DB table}
 
-results_sig %>% sanitize_datatable()
+results_sig_anno_batch_df <- results_anno_batch_df %>% filter(FDR < 0.05)
+results_sig_anno_batch_df %>% dplyr::select(names(results), feature, gene_name) %>%
+  sanitize_datatable()
 
 ```
 
@@ -280,15 +309,17 @@ results_sig %>% sanitize_datatable()
 This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen.
 
 ```{r volcano, fig.height = 8}
-results_mod <- results %>% 
+results_mod <- results_sig_anno_batch_df %>% 
   mutate(Fold = replace(Fold, Fold < -5, -5)) %>% 
   mutate(Fold = replace(Fold, Fold > 5, 5)) %>%
-  mutate(peak = paste(seqnames, start, end, sep = '_'))
-show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "peak")])
+  mutate(peak = paste(seqnames, start, end, sep = '_')) 
+# show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")])
+
+show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR)
 EnhancedVolcano(results_mod,
-                lab= results_mod$peak, 
+                lab= results_mod$gene_name,
                 pCutoff = 0.05, 
-                selectLab = c(show$peak),
+                # selectLab = c(show$gene_name),
                 FCcutoff = 0.5,
                 x = 'Fold',
                 y = 'FDR', 
@@ -321,15 +352,12 @@ We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/
 package to determine the genomic context of the differentially bound peaks and 
 visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS. 
 
-We also use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) 
-to identify any gene features within 1000 bp of a differentially bound site. 
-
 ```{r annotate, echo = F}
 
 results_sig_anno <- annotatePeak(results_report_sig, 
                                  tssRegion = c(-2000, 2000), 
                                  TxDb = txdb, 
-                                 annoDb = params$anno_db, 
+                                 annoDb = anno_db, 
                                  verbose = F)
 results_sig_anno_df <- results_sig_anno %>% as.data.frame()
 
@@ -337,15 +365,6 @@ plotAnnoPie(results_sig_anno)
 
 plotDistToTSS(results_sig_anno)
 
-anno_data <- toGRanges(txdb, feature = 'gene')
-results_sig_anno_batch <- annotatePeakInBatch(results_report_sig, 
-                                              AnnotationData = anno_data,
-                                              output = 'overlapping',
-                                              maxgap = 1000)
-
-results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame()
-
-write_csv(results_sig_anno_batch_df, params$results_sig_anno_fn)
 ```
 
 # Functional Enrichment

From 1e718b3b4bfa5249adcf1198585d7bc45aea06e2 Mon Sep 17 00:00:00 2001
From: Alex Bartlett <albartlett@hsph.harvard.edu>
Date: Fri, 11 Oct 2024 16:37:00 +0000
Subject: [PATCH 3/3] annotation and ora for diffbind, fix typo in DEG.Rmd

---
 inst/templates/chipseq/diffbind/diffbind.Rmd | 76 ++++++++++++++++++--
 inst/templates/chipseq/libs/load_data.R      | 44 ++++++++++++
 inst/templates/rnaseq/DE/DEG.Rmd             |  2 +-
 3 files changed, 117 insertions(+), 5 deletions(-)

diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd
index f71a20d..ef7f88a 100644
--- a/inst/templates/chipseq/diffbind/diffbind.Rmd
+++ b/inst/templates/chipseq/diffbind/diffbind.Rmd
@@ -103,6 +103,8 @@ library(qs)
 library(EnhancedVolcano)
 library(ggprism)
 library(ChIPseeker)
+library(msigdbr)
+library(fgsea)
 
 if (params$species == 'mouse'){
   library(TxDb.Mmusculus.UCSC.mm10.knownGene)
@@ -265,9 +267,15 @@ results_anno_batch <- annotatePeakInBatch(results_report,
 
 results_anno_batch_df <- results_anno_batch %>% as.data.frame()
 
-entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, 
-                                          "ENTREZID", columns = 'SYMBOL') %>%
-  filter(!is.na(ENTREZID)) %>% distinct()
+if(params$species == 'mouse'){
+  entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, 
+                                            "ENTREZID", columns = 'SYMBOL') %>%
+    filter(!is.na(ENTREZID)) %>% distinct()
+} else if (params$species == 'human'){
+  entrez_to_symbol <- AnnotationDbi::select(org.Hs.eg.db, results_anno_batch_df$feature, 
+                                            "ENTREZID", columns = 'SYMBOL') %>%
+    filter(!is.na(ENTREZID)) %>% distinct()
+}
 
 results_anno_batch_df <- results_anno_batch_df %>% 
   left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL))
@@ -316,10 +324,12 @@ results_mod <- results_sig_anno_batch_df %>%
 # show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")])
 
 show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR)
+
+results_mod <- results_mod %>% mutate(gene_name = ifelse(peak %in% show$peak  , gene_name, NA))
 EnhancedVolcano(results_mod,
                 lab= results_mod$gene_name,
                 pCutoff = 0.05, 
-                # selectLab = c(show$gene_name),
+                selectLab = c(show$gene_name),
                 FCcutoff = 0.5,
                 x = 'Fold',
                 y = 'FDR', 
@@ -369,6 +379,64 @@ plotDistToTSS(results_sig_anno)
 
 # Functional Enrichment
 
+Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially bound genes (DEGs) from ChIP-seq. Adventages of ORA:
+
+- Simplicity: Easy to perform and interpret.
+- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
+- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.
+
+```{r get databases}
+if(params$species == 'human'){
+  all_in_life=get_databases()
+} else if (params$species == 'mouse'){
+  all_in_life = get_databases('Mus musculus')
+}
+```
+
+```{r ora}
+
+universe_mapping = results_anno_batch_df %>% 
+    filter(!is.na(FDR), !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, abs(Fold) > 0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+all = run_fora(ora_input, universe_mapping, all_in_life)
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, Fold > 0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+up = run_fora(ora_input, universe_mapping, all_in_life)
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, Fold < -0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+down = run_fora(ora_input, universe_mapping, all_in_life)
+  
+```
+
+
+## Significant pathways using all DB genes
+
+```{r all pathways}
+all %>% sanitize_datatable()
+```
+
+
+## Significant pathways using increased DB genes
+
+```{r up pathways}
+up %>% sanitize_datatable()
+```
+
+
+## Significant pathways using decreased DB genes
+
+```{r down pathways, results='asis'}
+down %>% sanitize_datatable()
+```
+
 # R session
 
 List and version of tools used for the report generation.
diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R
index fec3d4a..324e808 100755
--- a/inst/templates/chipseq/libs/load_data.R
+++ b/inst/templates/chipseq/libs/load_data.R
@@ -114,3 +114,47 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL
   
   return(samplesheet)
 }
+
+get_databases=function(sps="human"){
+  all_in_life=list(
+    msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"),
+    #  msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
+    msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"),
+    #  msigdbr(species = "human", category = "C2", subcategory = "CP:PID"),
+    msigdbr(species = sps, category = "C5", subcategory = "GO:BP"),
+    msigdbr(species = sps, category = "C5", subcategory = "GO:MF")
+    #  msigdbr(species = "human", category = "C5", subcategory = "HPO"),
+    #  msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"),
+    #  msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic")
+  )
+  all_in_life
+}
+
+run_fora=function(input, uni,all_in_life){
+  # browser()
+  total_deg=length(unique(input))/length(unique(uni$ENTREZID))
+  pathways_ora_all = lapply(all_in_life, function(p){
+    pathway = split(x = p$entrez_gene, f = p$gs_name)
+    db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":")
+    respath <- fora(pathways = pathway,
+                    genes = unique(input$ENTREZID),
+                    universe = unique(uni$ENTREZID),
+                    minSize  = 15,
+                    maxSize  = 500)
+    # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1],
+    #                                    pathway, unique(input$ENTREZID), unique(uni$ENTREZID))
+    as_tibble(respath)  %>%
+      mutate(database=db_name, NES=(overlap/size)/(total_deg))
+  }) %>% bind_rows() %>%
+    mutate(analysis="ORA")
+  ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>%
+    group_by(pathway) %>%
+    left_join(uni, by =c("overlapGenes"="ENTREZID")) %>%
+    dplyr::select(pathway, padj, NES, SYMBOL, analysis,
+                  database) %>%
+    group_by(pathway,padj,NES,database,analysis) %>%
+    summarise(genes=paste(SYMBOL,collapse = ","))
+  ora_tb
+  
+}
+
diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd
index 0384f70..129e328 100644
--- a/inst/templates/rnaseq/DE/DEG.Rmd
+++ b/inst/templates/rnaseq/DE/DEG.Rmd
@@ -604,7 +604,7 @@ fa_list=lapply(de_list,function(contrast){
   input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
   up=run_fora(input_entrezid, universe_mapping,all_in_life)
 
-  ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<0.3) %>% pull(gene_id)
+  ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<-0.3) %>% pull(gene_id)
   #change to the right species
   input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
   down=run_fora(input_entrezid, universe_mapping,all_in_life)