project.Rmd

---
title: "Project"
author: "Jiayi"
date: "17/11/2021"
output: html_document
---

```{r setup, include=FALSE}
# Default chunk option for the whole document: show (echo) the source code of
# every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
```

Set up
```{r}
# Start from an empty workspace so that no object from a previous session
# can leak into the analysis.
rm(list = ls())

# Move to the project folder and load the saved workspace image; the later
# chunks use the object `DF` that it is expected to contain.
# NOTE(review): when knitting, knitr presumably restores the working directory
# after this chunk, so later chunks that read files by relative path rely on
# the .Rmd living in this folder -- confirm.
setwd("~/Desktop/STA_426_UZH/Project/")
load(file = "DF.RData")
```

UpSet plot of the genes below the significance threshold for each method (UpSetR)
```{r}
library(UpSetR)
# Threshold below which a gene counts as a hit for a method.
# NOTE(review): the method columns are presumably adjusted p-values / FDRs --
# confirm against how DF was built.
thd <- 0.01

# 0/1 membership matrix (one row per row of DF, one column per method) as
# expected by upset(). Missing values are treated as "not a hit": an NA in a
# column would otherwise propagate into the matrix, which is no longer a
# clean 0/1 set indicator.
DF_01 <- local({
  is_hit <- function(p) as.integer(!is.na(p) & p < thd)
  data.frame(
    eisaR = is_hit(DF$eisaR),
    BRIE2 = is_hit(DF$BRIE2),
    DEXSeq_USA = is_hit(DF$DEXSeq_USA)
  )
})

# Overlap of the hits between the three methods.
upset(DF_01)

```

Top 100 genes from each method, in every cell type
```{r}
# Cell types present in the data; the bare name prints them in the report.
cell_types <- unique(DF$Cell_type)
cell_types

# Number of top-ranked genes kept per method and cell type.
xx <- 100

# For every cell type, rank the genes of that cell type by each method's
# value (smallest first) and keep the first `xx` gene IDs per method.
# head() is used instead of `[1:xx]` so that a cell type with fewer than `xx`
# genes yields a shorter table rather than rows padded with NA gene IDs.
TOP_100 <- lapply(cell_types, function(cell) {
  tmp <- DF[DF$Cell_type == cell, ]
  top_ids <- function(score) head(tmp$Gene_id[order(score)], xx)

  data.frame(
    Cell_type = cell,
    eisaR = top_ids(tmp$eisaR),
    BRIE2 = top_ids(tmp$BRIE2),
    DEXSeq_USA = top_ids(tmp$DEXSeq_USA)
  )
})

# One long table: up to `xx` rows per cell type, one gene-ID column per method.
TOP_100 <- do.call(rbind, TOP_100)

# top genes from each method, in every cell type
head(TOP_100)

# Union of all selected gene IDs across methods and cell types.
all_genes <- unique(unlist(TOP_100[, -1]))
length(all_genes)
head(all_genes)
```


```{r}
# define GO analysis functions
library(topGO)
library(org.Hs.eg.db)

#' GO (Biological Process) over-representation test for a set of genes
#'
#' Builds a topGO data object in which `DE_genes` are the genes of interest
#' and runs the classic count-based Fisher test on every GO term with at
#' least 5 annotated genes.
#'
#' The previous implementation drew a random 10% sample of the annotated
#' input genes as "interesting" genes (a leftover from the topGO vignette),
#' so the result did not depend on which genes were actually supplied.
#'
#' @param DE_genes Ensembl gene IDs of the genes of interest. `NA` entries
#'   and duplicates are ignored.
#' @param all_genes Optional Ensembl gene IDs forming the background
#'   (gene universe), typically all genes that were tested. When `NULL`
#'   (default), every Ensembl gene with a BP annotation in org.Hs.eg.db is
#'   used as background.
#' @return Named numeric vector of Fisher-test p-values, one per GO term
#'   (names are GO IDs). An empty numeric vector, with a warning, when none
#'   of `DE_genes` is part of the universe.
GO_analysis_count <- function(DE_genes, all_genes = NULL) {
  DE_genes <- unique(as.character(DE_genes[!is.na(DE_genes)]))

  if (is.null(all_genes)) {
    # No background supplied: fall back to all annotated Ensembl genes.
    go_to_genes <- annFUN.org("BP", mapping = "org.Hs.eg.db", ID = "Ensembl")
    universe <- unique(unlist(go_to_genes, use.names = FALSE))
  } else {
    universe <- unique(as.character(all_genes[!is.na(all_genes)]))
  }

  is_interesting <- universe %in% DE_genes
  if (!any(is_interesting)) {
    warning("None of the supplied genes is part of the gene universe; ",
            "returning no GO scores.", call. = FALSE)
    return(numeric(0))
  }

  # topGO expects a named two-level factor: 1 = gene of interest, 0 = background.
  # The levels are fixed explicitly so the factor always has both of them.
  geneList <- factor(as.integer(is_interesting), levels = c(0L, 1L))
  names(geneList) <- universe

  GOdata <- new("topGOdata",
                ontology = "BP",
                allGenes = geneList,
                nodeSize = 5,
                annot = annFUN.org,
                mapping = "org.Hs.eg.db",
                ID = "Ensembl")

  test.stat <- new("classicCount", testStatistic = GOFisherTest,
                   name = "Fisher test")
  resultFisher <- getSigGroups(GOdata, test.stat)

  score(resultFisher)
}

#' Gene selection rule: flag scores below 0.01
#'
#' @param allScore Numeric vector of gene scores.
#' @return Logical vector of the same length (names are kept), `TRUE` where
#'   the score is strictly below 0.01.
topDiffGenes <- function(allScore) {
  is_selected <- allScore < 0.01
  is_selected
}

#' GO (Biological Process) enrichment from per-gene scores (KS test)
#'
#' Builds a topGO data object from the named score vector, using
#' `topDiffGenes` as the gene selection function, and runs the classic
#' score-based Kolmogorov-Smirnov test on GO terms with at least 5
#' annotated genes.
#'
#' @param score_genes Numeric vector of gene scores named by Ensembl gene ID.
#' @return The scores of the KS test result, as returned by `score()`.
GO_analysis_score <- function(score_genes) {
  go_data <- new(
    "topGOdata",
    ontology = "BP",
    allGenes = score_genes,
    nodeSize = 5,
    geneSelectionFun = topDiffGenes,
    annot = annFUN.org,
    mapping = "org.Hs.eg.db",
    ID = "Ensembl"
  )

  ks_result <- getSigGroups(
    go_data,
    new("classicScore", testStatistic = GOKSTest, name = "KS tests")
  )

  score(ks_result)
}


```

Gene ontology enrichment analysis

```{r message=FALSE}
# Method columns to analyse; each is a column of both DF and TOP_100.
methods <- c("eisaR", "BRIE2", "DEXSeq_USA")

# Significant GO terms per "<cell type>_<method>", one list per analysis type.
GO_term_count <- list()
GO_term_score <- list()

# Which analysis to run:
#   TRUE  - count-based test on the list of top genes of each method
#   FALSE - score-based test on the ranking of all genes
use_top_genes <- TRUE

# loop over cell type:
for (i in seq_along(cell_types)) {
  cell <- cell_types[i]

  TOP_100_cell <- TOP_100[TOP_100$Cell_type == cell, ]
  DF_cell <- DF[DF$Cell_type == cell, ]

  # list of all analyzed genes for this cell type (does not depend on the
  # method, so it is set once per cell type; it is not currently passed on
  # to the GO helpers)
  all_genes <- DF_cell$Gene_id

  # loop over methods:
  for (j in seq_along(methods)) {
    method <- methods[j]
    name <- paste0(cell, "_", method)

    if (use_top_genes) { # GO based on the list of top genes
      # Top-ranked genes of this method in this cell type (selected by rank
      # in TOP_100, not by the threshold `thd`). `[[method]]` always yields
      # a plain vector, whatever the table class.
      DE_genes <- TOP_100_cell[[method]]
      GO_score <- GO_analysis_count(DE_genes)
      GO_term_count[[name]] <- names(GO_score[GO_score < 0.05])
    } else { # GO based on ranking from all results
      # Per-gene values of this method, named by gene ID.
      score_genes <- DF_cell[[method]]
      names(score_genes) <- DF_cell$Gene_id
      GO_score <- GO_analysis_score(score_genes)
      GO_term_score[[name]] <- names(GO_score[GO_score < 0.05])
    }
  }
}

```

Gene level analysis

```{r}
library('rjson')
# Reference gene lists, read from JSON files in the working directory.
development <- fromJSON(file = "development.json")
development_brain <- fromJSON(file = "development_brain.json")
brain <- fromJSON(file = "brain.json")

# Collect the first field of every record into one vector.
# NOTE(review): assumes the first field of each record is the gene
# identifier -- confirm against the JSON files.
# Built in one go (instead of append() inside a loop), which also returns
# NULL for an empty list rather than failing on `1:length(x)`.
first_fields <- function(records) {
  do.call(c, unname(lapply(records, function(record) record[[1]])))
}

development_genes <- first_fields(development)
development_brain_genes <- first_fields(development_brain)
brain_genes <- first_fields(brain)

# Top genes that also appear in each reference list, keyed by
# "<cell type>_<method>".
top_100_in_development <- list()
top_100_in_development_brain <- list()
top_100_in_brain <- list()

for (i in seq_along(cell_types)) {
  cell <- cell_types[i]
  TOP_100_cell <- TOP_100[TOP_100$Cell_type == cell, ]

  for (j in seq_along(methods)) {
    method <- methods[j]
    # `[[method]]` always yields a plain vector of gene IDs.
    DE_genes <- TOP_100_cell[[method]]
    name <- paste0(cell, "_", method)

    top_100_in_development[[name]] <-
      DE_genes[DE_genes %in% development_genes]
    top_100_in_development_brain[[name]] <-
      DE_genes[DE_genes %in% development_brain_genes]
    top_100_in_brain[[name]] <- DE_genes[DE_genes %in% brain_genes]
  }
}

```