-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproc_lab_adapt_atac_extra.R
117 lines (85 loc) · 3.22 KB
/
proc_lab_adapt_atac_extra.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
library(edgeR)
library(limma)
library(openxlsx)
library(gplots)
library(dplyr)
library(tidyverse)
library(splines)
count.tab <- read.xlsx("../Input/Toxo_lab_adapt/BulkATACToxoPlasma/macs2_Union/peak_gene_assigned_raw_counts_avg_final.xlsx")
#count.tab <- read.xlsx("../Input/Toxo_lab_adapt/BulkATACToxoPlasma/macs2_Union/peak_gene_assigned_raw_counts_old.xlsx")
count.tab <- count.tab %>% na.omit()
colnames(count.tab)[2:ncol(count.tab)] <- paste("extra", colnames(count.tab)[2:ncol(count.tab)], sep = ".")
# this part is done in peak gene assignment code
#count.ave <- count.tab[, 3:ncol(count.tab)]
count.ave <- count.tab
colnames(count.ave) <- gsub("gene_name", "GeneID", colnames(count.ave))
# count.ave <- count.ave %>% group_by(GeneID) %>% summarise_all("mean")
# count.ave <- count.ave %>% group_by(GeneID) %>% summarise_all("ceiling")
## Detecting Bias
y <- count.ave
## Filtering low expressions
CPM <- cpm(y[,2:ncol(y)])
keep <- rowSums(CPM > 2) >= 3
x <- y[keep,2:ncol(y)] %>% data.frame()
rownames(x) <- y$GeneID[keep]
treatment <- colnames(y[2:ncol(y)])
treatment <- factor(treatment, levels = unique(treatment))
y <- DGEList(counts=x, group=treatment)
y <- calcNormFactors(y)
y$samples
plotMDS(y)
extra.counts <- count.ave %>% dplyr::select(contains('P')) %>% data.frame()
rownames(extra.counts) <- count.ave$GeneID
Hours <- as.numeric(gsub("extra.P", "", colnames(extra.counts)))
Time <- paste0(Hours, "P")
ind <- sort(Hours, index.return = T)$ix
extra.counts <- extra.counts[, ind]
Hours <- Hours[ind]
Time <- Time[ind]
plotMDS(extra.counts)
y <- DGEList(counts=extra.counts, group=Time)
y$genes <- data.frame(Symbol=count.ave$GeneID, stringsAsFactors=FALSE)
plotMDS(y)
## Filter lowly expressed genes
keep <- filterByExpr(y)
table(keep)
y <- y[keep, , keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
y$samples
plotMDS(y, labels=Hours)
## 3-knows natural splies
#X <- poly(Hours, degree=3)
#design <- model.matrix(~ X)
X <- ns(Hours, df=3)
design <- model.matrix(~X)
design
y <- estimateDisp(y, design)
sqrt(y$common.dispersion)
plotBCV(y)
fit <- glmQLFit(y, design, robust=F)
plotQLDisp(fit)
fit <- glmQLFTest(fit, coef=2:4)
tab <- as.data.frame(topTags(fit, n=nrow(y$counts)))
summary(decideTests(fit))
logCPM.obs <- cpm(y, log=TRUE, prior.count=fit$prior.count)
logCPM.fit <- cpm(fit, log=TRUE)
# par(mfrow=c(2,2))
# for(i in 1:4) {
# GeneID <- row.names(tab)[i]
# Symbol <- tab$Symbol[i]
# logCPM.obs.i <- logCPM.obs[GeneID,]
# logCPM.fit.i <- logCPM.fit[GeneID,]
# plot(Hours, logCPM.obs.i, ylab="log-CPM", main=Symbol, pch=16)
# lines(Hours, logCPM.fit.i, col="red", lwd=2)
# }
ind.sig <- tab$FDR[match(rownames(logCPM.obs), tab$Symbol)] < 0.3
print(sum(ind.sig)) ## number of DEGs
tc.logCPM <- logCPM.obs %>% as.data.frame() %>%
mutate(GeneID = rownames(logCPM.obs))
tc.logCPM$trend.pval <-tab$PValue[match(tc.logCPM$GeneID, tab$Symbol)]
tc.logCPM$trend.fdr <- tab$FDR[match(tc.logCPM$GeneID, tab$Symbol)]
tc.logCPM <- tc.logCPM %>%
pivot_longer(-c(GeneID,trend.pval, trend.fdr), names_to = "Sample", values_to = "expr")
tc.logCPM$passage <- gsub("extra.", "", tc.logCPM$Sample)
tc.logCPM$Time <- as.numeric(gsub("extra.P", "", tc.logCPM$Sample))
saveRDS(tc.logCPM, '../Input/Toxo_lab_adapt/RDS//extra_tc_atac_logCPM.rds')