-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAnalysisFunctions.R
271 lines (240 loc) · 9.87 KB
/
AnalysisFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
##helper function to calculate two-tailed T-
calculatePvaluefromTvalue <- function(t.value, df){
return(2*pt(-abs(t.value), df))
}
customWrite <- function(DF, filename, ...){
write.table(DF, filename, sep = "\t", row.names = FALSE, ...)
}
customRead <- function(filename, ...){
read.table(filename, sep = '\t', header = TRUE, stringsAsFactors = FALSE, ...)
}
##function for cleaning hsbc(after getting the correct subjects in the hsbc)
cleanHsbc <- function(hsbcDF){
##get the hsbc right
hsbcDF$vhash <- NULL
hsbcDF$jhash <- NULL
hsbcDF$chain <- NULL
hsbcDF <- as.data.frame(t(hsbcDF))
names(hsbcDF) <- c("age", "dbid", "gender", "positive_602", "positive_0301", "beforeafter", "casecontrol", "cell_type", "rs1483979")
hsbcDF <- hsbcDF[-1,]
hsbcDF$age <- as.numeric(as.character(hsbcDF$age))
hsbcDF$gender <- factor(as.character(hsbcDF$gender))
hsbcDF$positive_602 <- factor(as.character(hsbcDF$positive_602))
hsbcDF$positive_0301 <- factor(as.character(hsbcDF$positive_0301))
hsbcDF$casecontrol <- factor(as.character(hsbcDF$casecontrol))
hsbcDF$cell_type <- factor(as.character(hsbcDF$cell_type))
hsbcDF$rs1483979 <- factor(as.character(hsbcDF$rs1483979))
return(hsbcDF)
}
##function for identifying two types of alpha-J24s
identifyJ24 <- function(DF = data.frame()){
J24.df <- subset(DF,jhash=="TRAJ24*01")
not.J24.df <- subset(DF, jhash!="TRAJ24*01")
#get ones that match the "SWGK[FL][RQ]$" pattern
J24.df <- J24.df[grepl("SWGK[FL][RQ]$", J24.df$covariate),]
J24.df$J24type <- sapply(J24.df$covariate, function(x){ifelse(grepl("L[RQ]$", x), "01", "02")}, USE.NAMES = FALSE, simplify = TRUE)
J24.df$jhash <- paste(J24.df$jhash, J24.df$J24type, sep = ":")
J24.df$J24type <- NULL
return(rbind(J24.df, not.J24.df))
}
##analysis functions
##function for building dataframes for usage analyses
##data:the dataframe of raw counts
##columns.to.aggregate.by: named list
##for example: alpha.V.cd4 <- aggregateForUsage(alpha.cd4, list(vhash = alpha.cd4$vhash), 1:3)
aggregateForUsage <- function(data, columns.to.aggregate.by, columns.to.omit = 1:4, agg.function = sum){
return(aggregate(data[, -columns.to.omit], by = columns.to.aggregate.by, FUN = agg.function ))
}
#scale to smallest library size
normalize<-function(mat=matrix()){
col.sums<-colSums(mat)
if(prod(col.sums)){
t(t(mat)/col.sums)*min(col.sums)
}
else{
stop("some columns sum to zero")
}
}
#scale to frequencies
normalize2 <- function(mat=matrix()){
col.sums<-colSums(mat)
if(prod(col.sums)){
t(t(mat)/col.sums)
}
else{
stop("some columns sum to zero")
}
}
##This is according to
##http://grokbase.com/t/r/bioconductor/127r11kp23/bioc-edger-and-libsize-normalization
##answer by Mark Robinson
getTMMNormalizedMatrix <- function(M){
require(edgeR)
norm.factors <- calcNormFactors(M, method = "TMM")
lib.sizes <- colSums(M)
t(t(M)/(norm.factors*lib.sizes))
}
##removeRareClones assumes first column contains clone names, and the rest of the columns contain actual numeric data
removeRareClones<-function(m,N){
m[rowSums(m[,-1]>0)>N,]
}
wrapNormalize <- function(DF){
mat <- as.matrix(sapply(DF[, -1], as.numeric))
mat <- normalize(mat)
return(cbind(name = DF[, 1], as.data.frame(mat)))
}
##does not order the result
doLogmodel <- function(DF, hsbc = NULL){
name <- DF$name
DF$name <- NULL
bigmodel <- lm(log(t(DF)+1) ~ hsbc$age + hsbc$gender + hsbc$positive_0301 + hsbc$casecontrol + hsbc$rs1483979)
bigsummary <- summary(bigmodel)
coef <- lapply(bigsummary, function(x)x$coefficients[,4])
eff <- lapply(bigsummary,function(x)x$coefficients[,1])
coefDF <- as.data.frame(coef)
effDF <- as.data.frame(eff)
minpvals <- apply(coefDF,2,min)
coefDF <- rbind(coefDF,minpvals)
names(coefDF) <- name
names(effDF) <- name
coefDF <- rbind(coefDF,effDF)
return(coefDF)
}
##simple log(x+1) transformed model with just case control status as covariate
doSimpleLogmodel <- function(DF, hsbc = NULL){
name <- DF$name
DF$name <- NULL
bigmodel <- lm(log(t(DF)+1) ~ hsbc$casecontrol)
bigsummary <- summary(bigmodel)
coef <- lapply(bigsummary, function(x)x$coefficients[,4])
eff <- lapply(bigsummary,function(x)x$coefficients[,1])
coefDF <- as.data.frame(coef)
effDF <- as.data.frame(eff)
minpvals <- apply(coefDF,2,min)
coefDF <- rbind(coefDF,minpvals)
names(coefDF) <- name
names(effDF) <- name
coefDF <- rbind(coefDF,effDF)
return(coefDF)
}
##same function with casecontrol and celltype as covariates
doSimpleLogmodelwithcelltype <- function(DF, hsbc = NULL){
name <- as.character(DF[, 1])
DF[, 1] <- NULL
bigmodel <- lm(log(t(DF)+1) ~ hsbc$casecontrol + hsbc$cell_type)
bigsummary <- summary(bigmodel)
coef <- lapply(bigsummary, function(x)x$coefficients[,4])
eff <- lapply(bigsummary,function(x)x$coefficients[,1])
coefDF <- as.data.frame(coef)
effDF <- as.data.frame(eff)
minpvals <- apply(coefDF,2,min)
coefDF <- rbind(coefDF,minpvals)
names(coefDF) <- name
names(effDF) <- name
coefDF <- data.frame(name = name, t(rbind(coefDF,effDF)))
names(coefDF)[5] <- "MinP"
return(coefDF)
}
##same function with casecontrol and 301-status as covariates
doSimpleLogmodelwith301status <- function(DF, hsbc = NULL){
name <- as.character(DF[, 1])
DF[, 1] <- NULL
bigmodel <- lm(log(t(DF)+1) ~ hsbc$casecontrol + hsbc$positive0301)
bigsummary <- summary(bigmodel)
coef <- lapply(bigsummary, function(x)x$coefficients[,4])
eff <- lapply(bigsummary,function(x)x$coefficients[,1])
coefDF <- as.data.frame(coef)
effDF <- as.data.frame(eff)
minpvals <- apply(coefDF,2,min)
coefDF <- rbind(coefDF,minpvals)
names(coefDF) <- name
names(effDF) <- name
coefDF <- data.frame(name = name, t(rbind(coefDF,effDF)))
names(coefDF)[5] <- "MinP"
return(coefDF)
}
##calculate standard error of a vector
std <- function(x) sd(x, na.rm =TRUE)/sqrt(length(x[!is.na(x)]))
##function to perform wilcox test for case/control groups according to a character vector. By default perform an unpaired wilcoxon test.
doWilcoxtest<-function(DF = NULL, casecontrol = NULL, paired = FALSE ){
name <- NA
testp <- NA
meancase <- NA
SEMcase <- NA
SEMctrl <- NA
meanctrl <- NA
tDF <- as.data.frame(t(DF[, -1]))
nimet <- as.character(DF[, 1])
for (i in 1:ncol(tDF)) {
testp[i] <- wilcox.test(tDF[, i] ~ casecontrol, paired = paired)$p.value
name[i] <-nimet[i]
meancase[i] <-mean(tDF[which(casecontrol == "case") ,i])
meanctrl[i] <-mean(tDF[which(casecontrol == "control") ,i])
SEMcase[i] <-std(tDF[which(casecontrol == "case") ,i])
SEMctrl[i] <-std(tDF[which(casecontrol == "control") ,i])
}
bound <- as.data.frame(cbind(name, testp, meancase, SEMcase, meanctrl, SEMctrl))
return(bound)
}
##countNonZeroClones function:
##count the number of cases and controls for which the clone count is non-zero
##input: dataframe with names of clones on the first column, and numeric data in the rest of the columns
## character vector indicating cases ("case") and controls ("control")
countNonZeroClones <- function(DF, casecontrol = character()){
case <- which(casecontrol == "case")
control <- which(casecontrol == "control")
case.nonzero <- rowSums(apply(DF[, -1][, case], c(1,2), function(x){x != 0}))
control.nonzero <- rowSums(apply(DF[, -1][, control], c(1,2), function(x){x != 0}))
names(case.nonzero) <- NULL
names(control.nonzero) <- NULL
return(data.frame(casenonzeroN = case.nonzero, ctrlnonzeroN = control.nonzero))
}
##calculateCaseControlMeans function:
##calculate mean counts of cases and controls indicated by character vector in dataframe with names of clones in the first column and numeric
##data in the rest of the columns.
##outputs a two column dataframe with columns case.mean and column.mean containing the numerical mean count values
calculateCaseControlMeans <- function(DF, casecontrol = character()){
case.indices <- which(casecontrol == "case")
control.indices <- which(casecontrol == "control")
case.n <- length(case.indices)
control.n <- length(control.indices)
case.mean <- rowSums(DF[, -1][, case.indices]) / case.n
control.mean <- rowSums(DF[, -1][, control.indices]) / control.n
return(data.frame(case.mean = case.mean, control.mean = control.mean))
}
##buildContingencyTables
##input: numerical vectors of same length x, and y, indicating the number of present condition,
##numbers totalX and totalY indicating the total amounts of subjects
##(cases vs controls for example).
##builds a list (of length length(x)=length(y)) of tables ready-for-use for chisq.test function in {base}
buildContingencyTables <- function(x, y, totalX, totalY){
if ((length(x) != length(y)) | (length(x)*length(y) == 0)){
stop("the input vectors have to be of equal, nonzero, length")
}
result = list()
for (i in 1:length(x)){
xpresent <- x[[i]]
xabsent <- totalX - x[[i]]
ypresent <- y[[i]]
yabsent <- totalY - y[[i]]
t <- as.table(rbind(c(xpresent, xabsent), c(ypresent, yabsent)))
dimnames(t) <- list(cactrl = c("case", "control"), presence = c("present", "absent"))
result[[i]] <- t
}
return(result)
}
##wrapper for analysis functions requires dplyr
buildWilcoxDataFrameWithExtras <- function(DF = data.frame(), casecontrol = character()){
require(dplyr)
result <- data.frame(doWilcoxtest(DF = DF, casecontrol = casecontrol), countNonZeroClones(DF = DF, casecontrol = casecontrol))
n.case <- length(which(casecontrol == "case"))
n.ctrl <- length(which(casecontrol == "control"))
result <- mutate(result, caseminusctrl = casenonzeroN - ctrlnonzeroN)
result <- mutate(result, caseminusctrlpercentage = (casenonzeroN/n.case - ctrlnonzeroN/n.ctrl)*100)
return(result)
}
combineNames <- function(DF = data.frame(), sep = "_"){
DF$chain <- NULL
DF <- data.frame(name = paste(DF[, 1], DF[, 2], DF[, 3], sep = sep), DF[, -c(1:3)], stringsAsFactors = FALSE)
return(DF)
}