-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHomework #2
269 lines (201 loc) · 12.2 KB
/
Homework #2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
library(marray)
library(limma)
# Read in the .gpr files (change the R files to .gpr)
dir.path <-"C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\GSE12050_GenePix"
GSE12050 <- read.GenePix(path = dir.path, name.Gf = "F532 Median", name.Gb = "B532 Median", name.Rf = "F635 Median", name.Rb = "B635 Median", name.W = "Flags")
# https://www.computerworld.com/article/2497319/business-intelligence-beginner-s-guide-to-r-syntax-quirks-you-ll-want-to-know.html
# R basics, when using the apply() function, 1=row, 2=column
# Normalize each array using median global, loess, and print-tip-group loess methods. Then plot MvA plots of all 4 arrays comparing no normalization to the other 3 normalization approaches.
# Subject 1
subject.one <- GSE12050[ , 1]
subject.one.Median <- maNorm(subject.one, norm = c("median"))
subject.one.Loess <- maNorm(subject.one, norm = c("loess"))
subject.one.PTL <- maNorm(subject.one, norm = c("printTipLoess"))
# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1))
maPlot(subject.one, main = "Subject 1 - No Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.one.Median, main = "Subject 1 - Global Median Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.one.Loess, main = "Subject 1 - Loess Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.one.PTL, main = "Subject 1 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL)
# Subject 2
subject.two <- GSE12050[ , 2]
subject.two.Median <- maNorm(subject.two, norm = c("median"))
subject.two.Loess <- maNorm(subject.two, norm = c("loess"))
subject.two.PTL <- maNorm(subject.two, norm = c("printTipLoess"))
# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1))
maPlot(subject.two, main = "Subject 2 - No Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.two.Median, main = "Subject 2 - Global Median Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.two.Loess, main = "Subject 2 - Loess Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.two.PTL, main = "Subject 2 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL)
# Subject 3
subject.three <- GSE12050[ , 3]
subject.three.Median <- maNorm(subject.three, norm = c("median"))
subject.three.Loess <- maNorm(subject.three, norm = c("loess"))
subject.three.PTL <- maNorm(subject.three, norm = c("printTipLoess"))
# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1))
maPlot(subject.three, main = "Subject 3 - No Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.three.Median, main = "Subject 3 - Global Median Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.three.Loess, main = "Subject 3 - Loess Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.three.PTL, main = "Subject 3 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL)
# Subject 4
subject.four <- GSE12050[ , 4]
subject.four.Median <- maNorm(subject.four, norm = c("median"))
subject.four.Loess <- maNorm(subject.four, norm = c("loess"))
subject.four.PTL <- maNorm(subject.four, norm = c("printTipLoess"))
# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1))
maPlot(subject.four, main = "Subject 4 - No Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.four.Median, main = "Subject 4 - Global Median Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.four.Loess, main = "Subject 4 - Loess Normalization", lines.func = NULL, legend.func = NULL)
maPlot(subject.four.PTL, main = "Subject 4 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL)
# Plot density plots of the log ratio values for each normalization (and pre normalization) for only array #4. Put them all on the same plot. Make sure to label the axes and provide a legend.
# how to use na.omit https://statisticsglobe.com/na-omit-r-example/
# maM - Object of class "matrix", intensity log-ratios (base 2) M, rows correspond to spotted probe sequences, columns to arrays in the batch
plot(density(na.omit(maM(subject.four))), main = "Log Ratio Values for Each Normalization and Pre-Normalization for Array #4",
ylim=c(0, 1), xlim=c(-7, 9),col="purple")
lines(density(na.omit(maM(subject.four.Loess))), col="red")
lines(density(na.omit(maM(subject.four.Median))), col="blue")
lines(density(na.omit(maM(subject.four.PTL))), col="orange")
leg <- c("No Normalization", "Loess", "Global Median", "Print Tip Group Loess")
legend("topright", legend = leg,
col = c("purple", "red", "blue", "orange"),
lty = 1)
# So, first extract the Cy5 foreground and background values for each of the 4 arrays and subtract the background from the foreground values, then log2 transform these values. Then calculate global median normalization on these 4 arrays using these background subtracted Cy5 values. Hint, you need to use the median of each array to scale, such that after normalization, all arrays will have a median of 1.
# maRf = Red foreground intensities (Cy5), maRb = Red background intensities (Cy5), subtract the 2 intensities and log2 data. apply by column(2)
log2.Cy5 <- log2(GSE12050@maRf - GSE12050@maRb)
apply(GSE12050@maRf - GSE12050@maRb, 2, function(x) sum(x < 0))
# http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/normalizeMedianAbsValues.html
# normalize with global median normalization
norm.log2.Cy5 <- normalizeMedianValues(log2.Cy5)
norm.log2.Cy5
# now all arrays have same median and so to scale just divide by the median, and ignore NA values
scaled.norm.log2.Cy5 <- norm.log2.Cy5/
median(norm.log2.Cy5, na.rm = T)
# check median of each array is 1, apply by column(2)
apply(scaled.norm.log2.Cy5, 2, median, na.rm = T)
# Next calculate a Spearman's rank correlation between all 4 arrays that you normalized in #5 and do the same with the M values from loess normalized data that you generated in #2. Plot a scatter plot matrix for each of the two normalizations (pairs() function), and be sure to label the arrays and title the plot. Print the correlation coefficients to the screen.
library(limma)
colnames(scaled.norm.log2.Cy5) <- make.names(array.names, unique = TRUE)
cor.Cy5 <- cor(scaled.norm.log2.Cy5, method="spearman", use="complete.obs")
cor.Cy5
s.1 <- maM(subject.one.Loess)
s.2 <- maM(subject.two.Loess)
s.3 <- maM(subject.three.Loess)
s.4 <- maM(subject.four.Loess)
loess.data <- cbind(s.1, s.2, s.3, s.4)
array.names <- c("GSM304445", "GSM304446", "GSM304447", "GSM304448")
colnames(loess.data) <- make.names(array.names, unique = TRUE)
cor.loess <- cor(loess.data, method = "spearman", use = "complete.obs")
cor.loess
pairs(cor.Cy5, main = "Scatter Plot of Normalized Data of Cy5", pch=21, col=1,
bg='pink')
print('Spearman Correlation Global Median Scaled log2 Cy5')
cor.Cy5
pairs(cor.loess, main = "Scatter Plot of Normalized Loess Arrays", pch=21, col=1,
bg='purple')
print('Spearman Correlation Normalized Loess Arrays')
cor.loess
# 1. Subtract the foreground - background for each of the 4 chips for only the Cy5 channel. This should all be on the linear or raw scale (no logging yet).
Cy5 <- (GSE12050@maRf - GSE12050@maRb)
Cy5
# 2. Sort each column independently in this new matrix, apply by column(2)
Sort.Cy5 <- apply(Cy5, 2, sort)
Sort.Cy5
# 3. Calculate row means for the sorted matrix
Cy5.mean <- rowMeans(Sort.Cy5, na.rm = T)
Cy5.mean
# 4. Create a new matrix with each row having the same values as the sorted row mean vectors from step #3 (you should have a new R matrix)
Cy5.matrix <- matrix(rep(Cy5.mean, 4), ncol = 4, byrow = F)
Cy5.matrix
# 5. Rank the columns independently on the original background subtracted matrix (from step #1) Hint: use the rank() function with the argument ties="first" or order(), apply by column(2)
Cy5.ranked <- apply(Cy5, 2, function(x) rank(x, ties="first"))
Cy5.ranked
# 6. Reorder the columns in the new matrix from step #4 using the ranks from step #5, apply by column(2)
Cy5.matrix.ranked <- apply(Cy5.matrix, 2, function(x) rank(x, ties = "first"))
Cy5.matrix.ranked
# To verify that each array has the same distribution, use the hist() function to look at various arrays
par(mfrow=c(4,1))
hist(Cy5.matrix[,1])
hist(Cy5.matrix[,2])
hist(Cy5.matrix[,3])
hist(Cy5.matrix[,4])
# Now log (base 2) the new R matrix you created from step 6 (question #7) and calculate a Spearman's rank correlation between the 4 arrays and plot a scatter plot matrix as you did before. Print the correlation coefficients to the screen
log.Cy5.matrix <- log2(Cy5.matrix)
log.Cy5.matrix
cor.Cy5.matrix <- cor(log.Cy5.matrix, method="spearman", use="complete.obs")
cor.Cy5.matrix
pairs(log.Cy5.matrix,
pch=21, col=1, bg='darkgreen',
main='Spearman Correlation Normalized New Matrix Arrays', cex=0.4,
method="spearman")
print('Spearman Correlation Normalized New Matrix Arrays')
cor.Cy5.matrix
# Then change the normalization script from the lecture notes to include the housekeeping genes beta actin, GAPDH, and 18S. Look at the file to make sure the housekeepers are spelled correctly.
# qRT-PCR file formatting and calculation of fold changes
f.parse <- function(path=pa,file=fi,out=out.fi) {
d <- read.table(paste(path,file,sep=""),skip=11,sep=",",header=T)
u <- as.character(unique(d$Name))
u <- u[u!=""]; u <- u[!is.na(u)];
ref <- unique(as.character(d$Name[d$Type=="Reference"]))
u <- unique(c(ref,u))
hg <- c("B-ACTIN","GAPDH","18S")
hg <- toupper(hg)
p <- unique(toupper(as.character(d$Name.1)))
p <- sort(setdiff(p,c("",hg)))
mat <- matrix(0,nrow=length(u),ncol=length(p))
dimnames(mat) <- list(u,p)
for (i in 1:length(u)) {
print(paste(i,": ",u[i],sep=""))
tmp <- d[d$Name %in% u[i],c(1:3,6,9)]
g <- toupper(unique(as.character(tmp$Name.1)))
g <- sort(setdiff(g,c("",hg)))
for (j in 1:length(g)) {
v <- tmp[toupper(as.character(tmp$Name.1)) %in% g[j],5]
v <- v[v!=999]
v <- v[((v/mean(v))<1.5) & ((v/mean(v))>0.67)] #gene j vector
hv3 <- NULL
for (k in 1:length(hg)) { #housekeeping gene vector (each filtered by reps)
hv <- tmp[toupper(as.character(tmp$Name.1)) %in% hg[k],5]
hv <- hv[hv!=999]
hv3 <- c(hv3,hv[((hv/mean(hv))<1.5) & ((hv/mean(hv))>0.67)])
}
sv <- mean(as.numeric(v)) - mean(as.numeric(hv3)) #scaled value for gene j
if(i==1) { #reference sample only
mat[u[i],g[j]] <- sv
next
}
mat[u[i],g[j]] <- sv - mat[u[1],g[j]]
}
}
mat[1,][!is.na(mat[1,])] <- 0
fc <- 2^(-1 * mat)
write.table(t(c("Subject",dimnames(mat)[[2]])),paste(path,out,sep=""),quote=F,sep="\t",col.names=F,row.names=F)
write.table(round(fc,3),paste(path,out,sep=""),quote=F,sep="\t",append=T,col.names=F)
}
# run function
pa <- "C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\"
fi <- "Inflammation_qRT-PCR.csv"
out.fi <- "fold_chg_matrix.txt"
f.parse(pa,fi,out.fi)
# Read the normalized qRT-PCR data matrix into R, using a Spearman's rank correlation, which two patients are most correlated? Plot these two patients against each other in a scatter plot.
pcr <- t(read.table("C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\fold_chg_matrix.txt",
sep = "\t", header = T, row.names = 1))
cor.pcr <- cor(pcr, method = "spearman", use = "complete.obs")
## remove diag of 1 to find index of highest match
## https://stackoverflow.com/questions/12471780/set-diagonal-of-a-matrix-to-zero-in-r
diag(cor.pcr) = NA
# find index
# https://stackoverflow.com/questions/17606906/find-row-and-column-index-of-maximum-value-in-a-matrix
which(cor.pcr == max(cor.pcr, na.rm=T), arr.ind = T)
correlation <- round(max(cor.pcr, na.rm=T), 4)
## get the sample names
highest.correlated <- row.names(which(cor.pcr == max(cor.pcr, na.rm=T),
arr.ind = T))
highest.correlated
## new data x and y
pcr.dat.plot <- pcr[, highest.correlated]
plot(pcr.dat.plot, pch=23, main="Two most correlated samples")
text(3, 55, paste("Spearman correlation = ", correlation), pos=4)
abline(0, 1, col="red")