-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLab 6
188 lines (140 loc) · 8.01 KB
/
Lab 6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
dat <- read.table("C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\agingStudy11FCortexAffy.txt", header=T, row.names = 1)
ann <- read.table("C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\agingStudy1FCortexAffyAnn.txt", header=T, row.names = 1)
dimnames(dat)
dim(dat)
dim(ann)
dimnames(ann)
# Prepare 2 separate vectors for comparison. The first is a comparison between male and female patients. The current data frame can be left alone for this, since the males and females are all grouped together. The second vector is comparison between patients >= 50 years of age and those < 50 years of age. To do this, you must use the annotation file and logical operators to isolate the correct arrays/samples.
# Assign the original data set w/o manipulation - already in Male v. Female order.
gender <- dat
gender
# Assign the age groups w/ sort.list using column 2, will sort in ascending order.
age <- ann[sort.list(ann[, 2]), ]
age
# Create the sorted data column names
age.sort <- paste(dimnames(age)[[1]])
age.sort
# Assign the newly sorted column names by age to original data set.
dat.age <- dat[, age.sort]
dat.age
# Run the t.test function from the notes using the first gene vector below for the gender comparison. Then use the second gene vector below for the age comparison.
# gender comparison gene vector
g.g <- c(1394, 1474, 1917, 2099, 2367, 2428, 2625, 3168, 3181, 3641, 3832, 4526,
4731, 4863, 6062, 6356, 6684, 6787, 6900, 7223, 7244, 7299, 8086, 8652,
8959, 9073, 9145, 9389, 10219, 11238, 11669, 11674, 11793)
# age comparison gene vector
g.a <- c(25, 302, 1847, 2324, 246, 2757, 3222, 3675, 4429, 4430, 4912, 5640, 5835, 5856, 6803, 7229, 7833, 8133, 8579, 8822, 8994, 10101, 11433, 12039, 12353,
12404, 12442, 67, 88, 100)
t.test.all.genes <- function(x,s1,s2) {
x1 <- x[s1]
x2 <- x[s2]
x1 <- as.numeric(x1)
x2 <- as.numeric(x2)
t.out <- t.test(x1,x2, alternative="two.sided",var.equal=T)
out <- as.numeric(t.out$p.value)
return(out)
}
# Assign the gender vector to the t.test function. Gender data set and the subest of gender comparison. Assign s1 to column 1 and M, assign s2 to column 1 and to F
rawp.gender <- apply(gender[g.g, ], 1, t.test.all.genes, s1= ann=="M",s2= ann=="F")
rawp.gender
rawp.age <- apply(dat.age[g.a, ], 1, t.test.all.genes, s1= age[,2] <50 , s2= age[,2] >50)
# Using these p-values, use either p.adjust in the base library or mt.rawp2adjp in the multtest library to adjust the values for multiple corrections with the Holm's method.
# apply multiple test correction using non-permuted methods
library(base)
p.g <- c(rawp.gender)
p.gender <- p.adjust(p.g,method="holm")
p.gender
p.a <- c(rawp.age)
p.age <- p.adjust(p.a,method="holm")
p.age
# Sort the adjusted p-values and non-adjusted p-values and plot them vs. the x-axis of numbers for each comparison data set. Make sure that the two lines are different colors. Also make sure that the p-values are sorted before plotting.
# Gender
par(mfrow=c(2,1))
rawp.gender.sort <- sort(rawp.gender)
p.gender.sort <- sort(p.gender)
gender1 <- as.matrix(rawp.gender.sort)
gender2 <- as.matrix(p.gender.sort)
a.gender <- cbind(gender1, gender2)
colnames(a.gender) <- c("Non-adjusted p-values", "Adjusted p-values")
matplot(a.gender, type = "b", pch = 2, col = 2:3, main = "Sorted Gender: Adjusted vs Non-Adjusted p-values", ylab = "p-values")
legend("topleft", legend = colnames(a.gender), pch = 2, col = 2:3)
# Age
rawp.age.sort <- sort(rawp.age)
p.age.sort <- sort(p.age)
age1 <- as.matrix(rawp.age.sort)
age2 <- as.matrix(p.age.sort)
a.age <- cbind(age1, age2)
colnames(a.age) <- c("Non-adjusted p-values", "Adjusted p-values")
matplot(a.age, type = "b", pch = 2, col = 6:4, main = "Sorted Age: Adjusted vs Non-Adjusted p-values", ylab = "p-values")
legend("topleft", legend = colnames(a.gender), pch = 2, col = 6:4)
# Repeat #4 and #5 with the Bonferroni method.
p.g <- c(rawp.gender)
b.gender <- p.adjust(p.g,method="bonferroni")
b.gender
p.a <- c(rawp.age)
b.age <- p.adjust(p.a,method="bonferroni")
b.age
# Gender
par(mfrow=c(2,1))
rawp.gender.sort <- sort(rawp.gender)
b.gender.sort <- sort(b.gender)
gender1 <- as.matrix(rawp.gender.sort)
gender2 <- as.matrix(b.gender.sort)
a.gender <- cbind(gender1, gender2)
colnames(a.gender) <- c("Non-adjusted p-values", "Adjusted p-values")
matplot(a.gender, type = "b", pch = 2, col = 7:2, main = "Sorted Gender: Adjusted vs Non-Adjusted p-values", ylab = "p-values")
legend("topleft", legend = colnames(a.gender), pch = 2, col = 7:2)
# Age
rawp.age.sort <- sort(rawp.age)
b.age.sort <- sort(b.gender)
age1 <- as.matrix(rawp.age.sort)
age2 <- as.matrix(b.age.sort)
a.age <- cbind(age1, age2)
colnames(a.age) <- c("Non-adjusted p-values", "Adjusted p-values")
matplot(a.age, type = "b", pch = 2, col = 8:5, main = "Sorted Age: Adjusted vs Non-Adjusted p-values", ylab = "p-values")
legend("topleft", legend = colnames(a.gender), pch = 2, col = 8:5)
# Read in the log2 normalized fragments per kb per million mapped reads (FPKM) data matrix and annotation files
FPKM <- read.table("C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\tcga_brca_fpkm.txt", header=T, row.names = 1)
f.ann <- read.table("C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\tcga_brca_fpkm_sam.txt", sep="\t")
f.ann
FPKM
# Use grep to subset the data matrix only by gene 'GATA3' and make sure to cast this vector to numeric.
# as.numeric to cast the vector to numeric
# grep() command will return all lines from the input file that yield a match for the regular expression
# we want the row names in data
# as.numeric(FPKM[grep('pattern we are looking for', rownames(data)'specify rows'), ])
# the second comma specifies we are looking at the rows of data
g.FPKM <- as.numeric(FPKM[grep('GATA3', rownames(FPKM)), ])
g.FPKM
# Create a binary (1/0) vector for the patients where the upper 25% expression of GATA3 is coded as 1 and all other patients are coded as 0. Call this new variable 'group'.
group <- +(g.FPKM >= quantile(g.FPKM, 0.75))
# Create a data matrix with the 'group' variable you created in #9 and the remaining variables in the annotation file.
group.df<- data.frame(V1 = g.FPKM, group)
group.df
# Run a Kaplan-Meier (KM) analysis to determine if a difference in survival experience exists between the two GATA3 expression groups using the survdiff function. Extract the p-value from the chi squared test output.
library(survival)
library(splines)
library(dplyr)
sdf <- survdiff(Surv(time, status)~ group, data = group.df)
sdf
p.value.km <- signif(pchisq(sdf$chisq, length(sdf$n) - 1, lower.tail = FALSE), 4)
p.value.km
# Now run a Cox proportion hazard (PH) regression model on just the grouping variable (i.e. no other covariates) and extract both the p-value and hazard ratio from the output.
cox.ph <- coxph(Surv(time, status)~ group, data = group.df)
summary(cox.ph)
p.value.cox <- signif(summary(cox.ph)$coefficients[, "Pr(>|z|)"], 4)
p.value.cox
HR.cox <- signif(summary(cox.ph)$coefficients[, "exp(coef)"], 2)
HR.cox
# Run the survfit() function only on the grouping variable (i.e. no other covariates) and plot the KM curves, being sure to label the two groups with a legend, two different colors for each line, and provide the KM p-value, Cox PH p-value, Cox PH hazard ratio, and sample sizes all in each of the two groups all on the plot.
sfit <- survfit(surv ~ group, data = group.df)
sfit
plot(sfit, main = "Kaplan-Meier Curves for TCGA GATA3 Study", lty = 2:2, xlab="Months",ylab="S(t)", col=c("purple", "orange"), lwd=2)
n.low.sfit <- sfit$n[1]
n.low.sfit
n.high.sfit <- sfit$n[2]
n.high.sfit
legend("topright", c(paste("Lower 75% GATA3 n=", n.low.sfit), paste("Upper 25% GATA3 n=", n.high.sfit)), lty = 2:2, col=c("purple", "orange"), lwd=2, y.intersp=2)
text(x=30, y=0.8, labels=paste("KM p-value: ", p.value.km, "Cox PH p-value: ",
p.value.cox, "Cox PH hazard ratio: ",
HR.cox), pos=4, col = "darkgreen")