forked from ICI3D/RTutorials
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathICI3D_Lab5a_introLikelihoodPlusRejectionP.R
251 lines (206 loc) · 9.86 KB
/
ICI3D_Lab5a_introLikelihoodPlusRejectionP.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
## Introduction to Likelihood
## Meaningful Modeling of Epidemiolgic Data, 2012
## (C) Steve Bellan, 2009-2015
## The code is made available under a Creative Commons Attribution 4.0 International License. You
## are free to reuse this code provided that you give appropriate credit, provide a link to the
## license, and indicate if changes were made. You may do so in any reasonable manner, but not in
## any way that suggests the licensor endorses you or your use. Giving appropriate credit includes
## citation of the original repository.
## Set margins for plotting.
par(mar=c(6,4,4,2))
## Say we are randomly sampling from a population of people in a town
## of size 1,000,000 and we sample 100 people from that population.
size <- 100
## Say that the true HIV prevalence in that population of size 1,000,000
## is 30%, i.e. 300,000 are HIV positive.
true.prev <- .3
## Sample from this distribution once
## samp.pos <- rbinom(1,100,.3)
## You can use the above code to sample from this distribution, but
## we'll set it at 28 just so everyone is working on the same example
## for now.
samp.pos <- 28
samp.prev <- samp.pos/size
samp.pos
samp.prev
## Create a color vector for every possible number of HIV+ people we
## could have found when sampling 100 people (0:100 means 101
## possibilities). Then make the value we chose have a different
## color.
color.vec <- rep("black",size+1)
color.vec[samp.pos+1] <- "purple"
### Create a histogram of the binomial distribution function with the
### true prevalence, highlighting the value we sampled.
barplot.obj <- barplot(dbinom(0:size,size=size, prob=true.prev),
xlab="number HIV+", names.arg=c(0:size),
ylab="probability", col=color.vec, border = NA, space =0)
## Given that we found samp.pos
samp.pos
## positives, would we accept or reject the hypothesis that the true prevalence is .2?
potential.prev <- .2
## To do this we use a two-tailed test. Meaning we are asking "If the
## true prevalence was .2, what is the probability of finding 28 HIV+
## people or another value as extreme when sampling 100 people?
## Now calculate the total probability of finding a value in the
## extreme lower tail plus the extreme upper tail.
if(samp.pos < potential.prev*size)
{
p.val.one.tail <- pbinom(samp.pos, size, potential.prev, lower.tail = TRUE)
}else{
p.val.one.tail <- pbinom(samp.pos - 1, size, potential.prev, lower.tail = FALSE)
}
## Now multiply times 2, because we could have gotten reults that
## extreme in the other direction too. Make sure you understand what
## this means.
p.val <- p.val.one.tail*2
## Calculate the probability of getting every single possible value
## from the binomial distribtion with this hypothesized prevalence.
prob.dist <- dbinom(0:size, size=size, prob=potential.prev)
ymax <- .2
color.vec <- rep("grey",size)
## Create a logical vector for which values are more extreme
if(samp.pos < potential.prev*size)
{
more.extreme <- 0:100 <= samp.pos
}else{
more.extreme <- 0:100 >= samp.pos
}
## Now create a color vector that's grey for all possible sampleable
## values (0-101) but red for the more extreme values.
color.vec <- rep("grey",size+1)
color.vec[more.extreme] <- "red"
## Plot the binomial distribution for N=100, and p=.2, and show the values more extreme than 28 in red
barplot(prob.dist,
xlab = "number HIV+",
names.arg = c(0:size),
ylab = "probability",
col = color.vec ,
border = NA,
main = paste("hypothetical prevalence:",potential.prev*100,"%"),
ylim = c(0,ymax),
space = 0)
## Remember this doubles the area in the red tail!
text(80, .75*ymax, paste("p-value =", signif(p.val,3)))
## Task 1: Edit the above code for the theoretical null hypotheses
## that the prevalences are .15,.25,.3,.35,.4, or .5
## Task 2: Repeat the above but change samp.pos to be 38 (i.e. pretend
## 38/100 tested HIV+)
######################################################################
## Now we're going to plot the above results on a single plot by
## showing the p-value for each null hypothesis.
######################################################################
## Create a vector of potential prevalences spanning 0-1 with 1000 values
## These are all potential "null hypotheses".
potential.prev.vector <- seq(0,1, length=10000)
## Create an empty vector to fill for the p.value at each
## corresponding prevalence value.
p.val <- rep(NA, length(potential.prev.vector))
## Now loop thrugh all the hypothesized prevalences, calculate the
## p-value, and store in our p.val vector.
for(jj in 1:length(potential.prev.vector))
{
## Now calculate the total probability of finding a value in the
## extreme lower tail plus the extreme upper tail.
if(samp.pos < potential.prev.vector[jj]*size)
{
p.val.one.tail <- pbinom(samp.pos, size, potential.prev.vector[jj], lower.tail = TRUE)
}else{
p.val.one.tail <- pbinom(samp.pos - 1, size, potential.prev.vector[jj], lower.tail = FALSE)
}
## Now multiply times 2, because we could have gotten reults that
## extreme in the other direction too. Make sure you understand what
## this means.
p.val[jj] <- p.val.one.tail*2
}
## Which of these null hypothesized prevalences are not rejected at
## the alpha = .05 cutoff?
non.rejected.nulls <- potential.prev.vector[p.val > .05]
ci <- range(non.rejected.nulls)
## Compare this to the CI given by the R function binom.test.
ci
binom.test(28,100,.2, alternative = "two.sided")
## Plot the p-values vs the null hypotheses.
plot(potential.prev.vector, p.val, xlab = "hypothetical prevalence (null hypothesis)", ylab = "p-value",
lwd = 2, bty = "n", type = "l", cex.lab = 1, col = "red")
## Draw a line at the alpha = .05 cutoff.
segments(0, .05, 1, .05, lty = 2, lwd = 3)
## Now plot the CI bounds
ci.l <- round(ci[1],3)
ci.u <- round(ci[2],3)
arrows(ci.l, .4, ci.l, -.03, length = .2)
arrows(ci.u, .4, ci.u, -.03, length = .2)
text(ci.l,.4, ci.l,cex = 1,pos = 3)
text(ci.u,.4, ci.u,cex = 1,pos = 3)
mtext(paste("95% CI includes HIV prevalences of ", ci.l*100, "% to ", ci.u*100, "%", sep=""), side = 3, line =0)
## Task 3: Write down in words what this plot is showing.
######################################################################
## Now let's do the same analysis but using Maximum Likelihood approach.
######################################################################
## The likelihood of a prevalence is the probability of observing the data given the
## hypothesized prevalence:
## L(prevalence | data) = p(data | prevalence)
likelihoods <- dbinom(samp.pos, size, potential.prev.vector)
plot(potential.prev.vector, likelihoods, col = "purple", type = "l", lwd = 2,
xlab = "potential HIV prevalences", ylab = "likelihood",
main = "p(our data given prevalence) = LIKELIHOOD",
bty = "n")
######################################################################
## Let's look at the negative log-likelihood which is what people
## traditionally use because it's usually easier to calculate (though
## for this example R can do all the work computationally quite
## easily).
plot(potential.prev.vector, -log(likelihoods), type = "l", col = "purple", col.main = "white",
bty = "n", lwd = 3, xlab = "potential prevalences (our models)",
ylab = "-log(likelihood)", main = "we usually minimize the -log(likelihood)")
## Now recall that we can use the Chi Squared Likelihood Ratio
## Test. If the logL of a given prevalence is within the chi-squared
## alpha=.05 cutoff of the minimum logL, then we do not reject it.
chisq.crit <- qchisq(.95, df = 1)
chisq.crit
min.l <- min(-log(likelihoods))
abline(h = min.l + chisq.crit/2, lwd = 3, lty = 2)
######################################################################
## It's hard to see this so let's zoom in on last plot for values of
## the potential prevalence between .15 and .5.
## Let's also make a 2 panel plot so we can compare this with the
## P-Value approach from above.
par(mfrow=c(2,1), mar = c(6,4,4,4))
xlim <- c(.15, .5)
zoom.plot.index <- potential.prev.vector>.15 & potential.prev.vector<.5
plot(potential.prev.vector[zoom.plot.index], -log(likelihoods[zoom.plot.index]),
type = "l", col = "purple", col.main = "white", xlim = xlim,
bty = "n", lwd = 3, xlab = "potential prevalences (our models)",
ylab = "-log(likelihood)", main = "we usually minimize the -log(likelihood)")
chisq.crit <- qchisq(.95, df = 1)
chisq.crit
min.l <- min(-log(likelihoods))
ci.likelihood <- range(potential.prev.vector[-log(likelihoods) < min.l + chisq.crit/2])
abline(h = min.l + chisq.crit/2, lwd = 3, lty = 2)
## add title to the plot
ci.l <- signif(ci.likelihood[1],3)
ci.u <- signif(ci.likelihood[2],3)
mtext(paste("95% CI includes HIV prevalences of ", ci.l*100, "% to ", ci.u*100, "%", sep=""), side = 3, line =0)
arrows(ci.l, min.l + 4, ci.l, min.l, length = .2)
arrows(ci.u, min.l + 4, ci.u, min.l, length = .2)
text(ci.l, min.l + 4, ci.l,cex = 1,pos = 3)
text(ci.u, min.l + 4, ci.u,cex = 1,pos = 3)
## Redraw the P-Value plot from above
## Plot the p-values vs the null hypotheses.
plot(potential.prev.vector, p.val, xlab = "hypothetical prevalence (null hypothesis)", ylab = "p-value",
xlim = xlim, lwd = 2, bty = "n", type = "l", cex.lab = 1, col = "red")
## Draw a line at the alpha = .05 cutoff.
segments(0, .05, 1, .05, lty = 2, lwd = 3)
## Now plot the CI bounds
ci.l <- round(ci[1],3)
ci.u <- round(ci[2],3)
arrows(ci.l, .4, ci.l, -.03, length = .2)
arrows(ci.u, .4, ci.u, -.03, length = .2)
text(ci.l,.4, ci.l,cex = 1,pos = 3)
text(ci.u,.4, ci.u,cex = 1,pos = 3)
mtext(paste("95% CI includes HIV prevalences of ", ci.l*100, "% to ", ci.u*100, "%", sep=""), side = 3, line =0)
## Compare confidence intervals constructed from cumulative
## probability distribution (i.e. p-values) and from the Likelihood
## Ratio Test.
ci
ci.likelihood
## Task 4: Explain in your own words how the Likelihood construction of p-values