## Introduction to Likelihood 2: Fitting an SI model to an HIV epidemic with Maximum Likelihood Estimation
## Clinic on the Meaningful Modeling of Epidemiological Data
## International Clinics on Infectious Disease Dynamics and Data (ICI3D) Program
## African Institute for Mathematical Sciences, Muizenberg, RSA
## (C) Steve Bellan, 2015
######################################################################
## By the end of this tutorial you should…
## * Understand how to simulate cross-sectional prevalence data around a simulated epidemic.
## * Calculate a binomial likelihood from these prevalence data and a fully specified epidemic model.
## * Use R's "optim" function to do multivariate optimization over transformed parameter space.
## * Understand the difference between SANN and Nelder-Mead algorithms
## * Create 95% CIs and contours from the Hessian matrix
## * Create 95% CIs and contours based on the likelihood ratio
require(boot); require(deSolve); require(ellipse);
## Function that makes a list of disease parameters with default values
disease_params <- function(Beta = 0.9 ## transmission coefficient when prevalence is 0
, alpha = 8 ## controls how the transmission coefficient declines with prevalence
, progRt = (1/10)*4 ## rate of progression through each of the 4 I classes, for 10 years total
, birthRt = .03 ## birth rate, 3% of people give birth per year
, deathRt = 1/60 ## 60 year natural life expectancy
){
return(as.list(environment()))
}
disease_params()
disease_params(Beta = .2)
initPrev <- exp(-7) ## initial prevalence: proportion of the population infected at the start
tseqMonth <- seq(1976, 2015, by = 1/12)
init <- c(S=1-initPrev, I1=initPrev, I2=0, I3=0, I4=0, CI = 0, CD = 0) ## modeling proportion of population
Is <- paste0('I',1:4) ## for easy indexing
## Define the SI ODE model. This model is equivalent to the third model in the HIV in Harare tutorial
## (though some parameters may have different names)
SImod <- function(tt, yy, parms) with(c(parms,as.list(yy)), {
## State variables are: S, I1, I2, I3, I4
## derived quantities
I <- I1+I2+I3+I4 ## total infected
N <- I + S ## total population
transmissionCoef <- Beta * exp(-alpha * I/N) ## Infectious contact rate
## state variable derivatives (ODE system)
deriv <- rep(NA,7)
deriv[1] <- birthRt*N - deathRt*S - transmissionCoef*S*I/N ## Instantaneous rate of change: Susceptibles
deriv[2] <- transmissionCoef*S*I/N - progRt*I1 - deathRt*I1 ## Instantaneous rate of change: Infection class I1
deriv[3] <- progRt*I1 - progRt*I2 - deathRt*I2 ## Instantaneous rate of change: Infection class I2
deriv[4] <- progRt*I2 - progRt*I3 - deathRt*I3 ## Instantaneous rate of change: Infection class I3
deriv[5] <- progRt*I3 - progRt*I4 - deathRt*I4 ## Instantaneous rate of change: Infection class I4
deriv[6] <- transmissionCoef*S*I/N ## Instantaneous rate of change: Cumulative incidence
deriv[7] <- progRt*I4 ## Instantaneous rate of change: Cumulative mortality
return(list(deriv))
})
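## Quick sanity check (an illustrative extra, not part of the original lab):
## evaluate the ODE right-hand side once at the initial state. It should
## return a list of 7 finite rates of change, one per state variable.
SImod(0, init, disease_params())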
## Function to run the deterministic model simulation, based on the ODE system defined in SImod().
simEpidemic <- function(init, tseq = tseqMonth, modFunction=SImod, parms = disease_params()) {
simDat <- as.data.frame(lsoda(init, tseq, modFunction, parms=parms))
simDat$I <- rowSums(simDat[, Is])
simDat$N <- rowSums(simDat[, c('S',Is)])
simDat$P <- with(simDat, I/N)
return(simDat)
}
## Function to 'sample' the population:
## From a simulated epidemic, measure prevalence at several time points by drawing
## cross-sectional samples of individuals at each time, testing them, and then calculating sample
## prevalence and associated binomial confidence intervals
sampleEpidemic <- function(simDat # Simulated "data" which we treat as real
, sampleDates = seq(1980, 2010, by = 3) # Sample every 3 years
, numSamp = rep(80, length(sampleDates)) # Number of individuals sampled at each time point
){
prev_at_sample_times <- simDat[simDat$time %in% sampleDates, 'P']
numPos <- rbinom(length(numSamp), numSamp, prev_at_sample_times)
lci <- mapply(function(x,n) binom.test(x,n)$conf.int[1], x = numPos, n = numSamp)
uci <- mapply(function(x,n) binom.test(x,n)$conf.int[2], x = numPos, n = numSamp)
return(data.frame(time = sampleDates, numPos, numSamp, sampPrev = numPos/numSamp,
lci = lci, uci = uci))
}
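## For intuition (illustrative only): binom.test() computes an exact binomial
## CI for a single sample, which is what the mapply() calls above do for every
## sample date. E.g., for 8 positives out of 80 individuals tested:
binom.test(8, 80)$conf.int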
## Run system of ODEs for "true" parameter values
trueParms <- disease_params() # Default model parameters are defined in disease_params() above
simDat <- simEpidemic(init, parms = trueParms) # Simulated epidemic (underlying process)
par(bty='n', lwd = 2)
# Plot simulated prevalence through time:
with(simDat, plot(time, P, xlab = '', ylab = 'prevalence', type = 'l', ylim = c(0,.4), col='red', las = 1))
## Take cross-sectional sample of individuals to get prevalence estimates at multiple time points:
set.seed(1) # Seed the random number generator (so everyone's simulation looks the same)
myDat <- sampleEpidemic(simDat) # Simulate data from the sampling process (function defined above)
points(myDat$time, myDat$sampPrev, col = 'red', pch = 16, cex = 2) # Plot sample prevalence at each time point
arrows(myDat$time, myDat$uci, myDat$time, myDat$lci, col = 'red', len = .025, angle = 90, code = 3) # Plot 95% CIs around the sample prevalences
## To start, we need to write a likelihood function that gives the probability that a given parameter set
## (Beta, alpha values) would generate the observed data. Remember that we are assuming that there is some
## true underlying epidemic curve that is deterministic and the data we observe are only noisy
## because of sampling/observation error (not because the underlying curve is also
## noisy--i.e. process error--which would be particularly likely for epidemics in small populations).
## We assume binomial sampling error. The likelihood is then the probability of
## observing the observed number positive out of each sample, given that the
## true prevalence is the value generated by a model parameterized by a given
## set of parameters. By convention we work with, and return, the negative
## log-likelihood.
nllikelihood <- function(parms = disease_params(), obsDat=myDat) {
simDat <- simEpidemic(init, parms=parms)
## What are the rows from our simulation at which we have observed data?
matchedTimes <- simDat$time %in% obsDat$time
nlls <- -dbinom(obsDat$numPos, obsDat$numSamp, prob = simDat$P[matchedTimes], log = T)
return(sum(nlls))
}
nllikelihood(trueParms) ## negative log-likelihood of the true parameters (which we usually never know)
nllikelihood(disease_params(Beta = 3, alpha = 1)) ## vs some random guessed parameters
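## To see where each term of the likelihood comes from (an illustrative check,
## not part of the fitting), the per-observation term -dbinom(x, n, p, log = TRUE)
## can be written out by hand:
x <- 10; n <- 80; p <- 0.15
-dbinom(x, n, p, log = TRUE)
-(lchoose(n, x) + x*log(p) + (n - x)*log(1 - p)) ## same value
rm(x, n, p)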
## First look up how optim() works. The more you read through the help
## file the easier this will be!!! In particular make sure you understand that
## the first argument of optim must be the initial values of the parameters to
## be fitted (i.e. Beta & alpha) and that any other parameters to be fixed are
## given as additional arguments (in the help file under "...")
## ?optim
## Since we need to be able to easily separate fitted and fixed parameters,
## let's make a function that takes fitted and fixed parameters together and
## puts them back in a parameter list (similar to the output of
## disease_params()). We also want it to be able to take logged parameter values
## and unlog them automatically back into the parameter list, so that we can fit
## on a log scale, but run our model with the unlogged values.
subsParms <- function(fit.params, fixed.params=disease_params())
within(fixed.params, {
loggedParms <- names(fit.params)[grepl('log_', names(fit.params))]
unloggedParms <- names(fit.params)[!grepl('log_', names(fit.params))]
for(nm in unloggedParms) assign(nm, as.numeric(fit.params[nm]))
for(nm in loggedParms) assign(gsub('log_','',nm), exp(as.numeric(fit.params[nm])))
rm(nm, loggedParms, unloggedParms)
})
guess.params <- c(log_Beta = log(5), log_alpha = log(8))
subsParms(guess.params, disease_params())
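## subsParms() also accepts a mix of logged and unlogged names; this purely
## illustrative call sets alpha on the natural scale and Beta on the log scale:
subsParms(c(alpha = 4, log_Beta = log(2)), disease_params())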
## Make likelihood a function of fixed and fitted parameters.
objFXN <- function(fit.params ## parameters to fit
, fixed.params = disease_params() ## fixed parameters
, obsDat=myDat) {
parms <- subsParms(fit.params, fixed.params)
nllikelihood(parms, obsDat = obsDat) ## then call likelihood
}
objFXN(guess.params, disease_params())
## Select initial values for fitted parameters from which optimization routine
## will start. If you select bad initial values the algorithm can get stuck on a
## bad set of parameters. You can always try the true values as a starting point
## for this problem, although that's rarely possible in real problems.
init.pars <- c(log_alpha = log(30), log_Beta = log(.1))
## We will start with SANN optimization since it is stochastic and therefore
## less likely to get stuck in a local minimum, and then finish with
## Nelder-Mead optimization, which is much faster.
### NOTE: for trace > 0 you see progress reports; larger values give more
### detailed updates
trace <- 3
## SANN: This is stochastic, so be CAREFUL -- sometimes it gets stuck at a
## local minimum with unreasonable parameter values. If you see this happen, run it again!
optim.vals <- optim(par = init.pars
, objFXN
, fixed.params = disease_params()
, obsDat = myDat
, control = list(trace = trace, maxit = 150)
, method = "SANN")
exp(unname(optim.vals$par))
trueParms[c('alpha','Beta')]
## We feed the final parameter values from SANN in as the starting values for Nelder-Mead
optim.vals <- optim(par = optim.vals$par
, objFXN
, fixed.params = disease_params()
, obsDat = myDat
, control = list(trace = trace, maxit = 800, reltol = 10^-7)
, method = "Nelder-Mead"
, hessian = T)
optim.vals
MLEfits <- optim.vals$par
trueParms[c('alpha','Beta')]
exp(unname(MLEfits))
log_alpha.fit <- MLEfits["log_alpha"]
log_Beta.fit <- MLEfits["log_Beta"]
## Look at the output of optim() and make sure you understand what it means.
## Did the algorithm converge? See ?optim for details on the convergence codes.
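## A programmatic version of that check (illustrative): optim() returns
## convergence == 0 on success; a nonzero code (e.g. 1 = maxit reached) means
## you should restart the fit from optim.vals$par.
optim.vals$convergence == 0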
## Plot MLE fit time series
par(bty='n', lwd = 2, las = 1)
with(simDat, plot(time, P, xlab = '', ylab = 'prevalence', type = 'l', ylim = c(0,.4), col='red'))
fitDat <- simEpidemic(init, parms = subsParms(optim.vals$par, trueParms))
with(fitDat, lines(time, P, col='blue'))
points(myDat$time, myDat$sampPrev, col = 'red', pch = 16, cex = 2)
arrows(myDat$time, myDat$uci, myDat$time, myDat$lci, col = 'red', len = .025, angle = 90, code = 3)
legend("topleft", c('truth', 'observed', 'fitted'), lty = c(1, NA, 1), pch = c(NA,16, NA),
col = c('red', 'red', 'blue'), bty = 'n')
######################################################################
## Contour plots with the hessian
######################################################################
## The Hessian matrix gives the curvature of the negative log-likelihood at
## the maximum likelihood estimate (MLE) of the fitted parameters: it is the
## matrix of second derivatives around the MLE, also known as the observed
## Fisher information matrix. Inverting the Hessian gives an estimate of the
## variance-covariance matrix of the parameter estimates.
fisherInfMatrix <- solve(optim.vals$hessian) ## invert the Hessian, to estimate the covar-var matrix of parameter estimates
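## From this covariance matrix we can sketch Wald-type 95% CIs for each fitted
## parameter (an illustrative extra, assuming approximate normality of the
## estimator on the log scale): standard errors on the log scale, then
## back-transformed to the natural scale.
ses <- sqrt(diag(fisherInfMatrix))
exp(cbind(lower = MLEfits - 1.96*ses, upper = MLEfits + 1.96*ses))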
## Initialize plot of parameters
plot(1,1, type = 'n', log = 'xy',
## xlim = range(alpha.seq), ylim = range(Beta.seq),
xlim = c(2,15), ylim = c(.5,2),
las = 1,
xlab = expression(alpha), ylab = expression(beta),
main = "-log(likelihood) contours", bty = "n")
## Add true parameter values to the plot
with(trueParms, points(alpha, Beta, pch = 16, cex = 2, col = 'red'))
## Add MLE to the plot
points(exp(MLEfits['log_alpha']), exp(MLEfits['log_Beta']), pch = 16, cex = 2, col = 'black')
## Add 95% contour ellipse from Hessian
lines(exp(ellipse(fisherInfMatrix, centre = MLEfits, level = .95)))
legend("topleft", c('truth', 'MLE', '95% Confidence Region'), lty = c(NA, NA, 1), pch = c(16,16, NA),
col = c('red', 'black', 'black'), bg='white', bty = 'n')
######################################################################
## Contour plots with likelihood profiles
######################################################################
## With all other parameters fixed to their default values, let's look at a
## contour likelihood plot over Beta and alpha. To do this we write wrapper
## functions of log_Beta and log_alpha to feed to outer() and then
## contour(). This is confusing so make sure you understand every function.
## This function simply takes alpha and Beta values on the natural scale, logs
## them, and feeds them into objFXN() above as the fit.params vector; you'll
## see why this is useful below.
objXalpha_Beta <- function(alpha, Beta, fixed.params = disease_params(), browse=F)
objFXN(fit.params = c(log_alpha = log(alpha), log_Beta = log(Beta))
, fixed.params = fixed.params)
## Now instead of giving a single vector of logged parameters we give two
## separate arguments on the untransformed scale.
objFXN(c(log_alpha = log(1/5), log_Beta = log(25)))
objXalpha_Beta(1/5, 25)
## If we try to give this function vectors of alpha and Beta values, however,
## it gets confused and does not return one likelihood per pair.
objXalpha_Beta(c(1/5:8), 25:30)
## So we "Vectorize" this function so it can take in vectors of the parameters
## and return the output. objXalpha_BetaVEC then calls on objXalpha_Beta() to
## take xx,yy as pairwise values and returns objXalpha_Beta() for all pairs
objXalpha_BetaVEC <- Vectorize(objXalpha_Beta, list("alpha", "Beta"))
## Task: Explain how the following four lines are related.
objXalpha_Beta(25, 1/5)
objXalpha_Beta(26, 1/6)
objXalpha_Beta(8, .9)
objXalpha_BetaVEC(c(25:26), c(1/5,1/6))
## Now we use the R function outer() to evaluate objXalpha_BetaVEC() over a grid
## of {alpha, Beta} combinations. This can take a long time because we have to do
## res^2 evaluations of nllikelihood(), and recall that each time we do this we are
## running lsoda() inside simEpidemic()
## Grid resolution is res x res; increasing res makes the contours smoother but takes much longer
res <- 15
## Now create a sequence of alpha values for the grid
alpha.seq <- exp(seq(log_alpha.fit-1, log_alpha.fit+1, l = res))
alpha.seq
## Now create a sequence of Beta values for the grid
Beta.seq <- exp(seq(log_Beta.fit-1, log_Beta.fit + 1, l = res))
Beta.seq
## The function outer() now evaluates objXalpha_BetaVEC on this grid. ?outer
mat <- outer(alpha.seq, Beta.seq, objXalpha_BetaVEC) # this can take a long time
## Make a contour plot showing the 95% confidence region. Likelihood ratio
## test confidence regions use the chi-squared distribution cutoff with 2
## degrees of freedom (2 fitted parameters)
ml.val <- optim.vals$value
conf.cutoff <- ml.val + qchisq(.95,2)/2
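## Illustrative check: how many of the res^2 grid cells fall inside the 95%
## confidence region defined by this cutoff?
sum(mat < conf.cutoff)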
## Show likelihood contours
par(cex = 1.2)
plot(1,1, type = 'n', log = 'xy',
## xlim = range(alpha.seq), ylim = range(Beta.seq),
xlim = c(3,15), ylim = c(.5,2),
xlab = expression(alpha), ylab = expression(beta),
main = "-log(likelihood) contours", bty = "n")
.filled.contour(alpha.seq, Beta.seq, mat, levels = seq(min(mat), max(mat), l=20), col = topo.colors(20))
## Add contour for 95% CI from likelihood ratio
contour(alpha.seq, Beta.seq, mat, levels = c(conf.cutoff),
col = "black", lwd = 2, labels = "", labcex = .2, add = T)
## Add contour for 95% CI from Hessian
lines(exp(ellipse(fisherInfMatrix, centre = MLEfits, level = .95)), lty = 2)
## Add MLE to the plot
points(exp(log_alpha.fit), exp(log_Beta.fit), pch = 16, cex = 1, col = 'black')
## Add true parameter values to the plot
with(trueParms, points(alpha, Beta, pch = 16, cex = 1, col = 'red'))
legend("topleft",
c('truth', 'MLE', '95% contour (profile likelihood)', '95% contour (Fisher information matrix)')
, lty = c(NA, NA, 1, 2), pch = c(16,16, NA, NA),
col = c('red', rep('black',3)), bg='white', bty = 'n')