Julian Faraway 2022-01-06
- Setup
- Data
- Comparison to Infection Survey
- Glasgow only
- Level of Aggregation
- Relationships across Scotland
- Varying Coefficient models
- Lagged Models for Prediction
Set some graphics options. The here package ensures files are found in the expected places.
library(svglite)
library(here)
library(tidyverse)
library(lubridate)
library(mgcv)
library(broom)
Load previously processed data
load(here("data/scot.rda"))
Restrict to one year
startdate = ymd("2020-09-01")
enddate = ymd("2021-08-31")
allscot %>%
filter(between(Date, startdate,enddate)) -> allscot
Merge in population data and create cases per 100,000 population variable. Need to add +1 to deal with zeroes on log scale
left_join(allscot, scotpop, by = c("hab" = "HBName")) %>%
mutate(rollper = (rollcases+1)*1e5/hbpop) %>%
select(Date,hab,N1,cases,hbpop,rollper,rollcases) -> allscot
allhabs = unique(allscot$hab)
Glasgow as the reference level
allscot$hab = fct_relevel(allscot$hab,"Greater Glasgow and Clyde")
Some information about the data
Population and WWTP coverage fraction
scotpop %>% select(HBName,hbpop,wwpop,covfrac) %>%
knitr::kable(digits=2,
col.names = c("Health Area", "Population",
"WWTP Population", "Fraction"))
Health Area | Population | WWTP Population | Fraction |
---|---|---|---|
Ayrshire and Arran | 367990 | 305559 | 0.83 |
Borders | 115240 | 62498 | 0.54 |
Dumfries and Galloway | 148290 | 61560 | 0.42 |
Forth Valley | 305930 | 214959 | 0.70 |
Grampian | 585550 | 372632 | 0.64 |
Highland | 320860 | 98719 | 0.31 |
Lothian | 912490 | 781775 | 0.86 |
Orkney | 22400 | 7757 | 0.35 |
Shetland | 22870 | 7843 | 0.34 |
Western Isles | 26500 | 7247 | 0.27 |
Fife | 374130 | 305019 | 0.82 |
Tayside | 416550 | 268994 | 0.65 |
Greater Glasgow and Clyde | 1185240 | 1368617 | 1.15 |
Lanarkshire | 661960 | 304440 | 0.46 |
WWTP sites information
scotrna %>%
filter(between(Date, startdate,enddate)) %>%
group_by(hab) %>%
summarise(popsize = sum(as.numeric(unique(pop))),
nobs = n(),
nsites = length(unique(site))) %>%
knitr::kable(col.names = c("Health Area", "WW Population",
"WW observations", "Number of sites"))
Health Area | WW Population | WW observations | Number of sites |
---|---|---|---|
Ayrshire and Arran | 305559 | 377 | 9 |
Borders | 62498 | 364 | 9 |
Dumfries and Galloway | 61560 | 330 | 8 |
Fife | 305019 | 635 | 12 |
Forth Valley | 214959 | 442 | 9 |
Grampian | 372632 | 535 | 12 |
Greater Glasgow and Clyde | 1368617 | 501 | 9 |
Highland | 98719 | 342 | 6 |
Lanarkshire | 304440 | 603 | 17 |
Lothian | 781775 | 610 | 12 |
Orkney | 7757 | 99 | 1 |
Shetland | 7843 | 67 | 1 |
Tayside | 268994 | 216 | 4 |
Western Isles | 7247 | 33 | 1 |
isd = read_csv(here(file.path("data","isd.csv")))
pick out and group PH data
scotcc %>% group_by(Date) %>% summarise(cases = sum(DailyPositive)) -> scc
compute weekly rolling sums
scc %>% mutate(wcases = zoo::rollsumr(scc$cases, k=7, fill="extend")) -> scc
join
left_join(isd, scc, by=c("end" = "Date")) -> phinf
ggplot(phinf, aes(x=end, y=wcases/ninf)) + geom_point() +
geom_smooth(color='black') +
ylab("Ratio") + xlab("Date") + xlim(as_date(c("2020-10-31","2021-09-25")))
- Ratio peaks around 0.35
- Underreporting and shedding profiles mean ratio will not be 1
- Ratio is increasing in time which has several possible explanations
Only consider Glasgow (highest population, most measurements)
allscot %>%
filter(hab == "Greater Glasgow and Clyde") -> glasgow
glasgow %>%
ggplot(aes(x=N1+20, y = rollper)) +
geom_point(size=0.5,alpha=0.25) +
scale_x_continuous(name = "RNA gc/l", trans = "log10") +
scale_y_continuous(name = "COVID-19 case rate", trans = "log10") +
geom_smooth(method="lm", color='black')
Looks fine on a log scale but now consider untransformed
glasgow %>%
ggplot(aes(x=N1+20, y = rollper)) +
geom_point(size=0.5,alpha=0.25) +
scale_x_continuous(name = "RNA gc/l") +
scale_y_continuous(name = "COVID-19 case rate") +
geom_smooth(color='black')
Can see non-linearity on the untransformed scale
Fit the a linear model with the variables on a log scale
glasmod = lm(log(rollper) ~ log(N1), glasgow)
summary(glasmod)
Call:
lm(formula = log(rollper) ~ log(N1), data = glasgow)
Residuals:
Min 1Q Median 3Q Max
-1.039 -0.267 -0.073 0.326 1.571
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.2868 0.2790 -15.4 <2e-16
log(N1) 0.6874 0.0261 26.4 <2e-16
Residual standard error: 0.441 on 363 degrees of freedom
Multiple R-squared: 0.657, Adjusted R-squared: 0.656
F-statistic: 695 on 1 and 363 DF, p-value: <2e-16
Create a whole of Scotland dataset
left_join(allscot, scotpop, by = c("hab" = "HBName")) %>%
mutate(wbyn1 = N1*wwpop) %>%
group_by(Date) %>%
summarise(N1 = sum(wbyn1)/sum(scotpop$wwpop),
cases = sum(rollper)*1e5/sum(scotpop$hbpop)) %>%
ungroup() -> aggscot
ggplot(aggscot,aes(N1, cases)) +
geom_point(size=0.5,alpha=0.25) +
scale_x_continuous(name = "RNA gc/l", trans = "log10") +
scale_y_continuous(name = "COVID-19 case rate", trans = "log10") +
geom_smooth(method="lm", color='black')
A stronger relationship is seen compared to the individual areas
Now fit a model
lmodagg = lm(log(cases) ~ log(N1), aggscot)
summary(lmodagg)
Call:
lm(formula = log(cases) ~ log(N1), data = aggscot)
Residuals:
Min 1Q Median 3Q Max
-0.618 -0.186 -0.027 0.197 0.768
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.9881 0.1552 -45.0 <2e-16
log(N1) 0.7744 0.0147 52.5 <2e-16
Residual standard error: 0.266 on 363 degrees of freedom
Multiple R-squared: 0.884, Adjusted R-squared: 0.883
F-statistic: 2.76e+03 on 1 and 363 DF, p-value: <2e-16
exp(predict(lmodagg,data.frame(N1=mean(aggscot$N1)),interval = "prediction"))
fit lwr upr
1 4.4051 2.6097 7.4356
compared to a Glasgow only model
glasmod = lm(log(rollper) ~ log(N1), glasgow)
summary(glasmod)
Call:
lm(formula = log(rollper) ~ log(N1), data = glasgow)
Residuals:
Min 1Q Median 3Q Max
-1.039 -0.267 -0.073 0.326 1.571
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.2868 0.2790 -15.4 <2e-16
log(N1) 0.6874 0.0261 26.4 <2e-16
Residual standard error: 0.441 on 363 degrees of freedom
Multiple R-squared: 0.657, Adjusted R-squared: 0.656
F-statistic: 695 on 1 and 363 DF, p-value: <2e-16
exp(predict(glasmod,data.frame(N1=mean(aggscot$N1)),interval = "prediction"))
fit lwr upr
1 25.34 10.624 60.437
Four selected areas as seen in the paper
selset = c("Borders","Lothian","Lanarkshire","Orkney")
allscot %>%
filter(hab %in% selset) %>%
ggplot(aes(N1,rollper)) + geom_point(alpha=0.25,size=0.25) +
ylab("COVID-19 case rate") + xlab("RNA gc/l") +
facet_wrap(~ hab, scales = "free") +
geom_smooth(se=FALSE,color='black') +
scale_x_continuous(labels = scales::scientific)
Now all of them
allscot %>%
ggplot(aes(N1,rollper)) + geom_point(alpha=0.25,size=0.25) +
ylab("COVID-19 case rate") + xlab("RNA gc/l") +
facet_wrap(~ hab, scales = "free") +
geom_smooth(se=FALSE,color='black') +
scale_x_continuous(labels = scales::scientific) +
theme(axis.text.x = element_text(angle = 30,hjust=1))
Assemble data
scotplus = allscot %>%
filter(between(Date, as.Date("2020-09-01"),as.Date("2021-08-31"))) %>%
mutate(y = log10(rollper),
x = log10(N1 + 20),
timet = as.numeric(Date)-18505,
hab = factor(hab))
Fit a model where each HAB has its own time varying intercept but the slope is in common for all y = αi(t) + β(t)x
gmod = gam(y ~ s(timet,by=hab) + s(timet,by=x), data=scotplus)
summary(gmod)
Family: gaussian
Link function: identity
Formula:
y ~ s(timet, by = hab) + s(timet, by = x)
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.5070 0.0135 37.4 <2e-16
Approximate significance of smooth terms:
edf Ref.df F p-value
s(timet):habGreater Glasgow and Clyde 9.00 9.00 20.4 <2e-16
s(timet):habAyrshire and Arran 9.00 9.00 25.8 <2e-16
s(timet):habBorders 9.00 9.00 36.7 <2e-16
s(timet):habDumfries and Galloway 9.00 9.00 51.2 <2e-16
s(timet):habFife 9.00 9.00 25.9 <2e-16
s(timet):habForth Valley 9.00 9.00 27.8 <2e-16
s(timet):habGrampian 9.00 9.00 31.4 <2e-16
s(timet):habHighland 8.99 9.00 32.7 <2e-16
s(timet):habLanarkshire 8.99 9.00 25.2 <2e-16
s(timet):habLothian 8.95 9.00 19.3 <2e-16
s(timet):habOrkney 9.00 9.00 18.6 <2e-16
s(timet):habShetland 8.98 9.00 28.9 <2e-16
s(timet):habTayside 8.86 8.99 23.4 <2e-16
s(timet):habWestern Isles 9.00 9.00 31.7 <2e-16
s(timet):x 10.00 10.00 274.3 <2e-16
R-sq.(adj) = 0.811 Deviance explained = 81.6%
GCV = 0.03523 Scale est. = 0.034288 n = 5110
Plot the coefficient functions, gray lines at Jan1, Apr1 and Jul1
par(mgp=c(1.5,0.5,0), mar=c(3.1,3.1,3.1,0), pch=20, mfrow=c(2,2))
for(i in 1:4){
pgam = plot(gmod, rug=FALSE, xlab="Time", ylab='a(t)',
main=allhabs[i], select=i)
abline(v=c(90,180,270)+30,col=gray(0.75))
}
for(i in 5:8){
plot(gmod, rug=FALSE, xlab="Time", ylab='a(t)',
main=allhabs[i], select=i)
abline(v=c(90,180,270)+30,col=gray(0.75))
}
for(i in 9:12){
plot(gmod, rug=FALSE, xlab="Time", ylab='a(t)',
main=allhabs[i], select=i)
abline(v=c(90,180,270)+30,col=gray(0.75))
}
par(mfrow=c(1,2))
for(i in 13:14){
plot(gmod, rug=FALSE, xlab="Time", ylab='a(t)',
main=allhabs[i], select=i)
abline(v=c(90,180,270)+30,col=gray(0.75))
}
par(mfrow=c(1,1))
- Most HABs have a similar shape
- Number of WWTPs sampled increased from around 30 to 90 early in 2021
plot(gmod, rug=FALSE, xlab="Time", ylab='b(t)',
select=15,scale=0)
abline(v=c(90,180,270)+30,col=gray(0.75))
Improvement on slope over time but not reaching the desired target of one. Maybe lab procedures are improving
Generate nicer plots
pgam = plot(gmod, pages = 1)
pgdf = data.frame(x=unlist(map(pgam,"x")),
fit=unlist(map(pgam,"fit")),
se=unlist(map(pgam,"se")),
hab = rep(c(allhabs,"beta"),each=100))
pgdf$ub = pgdf$fit+2*pgdf$se
pgdf$lb = pgdf$fit-2*pgdf$se
pgdf %>% filter(hab %in% allhabs[1:4]) %>%
ggplot(aes(x=x)) +
geom_ribbon(aes(ymin = lb, ymax = ub),fill="gray90") +
geom_line(aes(y=fit)) +
facet_wrap(~ hab) +
ylab("a(t)") +
scale_x_continuous(name="Date",breaks=c(90,180,270),
labels=c("Jan1","Apr1","Jul1"))
pgdf %>% filter(hab == "beta") %>%
ggplot(aes(x=x)) +
geom_ribbon(aes(ymin = lb, ymax = ub),fill="gray90") +
geom_line(aes(y=fit)) +
ylab("b(t)") +
scale_x_continuous(name="Date",breaks=c(90,180,270),
labels=c("Jan1","Apr1","Jul1"))
scotrna %>% mutate(Mon = floor_date(Date, "month")) %>%
group_by(Mon) %>% count() %>%
ggplot(aes(Mon,n)) + geom_step() +
xlab("Date") + ylab("Samples per Month")
Can we predict the future? Try lagging the N1 variable:
timewindow = 7
ldf = tibble(lag = 0:timewindow, r2 = rep(NA, timewindow+1),
se = rep(NA,timewindow+1))
for(i in 0:timewindow){
allscot %>% group_by(hab) %>% mutate(N1lag = lag(N1,i)) %>% ungroup() -> scotlag
lmodlag = lm(log(rollper) ~ log(N1lag+20)*hab, scotlag)
gd = glance(lmodlag)
ldf$r2[i+1] = gd$r.squared
ldf$se[i+1] = gd$sigma
}
ldf %>% knitr::kable(digits = 3)
lag | r2 | se |
---|---|---|
0 | 0.715 | 0.526 |
1 | 0.702 | 0.536 |
2 | 0.687 | 0.548 |
3 | 0.671 | 0.561 |
4 | 0.653 | 0.575 |
5 | 0.634 | 0.589 |
6 | 0.615 | 0.603 |
7 | 0.596 | 0.617 |
We can see that the fit of the model decreases as the lag is increased while the standard error increases. While we might obtain better predictions by using more than one lagged variable, this does not indicate any strong precognitive ability for N1.
Consider trying to predict a week ahead:
allscot %>% group_by(hab) %>% mutate(rollperlag = lag(rollper,7)) %>%
ungroup() -> scotlag
lmodlag7 = lm(log(rollper) ~ log(rollperlag)*hab, scotlag)
glance(lmodlag7)[c('r.squared','sigma')]
# A tibble: 1 × 2
r.squared sigma
<dbl> <dbl>
1 0.845 0.383
This model predicts a week ahead using current cases. We can see this works better than using N1. Now suppose we have the current N1 count and we add this information to the model:
allscot %>% group_by(hab) %>%
mutate(rollperlag = lag(rollper,7),
N1lag7 = lag(N1,7)) %>%
ungroup() -> scotlag
lmodlagcase7 = lm(log(rollper) ~ (log(rollperlag) + log(N1lag7+20))*hab, scotlag)
glance(lmodlagcase7)[c('r.squared','sigma')]
# A tibble: 1 × 2
r.squared sigma
<dbl> <dbl>
1 0.851 0.375
We can see this is only fractionally better than using cases alone i.e N1