Skip to content

Latest commit

 

History

History
442 lines (365 loc) · 14 KB

usamodel.md

File metadata and controls

442 lines (365 loc) · 14 KB

US RNA and Covid case modelling

Julian Faraway 2022-01-06

Setup

library(svglite)
library(here)
library(dplyr)
library(ggplot2)
library(purrr)
library(lubridate)
library(mgcv)
library(broom)

load(here("data/usa.rda"))

Plot of all counties

usa %>% ggplot(aes(N1+1,cases)) + 
  geom_point(size=0.5) +
  scale_x_continuous(name="N1",trans="log10") +
  scale_y_continuous(name= "Cases", trans="log10") +
  facet_wrap(~ county) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        axis.text.y = element_text(size = 6))

Correlations between N1 and incidence

usa %>% 
  group_by(county) %>% 
  summarise(incidence = cor(log(N1+1),log(cases+1),use = "complete.obs")) %>% 
  knitr::kable(digits=3)
county incidence
Arapahoe 0.800
Berkshire 0.886
Dauphin 0.917
Elko 0.885
Erie 0.944
Essex 0.908
Fairfield 0.714
Franklin 0.890
Hamilton 0.758
Hampshire 0.846
Hennepin 0.763
Indiana 0.888
Jackson 0.832
Jefferson 0.828
Lake 0.852
Miami-Dade 0.618
Middlesex 0.872
Monterey 0.394
Multnomah 0.865
Nantucket 0.811
Nassau 0.887
New Castle 0.766
Peoria 0.801
Placer 0.907
Sacramento 0.930
Shelby 0.885
Stafford 0.800
Steuben 0.776
Suffolk 0.875
Union 0.730

Indiana county as seen in the paper

ssite = "Indiana"
usa %>% filter(county == ssite) %>% 
  ggplot(aes(N1+1,cases+1)) + geom_point() + 
  geom_smooth(method="lm",color='black') +
  xlab("RNA gc/l") + ylab("Case rate") +
  scale_y_log10() + scale_x_log10() 

usa %>% filter(county == ssite) %>% 
  ggplot(aes(N1,cases)) + geom_point() + 
  geom_smooth(color='black') +
  xlab("RNA gc/l") + ylab("Case rate") 

Models

Linear fit in each county on log scales

Need +1 for zero values

lmodpop = lm(log(cases+1) ~ log(N1 + 1) * county, usa)
anova(lmodpop)
Analysis of Variance Table

Response: log(cases + 1)
                     Df Sum Sq Mean Sq F value Pr(>F)
log(N1 + 1)           1   1141    1141 3313.36 <2e-16
county               29    145       5   14.52 <2e-16
log(N1 + 1):county   29     77       3    7.71 <2e-16
Residuals          1770    609       0               
summary(lmodpop)
Call:
lm(formula = log(cases + 1) ~ log(N1 + 1) * county, data = usa)

Residuals:
   Min     1Q Median     3Q    Max 
-3.990 -0.296  0.058  0.351  2.323 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)
(Intercept)                  -0.077046   0.323772   -0.24  0.81194
log(N1 + 1)                   0.592698   0.063620    9.32  < 2e-16
countyBerkshire               0.051980   0.404536    0.13  0.89777
countyDauphin                -0.923651   0.453149   -2.04  0.04167
countyElko                   -0.123736   0.395868   -0.31  0.75465
countyErie                   -1.232861   0.411549   -3.00  0.00278
countyEssex                   1.152843   0.360994    3.19  0.00143
countyFairfield               0.297945   0.399179    0.75  0.45553
countyFranklin               -2.339375   0.708846   -3.30  0.00099
countyHamilton                0.243637   0.472592    0.52  0.60624
countyHampshire               0.099549   0.390690    0.25  0.79891
countyHennepin               -1.151158   0.588428   -1.96  0.05058
countyIndiana                -0.260582   0.371997   -0.70  0.48371
countyJackson                -1.423197   0.923196   -1.54  0.12335
countyJefferson              -1.320227   0.501756   -2.63  0.00858
countyLake                    0.256116   0.495211    0.52  0.60509
countyMiami-Dade             -1.489844   0.539579   -2.76  0.00582
countyMiddlesex              -0.176230   0.434482   -0.41  0.68508
countyMonterey                1.695227   0.355187    4.77    2e-06
countyMultnomah              -2.113763   0.747049   -2.83  0.00471
countyNantucket               0.109611   0.369613    0.30  0.76684
countyNassau                  1.244345   0.370729    3.36  0.00081
countyNew Castle             -0.265214   0.524090   -0.51  0.61289
countyPeoria                 -1.738834   0.507622   -3.43  0.00063
countyPlacer                  0.241118   0.366005    0.66  0.51012
countySacramento             -1.224135   0.677558   -1.81  0.07098
countyShelby                 -0.154421   0.528347   -0.29  0.77011
countyStafford               -1.060720   0.567356   -1.87  0.06171
countySteuben                -0.096502   0.426880   -0.23  0.82118
countySuffolk                -0.776878   0.419946   -1.85  0.06449
countyUnion                  -0.496597   0.453986   -1.09  0.27417
log(N1 + 1):countyBerkshire  -0.033054   0.080008   -0.41  0.67956
log(N1 + 1):countyDauphin     0.091221   0.084479    1.08  0.28038
log(N1 + 1):countyElko        0.037133   0.077037    0.48  0.62985
log(N1 + 1):countyErie        0.171256   0.079466    2.16  0.03129
log(N1 + 1):countyEssex      -0.123663   0.073324   -1.69  0.09187
log(N1 + 1):countyFairfield  -0.084923   0.079095   -1.07  0.28311
log(N1 + 1):countyFranklin    0.349172   0.128089    2.73  0.00647
log(N1 + 1):countyHamilton   -0.083750   0.087079   -0.96  0.33630
log(N1 + 1):countyHampshire  -0.086753   0.078212   -1.11  0.26749
log(N1 + 1):countyHennepin    0.183659   0.114557    1.60  0.10907
log(N1 + 1):countyIndiana    -0.008132   0.072636   -0.11  0.91087
log(N1 + 1):countyJackson     0.129514   0.161465    0.80  0.42259
log(N1 + 1):countyJefferson   0.235530   0.095873    2.46  0.01412
log(N1 + 1):countyLake       -0.069151   0.093074   -0.74  0.45760
log(N1 + 1):countyMiami-Dade  0.321494   0.102826    3.13  0.00180
log(N1 + 1):countyMiddlesex   0.088381   0.089574    0.99  0.32394
log(N1 + 1):countyMonterey   -0.353810   0.074113   -4.77    2e-06
log(N1 + 1):countyMultnomah   0.327671   0.152354    2.15  0.03163
log(N1 + 1):countyNantucket  -0.000455   0.073163   -0.01  0.99504
log(N1 + 1):countyNassau     -0.163874   0.073873   -2.22  0.02666
log(N1 + 1):countyNew Castle  0.019295   0.104480    0.18  0.85351
log(N1 + 1):countyPeoria      0.271617   0.097539    2.78  0.00541
log(N1 + 1):countyPlacer     -0.104051   0.072721   -1.43  0.15266
log(N1 + 1):countySacramento  0.241562   0.132471    1.82  0.06840
log(N1 + 1):countyShelby      0.055683   0.102746    0.54  0.58792
log(N1 + 1):countyStafford    0.100252   0.105375    0.95  0.34154
log(N1 + 1):countySteuben     0.010230   0.087009    0.12  0.90642
log(N1 + 1):countySuffolk     0.115923   0.081696    1.42  0.15609
log(N1 + 1):countyUnion       0.083457   0.093938    0.89  0.37443

Residual standard error: 0.587 on 1770 degrees of freedom
  (72 observations deleted due to missingness)
Multiple R-squared:  0.691, Adjusted R-squared:  0.681 
F-statistic: 67.1 on 59 and 1770 DF,  p-value: <2e-16
  • Intercepts vary greatly by county (more so than in Scotland) indicating large heterogeneity in sewage systems
  • Slopes are less than one (sometimes substantially) which shows the same less than linear (on the original scale) behaviour as also seen in the Scotland data

Time Varying Models

Create a week number for the time index

Date start on

min(usa$date)
[1] "2020-03-04"
usa %>% 
  mutate(timet = (as.numeric(date) - as.numeric(min(usa$date)))/7,
         y = log(cases+1),
         x = log(N1+1),
         county = factor(county)) -> usa

Fit the model

gmod = gam(y ~ s(timet,by=county) + s(timet,by=x), data=usa)
summary(gmod)
Family: gaussian 
Link function: identity 

Formula:
y ~ s(timet, by = county) + s(timet, by = x)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   1.0828     0.0795    13.6   <2e-16

Approximate significance of smooth terms:
                            edf Ref.df      F p-value
s(timet):countyArapahoe    9.00   9.00  19.39 < 2e-16
s(timet):countyBerkshire   8.72   8.94  13.96 < 2e-16
s(timet):countyDauphin     9.00   9.00  20.80 < 2e-16
s(timet):countyElko        9.00   9.00  15.73 < 2e-16
s(timet):countyErie        9.00   9.00  25.88 < 2e-16
s(timet):countyEssex       7.91   8.25  21.22 < 2e-16
s(timet):countyFairfield   9.00   9.00  28.94 < 2e-16
s(timet):countyFranklin    8.92   8.99   3.95 5.5e-05
s(timet):countyHamilton    8.80   8.97  17.33 < 2e-16
s(timet):countyHampshire   8.99   9.00  15.18 < 2e-16
s(timet):countyHennepin    7.39   8.08  20.27 < 2e-16
s(timet):countyIndiana     9.00   9.00  19.47 < 2e-16
s(timet):countyJackson     2.13   2.45   7.47 0.00026
s(timet):countyJefferson   9.00   9.00  16.97 < 2e-16
s(timet):countyLake        8.82   8.98  10.50 < 2e-16
s(timet):countyMiami-Dade  9.00   9.00  31.71 < 2e-16
s(timet):countyMiddlesex   7.69   7.91  19.16 < 2e-16
s(timet):countyMonterey    8.55   8.90  32.20 < 2e-16
s(timet):countyMultnomah   7.03   7.74  23.08 < 2e-16
s(timet):countyNantucket   9.00   9.00  21.65 < 2e-16
s(timet):countyNassau      8.69   8.95  10.57 < 2e-16
s(timet):countyNew Castle  7.00   7.69  17.62 < 2e-16
s(timet):countyPeoria      8.79   8.98  22.63 < 2e-16
s(timet):countyPlacer      9.00   9.00  13.92 < 2e-16
s(timet):countySacramento  4.91   5.64  16.63 < 2e-16
s(timet):countyShelby      8.14   8.49  13.59 < 2e-16
s(timet):countyStafford    7.85   8.32  17.59 < 2e-16
s(timet):countySteuben     2.01   2.38   4.96 0.00286
s(timet):countySuffolk     9.00   9.00  28.87 < 2e-16
s(timet):countyUnion       8.70   8.95  20.89 < 2e-16
s(timet):x                10.00  10.00 112.10 < 2e-16

R-sq.(adj) =  0.847   Deviance explained = 86.8%
GCV = 0.19073  Scale est. = 0.16456   n = 1830

First plot the intercept coefficient function αi(t). Grey vertical lines represent quarters (start Jul20, Oct20, Jan21, Apr21, Jul21, Oct21)

allcounties = levels(usa$county)
par(mgp=c(1.5,0.5,0), mar=c(1.5,1.5,1.1,0), pch=20, mfrow=c(2,3))
for(i in 1:6){
  plot(gmod, rug=FALSE, xlab="", ylab="",
       main=allcounties[i], select=i,ylim=c(-5,5))
  abline(v=c(17,30,43,56,69, 82),col=gray(0.75))
}

for(i in 7:12){
  plot(gmod, rug=FALSE, xlab="", ylab="",
       main=allcounties[i], select=i,ylim=c(-5,5))
  abline(v=c(17,30,43,56,69,82),col=gray(0.75))
}

for(i in 13:18){
  plot(gmod, rug=FALSE, xlab="", ylab="",
       main=allcounties[i], select=i,ylim=c(-5,5))
  abline(v=c(17,30,43,56,69, 82),col=gray(0.75))
}

for(i in 19:24){
  plot(gmod, rug=FALSE, xlab="", ylab="",
       main=allcounties[i], select=i,ylim=c(-5,5))
  abline(v=c(17,30,43,56,69,82),col=gray(0.75))
}

for(i in 25:30){
  plot(gmod, rug=FALSE, xlab="", ylab="",
       main=allcounties[i], select=i,ylim=c(-5,5))
  abline(v=c(17,30,43,56,69, 82),col=gray(0.75))
}

par(mfrow=c(1,1))
  • Some counties have missing data, particularly at the beginning of the period which leads to greater uncertainty (wide bands)

Nicer plots for publication

pgam = plot(gmod)
pgdf = data.frame(x=unlist(map(pgam,"x")),
                  fit=unlist(map(pgam,"fit")),
                  se=unlist(map(pgam,"se")),
                  county = rep(c(allcounties,"beta"),each=100))
pgdf$ub = pgdf$fit+2*pgdf$se
pgdf$lb = pgdf$fit-2*pgdf$se
pgdf %>% filter(county == ssite) %>% 
  ggplot(aes(x=x)) + 
  geom_ribbon(aes(ymin = lb, ymax = ub),fill="gray90") +
  geom_line(aes(y=fit)) +
  ylab("a(t)") +
  scale_x_continuous(name="Date",
                     breaks=c(17,30,43,56,69,82),
                     labels=c("Jul20", "Oct20", "Jan21", "Apr21", 
                              "Jul21", "Oct21"))

pgdf %>% filter(county == "beta") %>% 
  ggplot(aes(x=x)) + 
  geom_ribbon(aes(ymin = lb, ymax = ub),fill="gray90") +
  geom_line(aes(y=fit)) +
  ylab("b(t)") +
  scale_x_continuous(name="Date",
                     breaks=c(17,30,43,56,69,82),
                     labels=c("Jul20", "Oct20", "Jan21", "Apr21", 
                              "Jul21", "Oct21"))

Lagged Models for Prediction

Can we predict the future? Fit the concurrent model

lmodpop = lm(log(cases+1) ~ log(N1 + 1) * county, usa)
glance(lmodpop)[c('sigma','adj.r.squared')]
# A tibble: 1 × 2
  sigma adj.r.squared
  <dbl>         <dbl>
1 0.587         0.681

Try lagging the N1 variable by one week

lmodlag1 = lm(log(cases+1) ~ lag(log(N1 + 1),1) * county, usa)
glance(lmodlag1)[c('sigma','adj.r.squared')]
# A tibble: 1 × 2
  sigma adj.r.squared
  <dbl>         <dbl>
1 0.755         0.470

Worse than the concurrent model

Now suppose we use case data to predict one week ahead:

lmodnoww = lm(log(cases+1) ~ lag(log(cases + 1),1) * county, usa)
glance(lmodnoww)[c('sigma','adj.r.squared')]
# A tibble: 1 × 2
  sigma adj.r.squared
  <dbl>         <dbl>
1 0.685         0.597

Works better than using WW even though it is a very simplistic model

lmodboth = lm(log(cases+1) ~ lag(log(cases + 1),1) + lag(log(N1+1),1) * county, usa)
glance(lmodboth)[c('sigma','adj.r.squared')]
# A tibble: 1 × 2
  sigma adj.r.squared
  <dbl>         <dbl>
1 0.671         0.580

Adding WW info results in no improvement to the predictive ability of the model