-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathREADME.Rmd
737 lines (497 loc) · 30.1 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
---
always_allow_html: yes
output:
md_document:
variant: markdown_github
---
<!-- README.md is generated from README.Rmd. Please edit that file -->
```{r setup, include=FALSE}
#### Global chunk options -----------------------------
knitr::opts_chunk$set(
eval = TRUE, # whether to run code in code chunk
include = TRUE, # whether to include the chunk output
echo = TRUE, # Whether to show code chunk in final output
error = TRUE, # whether to display error messages
message = FALSE, # whether to preserve messages
warning = FALSE, # whether to preserve warnings
comment = NA, # a character string to append at start
# of each line of results in final document
tidy = FALSE # whether to tidy code chunks for display
)
#### Other options --------------------------------
## Turn off scientific notation ----------------
options(scipen = 999)
## define format ----------------
# If you don't define format here, you'll need put `format = "html"` in
# every kable function.
# options(knitr.table.format = "html") # use this option when knitting html
# options(knitr.table.format = "latex") # use this option when knitting pdf
## Set random seed ----------------
seed_for_imp <- 8675309
set.seed(seed_for_imp)
#### Packages --------------------------------------
# This is where we load in all the packages we plan to use
## Define the repository for packages ----------------
options(repos = c(CRAN = "http://cran.rstudio.com"))
# universally useful packages
if (!require("pacman")) install.packages("pacman")
if (!require("devtools")) install.packages("devtools")
# github packages
if (!require("janitor")) devtools::install_github("sfirke/janitor")
if (packageVersion("janitor") < "0.4.0.9000") {
devtools::install_github("sfirke/janitor")
}
# Load the list of packages
pacman::p_load(
Hmisc, # contains many functions useful for data analysis
psych, # a general purpose toolbox
tidyverse, # packages ggplot2, tibble, tidyr, readr, purrr, and dplyr
forcats, # functions for factors, forcats is an anagram for factors
broom, # functions tidy(), glance(), augment()
magrittr, # includes the %<>% assignment-pipe (%>% is loaded from dplyr)
rlang, # For use in tidyevaluation
readxl, # read in excel files
writexl, # write excel files, zero dependency
# xlsx, # read, write, format Excel files
janitor, # for working with dirty data
lubridate, # for working with dates and times
stringr, # handy string operations
tableone, # Create Table 1 to Describe Baseline Characteristics
DT, # render R objects as HTML tables
knitr, # A General-Purpose Package for Dynamic Report Generation in R
kableExtra, # Enahced table functions
ggthemes, # Extra Themes, Scales and Geoms for 'ggplot2'
scales, # Scale Functions for Visualization
visdat, # Preliminary data visualisation
naniar, # structures, summaries, and visualisations for missing data
here, # Constructs paths to your project's files
mice # Multivariate Imputation by Chained Equations
)
#### Store time for later --------------------------------
start_time <- Sys.time()
#### Confirm directory --------------------------------
here::here()
```
# Overview
The repository shows example code that I used to
+ Make a simulated data set,
+ Examine the missingness,
+ Specify the imputation model,
+ Pefrom multiple impuation with the `mice` package,
+ Calculate a number of agreement statistics, and
+ Pool the results with `mice`.
Also included are the slides from my presentation at CSP (2018) my final submitted thesis (2019-06-29).
# Simulate a data set
Due to the sensitive nature of electronic health record (EHR) data, in order to provide a data set for a reproducible example, I wrote code below to simulate a data set that can be used with the rest of the code in this example. The only similarities to the original data are the variable names and the data categories.
Note that the eligibility requirements for screening services are based on sex, age, and medical history. I do not simulate medical history and so the eligibility here is based only on sex and age.
The proportions of screenings in the data were chosen to try to simulate some data that would make for interesting example with the rest of the code. The proportions of missingness are similar to the actual work that I presented in that I did a little bit of rounding before simulation.
Disclaimer: All the data in this example is simulated from the following code. Any similarities to the original data set or any other existing data is purely by chance alone.
## Simulate the data
```{r make-simulated-data, results=FALSE}
source(here::here("src", "simulate-data-set.R"))
valdata <- make_sim_data(n_rows = 14000, seed = seed_for_imp)
```
## Do some checks and take a glimpse
```{r}
dplyr::glimpse(valdata)
```
# EDA / explore missingness
## Table One
A quick table one to look at the demographics of the sample data.
```{r, results='asis'}
#### Create a table one --------------------------------
# Table one summary stats using the tableone package
tab1 <- tableone::CreateTableOne(
vars = c("sex", "race_cat", "ethnic_cat", "lang_cat", "fpl_cat", "age_cat"),
data = valdata,
factorVars =
c("sex", "race_cat", "ethnic_cat", "lang_cat", "fpl_cat", "age_cat"),
includeNA = TRUE
) %>%
print(.,
showAllLevels = TRUE,
printToggle = FALSE,
noSpaces = TRUE
)
#### Print table one --------------------------------
tab1 %>%
as.data.frame() %>%
tibble::rownames_to_column(., var = "rowname") %>%
tibble::as_tibble() %>%
mutate(rowname = rownames(tab1)) %>%
mutate(level = as.character(level),
level = tidyr::replace_na(level, "(Missing)")) %>%
knitr::kable(booktabs = TRUE,
longtable = TRUE,
col.names = c("", names(.)[-1])) %>%
kableExtra::kable_styling(full_width = FALSE,
latex_options = c("repeat_header")) %>%
kableExtra::column_spec(1, width = "10em")
```
## Visualizing missingness in the data set
The table one above shows that there are missing data in the variables `race_cat`, `ethnic_cat`, and `fpl_cat`. When beginning to think about how to handle the missing data in your project, visualization is a great place to begin. The [`naniar`](https://cran.r-project.org/web/packages/naniar/index.html) and the [`visdat`](https://cran.r-project.org/web/packages/visdat/index.html) packages provide helpful plots.
### The `vis_miss()` plot
In the figure below, we get an overview of the missing values in the data set. Missing are shown in black and observed values are shown in gray. We see that there are only 3 variables in the data set with missing values.
```{r}
naniar::vis_miss(valdata)
```
We can also get a numeric percent missing for the variables.
```{r}
naniar::miss_var_summary(valdata) %>%
head(., n = 10)
```
### Setting `cluster = TRUE`
It can also be useful to look at the missingness plot with the values clustered. This gives a sense of how many values are missing in one row. Many rows with multiple missing values can be problematic when trying to do imputation.
I'm only going to show this plot for a few variables since we saw above that many were completely observed.
```{r}
valdata %>%
dplyr::select(race_cat, ethnic_cat, fpl_cat, lang_cat, sex, age_cat) %>%
naniar::vis_miss(.,
cluster = TRUE)
```
### The `VIM` package
The [`VIM`](https://cran.r-project.org/web/packages/VIM/VIM.pdf) package also has many helpful tools for visualizing missing values. I really like their combination bar plot and aggregation plot.
Here I have filtered the data to just those with missing values; I think that this helps the plot to be more clear. The bar plot on the left shows the proportion of missing values in each variable. The aggregation plot on the right shows the combinations of missing (dark gray) and observed (light gray) values that exist in the data.
```{r}
valdata %>%
dplyr::select(race_cat, ethnic_cat, fpl_cat) %>%
VIM::aggr(.,
col = c("gray", "gray29"),
numbers = TRUE,
sortVars = TRUE,
labels = names(.),
cex.axis = .7,
gap = 3,
ylab = c("Histogram of missing data","Pattern"))
```
## Numerical summaries
```{r, fig.width=8}
mssng_pattern <- valdata %>%
dplyr::select(race_cat, ethnic_cat, fpl_cat) %>%
mice::md.pattern(.) %>%
as.data.frame() %>%
tibble::as_tibble(.) %>%
tibble::rownames_to_column(., var = "count") %>%
dplyr::rename(.,
# new = old
num_available = V4)
mssng_pattern %>%
knitr::kable(., escape = FALSE)
```
See Stef Van Buuren's vignette for more in depth interpretation of the table above. There's a lot of good info there. How I interpret it is to pay attention to the 1's and 0's in the main body of the table. They correspond to what is observed (1) and missing (0) for the variables listed in the top row.
So the first row of all 1's means that all listed variables are observed; the count to the left shows the number of rows in the data set that fit this description. The last row of all 0's means that all variables are missing; similar that the count on the left shows the number of rows in the data set that are missing for all the variables shown. Then all the 1's and 0's in between represent some combination of missingness among the variables.
The `naniar` package has summary that is a little simpler and gives some of the same information. But not as much detail as the `md.pattern()` in the `mice` package.
```{r}
valdata %>%
naniar::miss_case_table(.) %>%
knitr::kable(.)
```
### Number of observations per pattern of missing pairs
```{r}
mssng_pairs <- valdata %>%
dplyr::select(ethnic_cat, race_cat, fpl_cat) %>%
mice::md.pairs(.)
mssng_pairs
```
Four missingness patterns:
+ `rr` both are observed,
+ `rm` first variable is observed, the second is missing,
+ `mr` first variable is missing, the second is observed, and
+ `mm` both variable are missing.
### Proportion of usable cases
Measures how many cases with missing data on the target variable actually have observed values on the predictor. The proportion will be low if both target and predictor are missing on the same cases.
```{r}
prop_usable_cases <- valdata %>%
dplyr::select(ethnic_cat, race_cat, fpl_cat) %>%
mice::md.pairs(.)
with(prop_usable_cases,
round(mr / (mr + mm), digits = 3))
```
Target on the vertical axis (i.e. left), predictor on the horizontal (i.e. top).
Interpret: Of the records with values for `ethnic_cat`, xx% have observed information on `race_cat` and xx% have observed information on `fplp_cat`. Etc.
This gives a sense of what variables may be good to include/exclude in the imputation model. Higher % indicates more information and likely good predictor; lower % indicates that the variables are missing for the same observations and may not be good predictor.
### Number of incomplete cases
More recent advice on how many imputations to perform suggests a rule of thumb that the number of imputations should be similar to the percentage of cases that are incomplete. So, not just as a part of the EDA, it is important to know the number of incomplete cases to inform the specification of the imputation model later.
An incomplete case would be an observation (or row) with at least one missing value. There are a number of ways to get at this information, but the `naniar` package makes it super easy.
```{r}
n_missing <- naniar::prop_miss_case(valdata)
n_missing
```
So we know that in the data set `r scales::percent(n_missing)` of observations have missing values. I round this up to `r max(pretty(100 * n_missing))` to select the number of imputations to perform.
# Imputation
Below I show some of the set up that I went through to perform the multiple imputations using the `mice` package in R. Some resources were absolutely indispensable in my set up and learning:
+ the [`mice` package documentation](https://cran.r-project.org/web/packages/mice/mice.pdf)
+ the [vignette](https://www.jstatsoft.org/article/view/v045i03/v45i03.pdf) from the Journal of Statistical Software (December 2011, Volume 45, Issue 3)
+ [Flexible Imputation of Missing Data](https://stefvanbuuren.name/fimd/) by Stef van Buuren
Also, since my work was done, [online resources](https://stefvanbuuren.name/mice/) and [github info](https://github.com/stefvanbuuren/mice) have been added.
For more information about MICE: Multivariate Imputation by Chained Equations, sometimes called Fully Conditional Specification, I highly recommend any of the materials (code or books) or published papers by Stef van Buuren. Much can be found through his website, [www.multiple-imputation.com](http://www.stefvanbuuren.nl/mi/).
## Some set up
Here I am just going to define the number of imputations and the number of iterations to objects for use in later code. This is just a convenience step so that I only have to update these values in one place if I want to change them later.
The default number of imputations in the `mice` software is 5. Based on the exploration of the missingness above, we saw that they number of imputations suggested is much higher. Here in this example, I am going to keep it set at the default 5, just to limit the computation time if someone want to run this code on their own.
In practice, my suggestion would be to "tune" the imputation using a lower number like the default setting. Then once the imputation model is set, perform you final or near final imputations using the higher number as suggested by recent literature.
As far as iterations go, they tend to converge rather quickly with the `mice` algorithm. My advice is to run them out long enough to see whether there is convergence or not, while not getting super bogged down with computation time.
Here I am going to continue to use the software default of 5 for our example. In my actual work, I used 20. In your practice I would suggest to try to use 10 to 20.
```{r}
imp_num <- 5 # number of imputations, dflt = 5
iter_num <- 5 # number of iterations, dflt = 5
```
We can also tel the `mice` software to run an "initial" empty imputation. The only effect of this is to give us some objects in R to work with as we go through the steps. See below where I run the initial imputation. Note that the maximum number of iterations (`maxit`) is set to zero.
```{r}
init <- mice::mice(valdata, maxit = 0)
meth <- init$method
predM <- init$predictorMatrix
# print(ini)
```
The `init` object contains lots of information that we will work with as we go forward. Of interest for now though is the method selected for the variables. This is the form of the imputation model for the variables to be imputed.
```{r}
meth
```
We see that the software made no choice for the variables without missing data. For those with missing data, based on the type of variable, it makes some default choices. We can override these later.
We also get the default matrix of predictors that the software chose. This is an object of 1's and 0's. For those variables with no missing and that we won't be imputing, the values are zero. Here is a glimpse of just those that we intend to impute.
```{r}
predM[rowSums(predM) > 0, ]
```
## Specify the imputation model
Here I will follow the 7 steps that van Buuren suggests in order to set up the algorithm. See his writings for more details than I will go into here.
### Step 1 - Decide if the missing at random (MAR) assumption is reasonable
In this example, we randomly assigned missing values. So here, it kind of has to be reasonable. In practice though, this can be challenging to know for sure which is why the exploration of the data and the missingness is such an important step to take as I showed above.
Assuming MAR is typically a reasonable place to start. There is literature on sensitivity analysis with the imputations to examine if this assumption is met. And there are techniques to model the missing mechanism with the imputation if there is violation. This work is outside the scope of what I hope to share here.
### Step 2 - Decide on the form of the imputation model
We want to decide the form of the model used to impute the missing values of each variable. This can be specified on a variable by variable basis. We saw above from the `meth` object that the software made default decisions for us.
__FPL__ -- logistic regression (`logreg`), for factor with 2 levels.
__Race__ -- Multinomial logit regression (`polyreg`), factor with > 2 levels.
__Ethnicity__ -- Multinomial logit regression (`polyreg`), factor with > 2 levels.
I am going to overwrite those just to show how it is done. By overwriting the meth object we can force the algorithm to use this later.
```{r}
meth[c("ethnic_cat")] <- "polyreg"
meth[c("race_cat")] <- "polyreg"
meth[c("fpl_cat")] <- "logreg"
meth
```
### Step 3 - Decide the set of predictors to include in the imputation model
What variables to include in the multiple imputation model?
The advice is to include as many relevant variables as possible. One should include all variables that are in your scientific model of interest that will be used after imputation. Also variables that are related to the missingness of the variables you are imputing. Van Buuren has more advice here.
Including as many predictors as possible makes the MAR assumption more reasonable. But with larger data sets, this is not advisable for computation purposes. Van Buuren suggests that 15 to 25 variables will work well. He also offers advice to cull that list.
My case is interesting. I am not doing modelling; I am calculating scalar statistics of agreement. Also, my data set isn't really too large (41 variables once you ignore study ID which isn't too important for imputation purposes).
To aid in these decisions the `mice` package has a function that produces a "quick predictor matrix" that is useful for dealing with data sets with large number of variables. The software chooses by calculating two correlations with the available cases, taking the larger, and seeing if it meets a minimum threshold. Type `?quickpred` in the R console for better description.
Below I run the `quickpred()` to see what the software chooses. Only show the matrix below for those records with > 1 rows or columns
```{r}
predGuess <- valdata %>%
mice::quickpred(.)
predGuess[rowSums(predGuess) > 0, colSums(predGuess) > 0]
```
Hmm. As I am working through this example with the simulated data, the software did not choose any. In my actual work, it discovered about 2 to 3 important predictors for each variable.
Since we randomly generated each variable _independently_, there should not be a high correlation between them. So as surprised as I was at first, this really does makes sense.
In my actual work, I went to the lead investigator for insight into the data set and suggestions on which variables would be informative. The code chunk below shows how I took the list that they provided and modified the `predM` object.
```{r}
# Store the names of the variables in an object
var_names <- dput(names(valdata))
# create another vector of the names selected by the investigator
pi_list <-
c("fpl_cat", "race_cat", "ethnic_cat", "lang_cat", "age_start", "sex",
"primary_dept", "ehr_cervical", "ehr_breast", "ehr_colon",
"ehr_colonoscopy", "dmap_breast", "dmap_colonoscopy", "ehr_cholest",
"dmap_cholest", "elig_cholest", "ehr_flexsig", "ehr_fobt", "ehr_bmi",
"ehr_flu", "ehr_chlam", "ehr_smoking", "dmap_cervical", "dmap_colon",
"dmap_flexsig", "dmap_fobt", "dmap_bmi", "dmap_flu", "dmap_chlam",
"elig_cervical", "elig_breast", "elig_colon", "elig_bmi", "elig_flu",
"elig_chlam", "elig_smoking")
```
Note that the investigator had me exclude the variables, `elig_colonoscopy`, `elig_flexsig`, and `elig_fobt`, because these have the exact same information as `elig_colon`. The eligibility for all these screenings is the same.
We also did not include `dmap_smoking` because there was little to no information here.
Also, we included `age_start` as a continuous variable and did not include the categorical version of this variable, `age_cat`, hoping to get more information.
```{r}
# Make a vector of the variable names that we want to include.
vars_to_include <- var_names[(var_names %in% pi_list)]
# adjust the default prediction matrix for the variables we want to include
pred <- predM # Set equal to the orginal pred matrix
pred[, ] <- 0 # change to all zeroes to clean it out
pred[, vars_to_include] <- 1 # Set to 1 for variables that we want
diag(pred) <- 0 # set the diagonal to zero (can't predict itself)
# take a glimpse
head(pred[, 1:10], n = 10)
```
Note that there are 1s in the matrix for variables that we will not be imputing. This will have no impact.
In the actual work, I made 3 scenarios:
1. Full list of variables chosen by the investigator
2. Reduced list of about half the number
3. Very reduced list based on the defaults from the software (2-3)
Here in this example to keep it simple, I will just work with the full list. But the other scenarios could be created by adjusting the predictor matrix similar to above.
### Step 4 - Decide how to impute variables that are functions of other (incomplete) variables
Transformations, sum scores, etc. were not used in this data set so not much to consider here. In some cases, there can be a lot to think about particularly if a variable is transformed solely to meet the normality assumption in a regression model. So do a literature review if this issue applies to you.
In my example, I mentioned above that I am including continous `age_start` instead of the categorical age. `fpl_cat` is a variable that is derived from a continuous value that had missingness. We considered to included the continuous value, but decided that it is more relevant to our question to look at binary `fpl_cat`.
### Step 5 - Decide the order to impute the variables
The defualt in the software goes by appearance in the data set left to right. It can be overwritten per the user's direction. This becomes more of an issue if there is a longitudinal nature to the data set where missingness at an earlier time point would affect the missingness later. So impute early to later.
I examined the imputation order by magnitude of missingness: low to high and high to low. There did not seem to be a difference in performance or convergence, nor an impact to the estimates. In the actual work, we decided to impute from highest percent missing to lowest.
The code `init$visitSequence` gives the variables to be imputed and their column positions that were chosen by the software by default. Shown in the next chunk.
```{r}
init$visitSequence
```
I override this with the following to order things as: FPL -> Race -> Ethnicity
```{r}
visit_order <- c("fpl_cat",
"race_cat",
"ethnic_cat")
visit_order
```
### Step 6 - Decide the number of iterations
This is to ensure convergence. The default is 5. 10 to 20 are recommended. In actual practice, we chose 20. Here I will keep it set at 5 as defined above to keep the computation strain low.
```{r}
iter_num
```
### Step 7 - Decide on the number of multiply imputed data sets
The rule of thumb from more recent authors is that the number of imputations should be similar to the percentage of cases (observations) that are incomplete (at least 5).
The software default is 5 and we will use that for this example to keep computation time low. We had set this previously above.
```{r}
imp_num
```
## Run the algorithm
All the setup work has been done and considerations made. Using the specifications that saved in objects above, we will run the `mice` command to impute the data sets.
Note that I wrap this in the command `system.time()` to see how long it runs.
```{r}
system.time(
imp_full <-
mice::mice(data = valdata,
m = imp_num, # number of imputations, dflt = 5
method = meth, # specify the method
predictorMatrix = pred,
visitSequence = visit_order,
seed = seed_for_imp,
maxit = iter_num, # number of iterations, dflt = 5
print = FALSE
)
)
```
That's elapsed time in seconds.
### Plot of convergence
We plot the values of mean and standard deviation for each variable by number of iteration. We want to look for mixing of the lines and we do not want to see any trend.
```{r}
plot(imp_full, c("ethnic_cat", "race_cat", "fpl_cat"))
```
# Calculating results and pooling
Now that we have out imputed data sets, we want to analyze each of them and then pool the results.
When pooling results, the `mice` software has tools to help whether pooling model results or scalar estimates. Things get messy quickly because I am calculating a number of agreement statistics, for 11 procedures, with 7 categorical variables, and each categorical variable has a number of levels.
When I first did these steps, I wrote many nested for loops. While this worked and it ran, it was slow and a little bit like a black box. A wise person once said that if you you find yourself using nested for loops in R then you should consider using the functions from the `purrr` package. They were right. I didn't compare speed with the previous approach, but the way I present here has clear points where you can check that things make sense. This advantage makes the choice clear for me.
The first step in wrangling the results is to get the separate imputed data sets into one data frame with a variable to track which imputation number the data came from.
```{r}
# Create an empty list
mylist <- list()
# Put the imputed sets into the list
for (i in 1:imp_full$m) {
mylist[[i]] <- mice::complete(imp_full, i)
}
# mylist[[1]]
# Take the list and stack the data sets into a data frame
# with an indication for the number of imputed data set
output <- dplyr::bind_rows(mylist, .id = "n_imp")
# This would do the same thing, maybe faster, but the above has the added
# benefit of easily adding the number of the imputation so that I can track
# things. Either way, I chose the one above as my preference.
# df <- do.call(rbind, mylist)
# df
# Check it out that the dimensions make sense and spot check the data
dplyr::glimpse(output)
# dim(output)
# head(output)
# tail(output)
```
Then I gather the data into a tidy format so that I can group by category or imputation number or whatever I want really. I employ a user written function because it's a little bit of work to do this.
```{r}
source(here::here("src", "make_df_long.R"))
output_long <- make_df_long(df = output)
```
We can check that this works by grouping by category and checking that the sums and counts of some of the variables match.
```{r}
output_long %>%
group_by(cat) %>%
summarise(
n = n(),
ELIG = sum(as.numeric(elig), na.rm = TRUE),
EHR = sum(as.numeric(ehr), na.rm = TRUE),
DMAP = sum(as.numeric(dmap), na.rm = TRUE),
id = sum(as.numeric(study_id), na.rm = TRUE),
age = sum(age_start, na.rm = TRUE)
)
```
I now want the data to be grouped by (1) procedure, (2) category, (3) level, and (4) imputation number.
```{r}
output_nested <- output_long %>%
group_by(proc, cat, level, n_imp) %>%
tidyr::nest()
output_nested %>%
head(., n = 10)
# df_nested$data[[1]]
# head(df_nested$data[[1]])
# tail(df_nested$data[[1]])
```
We now have a nested column of the data by the groups and imputations that we want. Next, we need to calculate two columns: Q - the estimates, and U - their standard errors. These needed user written functions to make the process automated and efficient.
```{r, warning=FALSE}
source(here::here("src", "calc-stats.R"))
output_nested <- output_nested %>%
mutate(Q = purrr::map(.x = data, .f = calc_stats_p),
U = purrr::map(.x = data, .f = calc_stats_se))
output_nested %>%
head(., n = 10)
# output_nested$Q[[1]]
# output_nested$U[[1]]
```
Next, we want to unnest the results from calculating the statistics and the standard errors.
```{r, warning=FALSE}
output_nested <- output_nested %>%
tidyr::unnest(c(Q, U)) %>%
mutate_at(.vars = vars(Q, U),
.funs = list(~ as.numeric(.)))
output_nested %>%
head(., n = 10)
```
We want to re-`group_by` the data by procedure, category, level, and type of statistic. This step gives us the nested estimates and variances for all the imputed sets which we can now pool the resuls.
```{r, wearning=FALSE}
output_nested <- output_nested %>%
group_by(proc, cat, level, stat) %>%
tidyr::nest()
```
Pool and unnest. Note that my user written function `mi_pool` is really just a wrapper for the `pool.scalar()` function in the `mice` package.
```{r, warning=FALSE}
output_nested <- output_nested %>%
mutate(pooled = purrr::map(.x = data,
.f = ~ mi_pool(.x)))
output_nested %>%
head(., n = 10)
```
Unnest the pooled results, we see that for each procedure, by category, by strata, we have for each statistic, the results that the pooling in `mice` provides.
```{r}
output_nested <- output_nested %>%
tidyr::unnest(pooled)
output_nested %>%
dplyr::select(-data) %>%
dplyr::glimpse(.)
```
The value `qbar` is the pooled post-imputation estimate that we are most interested. See the `mice` documenation for information on the rest. So if we just want to get those results for all the agreement statistics:
Annoyingly the variables are in alphabetical order when they are unnnested, so I add an extra `select` command to order them
```{r}
output_nested %>%
dplyr::select(proc:stat, qbar) %>%
tidyr::spread(., key = stat, value = qbar) %>%
dplyr::select(
proc, cat, level,
n, a, b, c, d,
EHR.n, EHR.p, DMAP.n, DMAP.p, Combo.n, Combo.p,
Po, Pe,
Ppos, Pneg, sens, spec,
kap, Kmax, Kmin,
BI, PI, PABAK
) %>%
knitr::kable(
booktabs = TRUE,
digits = 2
)
```
Well, the procedures and the categories need should be arranged differently for formal presentation. But that's pretty straightforward so I hope that you forgive me for stopping here. Thanks for reading!
# Check the run time
I saved an object above when this all started to run. Now I want to see how long it took.
```{r}
end_time <- Sys.time()
lubridate::as.period(
lubridate::interval(start_time, end_time)
)
```