I have a dataset with missing observations so I used mice
to perform my multiple imputation.
I ran a model specifying a lasso prior to do a penalized regression with the bayesreg package which is based on rstan.
The recommendation by the author of the package is to:
Currently, missing data is not supported. However, it is possible to impute the data before fitting the model, for example using the mice package. The model can then be fitted to each of the imputed data sets sepately and the resulting posterior draws can be combined.
This seemed simple enough, until it became apparent that I was unsure what the best way to do this pooling would be.
So what I did was I created a for loop to estimate a model for each of my imputed datasets.
data_sets = 10 # create 10 imputed datasets
anes_2012 =mice::mice(anes_2012, m = data_sets, method = 'rf') #perform the mice imputation
anes_2012 = miceadds::mids2datlist(anes_2012) #convert my mice mids object to a list for the for loop
anes_2012_models = list() #initialize an empty list called anes_2012_models
for(d in 1:data_sets){
model = bayesreg(wid ~ rboff + unpast + ecfamily + inflwhite + govbias + whitediscrim + pid + female + age + edu, data = anes_2012[[d]], prior = 'lasso') # for each of the imputed datasets in anes_2012, run this model with a lasso shrinkage parameter
anes_2012_models[[d]] = model # save the results of each of these models in the anes_2012_models by creating a new list element
)
But from here, I wasn’t sure what the best approach would be. Should I extract my estimates of beta and sigma2, calculate the median for the given model, then average across the models? How then do I present something like pooled credible intervals?
Is there a way to do all of this with the tidybayes package?
If it helps, I’ve included a small subset (n = 50) of my original dataset with the missingness so that you can run quickly the code and see what the anes_2012_models list object looks like.
dput(anes_2012)
structure(list(wid = c(3L, 1L, 2L, 1L, 4L, 1L, 5L, 1L, 3L, 1L,
2L, 5L, 2L, NA, 3L, NA, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 2L, 5L, 1L,
4L, 4L, 2L, 4L, 5L, 4L, 2L, 4L, 1L, 2L, 1L, 3L, 3L, 2L, 3L, NA,
3L, 3L, 3L, 2L, 1L, 2L, NA, 5L), female = c(0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L), rboff = c(1L,
1L, 1L, -1L, -1L, 0L, -1L, 0L, 2L, -1L, 1L, -2L, -2L, 1L, 0L,
1L, -1L, -2L, 1L, -1L, -1L, -2L, -1L, -1L, -1L, 2L, 0L, 1L, 1L,
0L, -1L, -1L, -2L, 2L, -1L, 0L, 0L, 0L, 1L, -1L, 1L, 1L, 1L,
0L, 0L, -1L, -1L, -1L, 1L, 1L), pboff = c(1L, 0L, 0L, 0L, 0L,
-1L, 1L, 0L, 2L, 2L, 2L, 0L, -1L, 0L, 0L, 0L, 2L, 0L, 0L, 2L,
NA, 0L, 0L, 1L, -1L, 0L, NA, 1L, 1L, 0L, 1L, 1L, 2L, 2L, 0L,
0L, 1L, 1L, 0L, 0L, NA, 0L, 0L, 2L, 0L, 2L, -1L, 0L, 1L, 1L),
ideo = c(6L, 4L, 5L, 3L, 4L, 6L, 6L, 6L, NA, NA, 6L, NA,
3L, NA, 5L, 4L, 6L, 5L, NA, 5L, 3L, 6L, 6L, 5L, 6L, 5L, 6L,
NA, 4L, 2L, 7L, 1L, 6L, 4L, 2L, 4L, 5L, 5L, NA, NA, 3L, NA,
2L, 2L, 3L, 6L, 7L, 6L, NA, 5L), epast = c(-1L, 0L, 1L, 0L,
0L, -1L, 0L, -2L, 1L, -1L, 0L, -2L, 0L, 0L, -1L, 0L, -1L,
-2L, 0L, 0L, 1L, 0L, -1L, -2L, -2L, 1L, 1L, 0L, 1L, 1L, -2L,
-1L, 0L, 0L, 1L, 0L, 0L, -2L, -1L, -1L, 1L, 1L, 0L, 0L, 0L,
-1L, -2L, 0L, -1L, 2L), efuture = c(0L, 1L, 1L, 1L, -1L,
1L, 1L, 0L, 2L, 0L, 0L, -2L, -2L, 1L, 0L, 0L, 0L, -1L, 0L,
1L, 1L, 0L, 0L, 0L, -2L, 1L, NA, 1L, 1L, 1L, -2L, 0L, 0L,
-1L, 0L, NA, 0L, 0L, -1L, 0L, NA, 0L, 0L, 0L, 1L, 0L, -2L,
1L, -1L, 1L), unpast = c(-1L, 1L, 1L, -1L, -1L, -2L, -1L,
-2L, 1L, -2L, 0L, -1L, -1L, -1L, 2L, -1L, -1L, -2L, -2L,
0L, 0L, -2L, -1L, -1L, -1L, 0L, 0L, 0L, 1L, 1L, -2L, -2L,
0L, -2L, 0L, 1L, 1L, -1L, -2L, -1L, 1L, 1L, 1L, 0L, 1L, -1L,
-2L, 0L, -2L, 1L), unnext = c(1L, 0L, 1L, 2L, 1L, 1L, 0L,
2L, 0L, 1L, 0L, 0L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 0L, NA, 1L,
1L, 0L, 2L, 1L, NA, 0L, 0L, 0L, 2L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 2L, 2L, NA, 0L, 2L), pid = c(6L,
5L, 5L, 2L, 4L, 7L, 6L, 6L, 1L, 6L, 7L, 3L, 2L, 1L, 4L, 2L,
5L, 7L, 4L, 3L, 1L, 5L, 6L, 7L, 5L, 6L, 6L, 4L, 2L, 1L, 7L,
1L, 7L, 5L, 1L, 4L, 6L, 5L, 4L, 7L, 2L, 4L, 1L, 7L, 2L, 5L,
7L, 7L, 3L, 2L), age = c(49L, -2L, 47L, 64L, 56L, 59L, 61L,
51L, 61L, 34L, 25L, 34L, 29L, 85L, 77L, 51L, 53L, 83L, 56L,
53L, 51L, 64L, 32L, 30L, 62L, 30L, 42L, 30L, 65L, 77L, 39L,
49L, 49L, 24L, 80L, 23L, 43L, 53L, 29L, 40L, 18L, 52L, 63L,
-2L, -2L, 28L, 56L, 84L, 51L, 34L), edu = c(5L, 4L, 5L, 3L,
3L, 4L, 5L, 5L, 2L, 1L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 3L, 1L,
3L, 4L, 4L, 2L, 4L, 4L, 3L, 3L, 3L, 4L, 5L, 1L, 1L, 4L, 3L,
NA, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 5L, 4L, 4L, 3L, 2L, 3L, 1L,
1L), findjob = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5L,
NA, NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA), losejob = c(3L,
2L, 1L, NA, NA, 1L, 1L, 1L, NA, NA, 1L, NA, 2L, NA, NA, 1L,
3L, 1L, 1L, 3L, NA, 5L, 2L, NA, NA, 1L, 2L, NA, NA, NA, 3L,
NA, NA, 2L, NA, 1L, 3L, NA, 1L, 2L, 2L, 1L, 2L, NA, 2L, NA,
3L, NA, 3L, 1L), offwork = c(1L, 0L, 0L, NA, NA, 0L, 0L,
0L, NA, NA, 0L, NA, 0L, NA, NA, 0L, 0L, 0L, 0L, 1L, 0L, NA,
0L, NA, NA, 0L, 0L, NA, NA, NA, 0L, NA, NA, 0L, NA, 0L, 0L,
NA, 0L, 0L, 1L, 0L, 0L, NA, 0L, NA, 0L, NA, 0L, 0L), fairjblacks = c(-1L,
-2L, -1L, NA, -2L, -2L, NA, -1L, -2L, 2L, -2L, -2L, 2L, NA,
2L, NA, -2L, -2L, -2L, -2L, -1L, -2L, 2L, -2L, -2L, 1L, -1L,
2L, NA, -2L, -2L, 2L, -2L, -1L, 2L, 1L, -1L, -2L, -1L, -2L,
2L, NA, 2L, 1L, 1L, -1L, -2L, -1L, NA, NA), immtakejobs = c(2L,
3L, 2L, 2L, 1L, 3L, 3L, 3L, 2L, 4L, 1L, 4L, 3L, NA, 1L, NA,
3L, 1L, 3L, 2L, 3L, 2L, 2L, 1L, 3L, 2L, 1L, 4L, 1L, 2L, 4L,
1L, 2L, 2L, 4L, 2L, 2L, 4L, 2L, 2L, 3L, NA, 1L, 2L, 2L, 3L,
2L, 3L, NA, 2L), ecfamily = c(1L, -1L, -2L, -1L, 1L, 0L,
0L, 1L, 2L, 1L, 0L, 0L, 2L, NA, -2L, NA, -2L, 0L, 1L, 1L,
2L, 1L, -1L, 2L, 0L, -2L, 0L, -1L, -1L, -2L, 0L, 0L, -1L,
-1L, -2L, -1L, 1L, 0L, 0L, 0L, 1L, NA, 0L, -1L, 2L, -1L,
0L, 0L, NA, 2L), linked = c(0L, 1L, 1L, NA, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 0L, NA, 0L, NA, 1L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 1L, NA, 1L, NA, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, NA, 1L,
NA, 0L, 0L, NA, 1L, 0L, 1L, 1L, 0L, 1L, NA, NA), tradbreak = c(1L,
1L, 1L, 1L, 1L, 2L, NA, 2L, 1L, 0L, -2L, 1L, 1L, NA, 2L,
NA, 2L, 2L, 1L, 0L, 1L, 2L, 0L, 2L, 2L, -1L, 2L, 2L, 2L,
-1L, 1L, -2L, -2L, -2L, 0L, 1L, -1L, 2L, 0L, 0L, 1L, NA,
-2L, -1L, -1L, 1L, 2L, 1L, NA, 2L), toofar = c(-1L, 2L, 2L,
-1L, -1L, 1L, NA, 2L, -2L, -2L, 1L, 1L, 0L, NA, 0L, NA, -1L,
0L, 1L, -2L, -2L, 1L, 0L, 2L, 2L, -2L, 1L, -1L, -1L, -1L,
2L, -2L, -1L, 0L, 1L, -2L, 1L, -1L, 0L, 0L, 2L, NA, -2L,
1L, -2L, 0L, 2L, -1L, NA, -1L), govbias = c(0L, 0L, 0L, 0L,
0L, 0L, NA, 1L, 0L, 0L, 0L, 0L, 0L, NA, 0L, NA, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, NA, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, NA, 0L, -1L, 0L, 0L, 1L, NA,
NA, 0L), inflwhite = c(0L, 0L, -1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, NA, 0L, NA, 0L, 0L, -1L, 1L, 0L, 0L, -1L,
0L, -1L, 1L, 0L, 0L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 0L, 0L, NA, 0L), inflblack = c(0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, NA, 0L, NA,
0L, -1L, 0L, -1L, 0L, 1L, 1L, 0L, 1L, -1L, 0L, 0L, -1L, 0L,
1L, 0L, -1L, 0L, 0L, -1L, 0L, 1L, 0L, 0L, 0L, NA, 0L, 0L,
0L, 0L, 1L, 0L, NA, 0L), whitediscrim = c(-2L, -2L, 1L, -1L,
0L, -1L, -2L, 0L, 0L, -2L, -1L, 2L, -1L, NA, -1L, NA, 0L,
-2L, 0L, -1L, -1L, 0L, 0L, -1L, 2L, -1L, -1L, -2L, -1L, -2L,
1L, -2L, -1L, -1L, -1L, -1L, -1L, 0L, -2L, -1L, -2L, NA,
-2L, -1L, -1L, 0L, -2L, 0L, NA, -2L)), class = "data.frame", row.names = c(NA,
-50L))
I’m super new to rstan and to doing it with penalized regression so my apologies if this is a dumb question.