This is a follow up to Impute binary outcome variable for GLM using Stan in R
. Additionally, there is some relevant material here.
I am curious about the nature of the properties of the estimates for the regression parameters once we have marginalized over the missing values.
Based on the response to the earlier questions, here is what I believe the stan model should look like.
data {
// gets called once
int<lower=0> N_obs; // number of observations
int<lower=0,upper=250> N_mis; // number of points to predict for fut check
int y_obs[N_obs]; // response variable
int X_obs[N_obs];
int X_mis[N_mis];
}
transformed data{
// gets called once (after data block)
int<lower=0> N = N_obs + N_mis;
}
parameters {
// gets called every log prob evaluation
real b0;
real b1;
}
transformed parameters {
// gets called every log prob evaluation
}
model {
// gets called every log prob evaluation
target += student_t_lpdf(b0 | 7, 0, 3);
target += student_t_lpdf(b1 | 7, 0, 3);
for(i in 1:N_obs){
real eta;
eta = (1-X_obs[i]) * b0 + X_obs[i] * b1;
// update based on observed data
target += bernoulli_logit_lpmf(y_obs[i] | eta);
}
for(i in 1:N_mis){
real eta;
// X is there for everything
eta = (1-X_mis[i]) * b0 + X_mis[i] * b1;
// but we do not have the response so marginalise over the possibilities
target += log_mix(inv_logit(eta),
bernoulli_logit_lpmf(1 | eta),
bernoulli_logit_lpmf(0 | eta));
}
}
generated quantities {
// gets called once per draw
/*
predict the values for those unenrolled
*/
real p0;
real p1;
real y_mis[N_mis]; // response variable missing
p0 = inv_logit(b0);
p1 = inv_logit(b1);
for(i in 1:N_mis) {
real eta = (1 - X_mis[i])*b0 + X_mis[i]*b1;
y_mis[i] = bernoulli_logit_rng(eta);
}
}
The stan code is saved to the file logistic_003.stan
.
I simulated data in R and incorporate this into three separate data structures. The first is for the situation where no missing-ness is present, the second assumes a complete case analysis (i.e. drop the records with missing values) and the third is the setup to invoke the marginalization.
library(rstan)
library(doParallel)
library(foreach)
rstan::rstan_options(auto_write = TRUE)
Sys.setenv(LOCAL_CPPFLAGS = '-march=native')
options(mc.cores = parallel::detectCores())
library(dplyr)
library(tibble)
# GENERATE DATA
# generate data
set.seed(123)
ldat1 <- list()
ldat2 <- list()
ldat3 <- list()
nsim <- 2000
for(i in 1:nsim){
N <- 250
x <- rep(0:1, each = N/2)
g0 <- 0.5
g1 <- 0.8
p <- (1-x)*g0 + x*g1
y <- rbinom(N, 1, p)
idx_ymis <- as.logical(rbinom(N, 1, 0.2))
N_mis <- sum(idx_ymis)
d <- tibble(y = y,
idx_ymis = idx_ymis,
x = x)
# setup for no missing data analysis
ld1 <- list(N_obs = N - 0,
N_mis = 0,
y_obs = d$y,
X_obs = d$x,
X_mis = numeric(0))
# setup for complete case analysis
ld2 <- list(N_obs = N - N_mis,
N_mis = 0,
y_obs = d$y[!d$idx_ymis],
X_obs = d$x[!d$idx_ymis],
X_mis = numeric(0))
# set up for marginalized missing
ld3 <- list(N_obs = N - N_mis,
N_mis = N_mis,
y_obs = d$y[!d$idx_ymis],
X_obs = d$x[!d$idx_ymis],
X_mis = d$x[d$idx_ymis])
ldat1[[i]] <- ld1
ldat2[[i]] <- ld2
ldat3[[i]] <- ld3
}
I compiled the stan model and then fit all the data sets. For brevity, the code below just does the first, but the attached files have the sims across all the simulated data.
m1 <- rstan::stan_model("logistic_003.stan")
starttime <- Sys.time()
message("Start simulation at ", starttime)
# initiate clusters
debug = F
cl <- NA
if(!debug){
cl <- makeCluster(4, outfile="")
# cl <- makeCluster(3, outfile="")
registerDoParallel(cl)
} else {
registerDoSEQ()
}
# predictive probability
set.seed(66)
res1 <- foreach(i = 1:nsim,
.errorhandling = 'pass',
.packages=c("rstan")
#.options.snow=opts,
) %dopar%{
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
message(paste0("Iteration ", i))
s1 <- rstan::sampling(object = m1,
data = ldat1[[i]],
chains = 4,
thin = 1,
iter = 2000,
refresh = 2000,
control = list(max_treedepth = 15)
)
smry <- summary(s1, probs = c(0.025, 0.975), pars = c("b0", "b1", "p0", "p1"))$summary
m <- as.data.frame(cbind(desc = 1, idx = i, smry))
m$par <- rownames(m)
return(m)
}
# ...
mres1 <- do.call(rbind, res1)
mres2 <- do.call(rbind, res2)
mres3 <- do.call(rbind, res3)
mres <- rbind(mres1, mres2, mres3)
lsav <- list(mres = mres, starttime = starttime, endtime = endtime)
saveRDS(lsav, "mres.RDS")
Then I plot the results.
library(tibble)
library(dplyr)
library(ggplot2)
d <- mres
rownames(d) <- NULL
d <- d %>%
dplyr::mutate(desc = case_when(
desc == 1 ~ "All Data",
desc == 2 ~ "Missing CC",
desc == 3 ~ "Missing Marg"
))
head(d)
d2 <- d %>% filter(par %in% c("b0", "b1")) %>%
dplyr::group_by(desc, par) %>%
dplyr::summarise(mu = mean(mean))
ggplot(d %>% filter(par %in% c("b0", "b1")),
aes(x = desc, y = mean))+
geom_jitter(height = 0, width = 0.15, alpha = 0.2)+
geom_hline(data = d2, aes(yintercept = mu, col = desc))+
facet_grid(~par)
d2 <- d %>% filter(par %in% c("p0", "p1")) %>%
dplyr::group_by(desc, par) %>%
dplyr::summarise(mu = mean(mean))
ggplot(d %>% filter(par %in% c("p0", "p1")),
aes(x = desc, y = mean))+
geom_jitter(height = 0, width = 0.1, alpha = 0.2)+
geom_hline(data = d2, aes(yintercept = mu, col = desc))+
facet_grid(~par)
which gives the distribution of the parameter estimates of interest in the figures below. The set of points on the LHS of each facet show the parameter estimates using the full dataset, the middle set of points shows complete case analysis (which drops 20% of the data) and the RHS set of points showing the marginalization approach. The horizontal lines show the means of each.
I can rationalize increased variability but I am curious as to why the estimates for b1 and p1 appear biased when the marginalization is invoked.
My first thought is that there might be something wrong in the stan model, but at this stage I don’t see it. Any advice appreciated.
logistic_003.stan (1.6 KB)
test_missing_data.R (4.3 KB)