Marginalize missing binary outcome variable for GLM

This is a follow-up to Impute binary outcome variable for GLM using Stan in R. Additionally, there is some relevant material here.

I am curious about the properties of the estimates of the regression parameters once we have marginalized over the missing values.

Based on the responses to the earlier questions, here is what I believe the Stan model should look like.

data {   
  // gets called once
  int<lower=0> N_obs;                 // number of observations
  int<lower=0,upper=250> N_mis;       // number of points to predict for fut check

  int y_obs[N_obs];                   // response variable
  int X_obs[N_obs];
  int X_mis[N_mis];
  
}
transformed data{
  // gets called once (after data block)
  int<lower=0> N = N_obs + N_mis;
}
parameters {
  // gets called every log prob evaluation
  real b0;
  real b1;
}
transformed parameters {
  // gets called every log prob evaluation
}
model {

  // gets called every log prob evaluation
  target += student_t_lpdf(b0 | 7, 0, 3);
  target += student_t_lpdf(b1 | 7, 0, 3);

  for(i in 1:N_obs){
    
    real eta;
    eta = (1-X_obs[i]) * b0 + X_obs[i] * b1;
    // update based on observed data
    target += bernoulli_logit_lpmf(y_obs[i] | eta);
  }
  
  for(i in 1:N_mis){
    real eta;
    // X is there for everything
    eta = (1-X_mis[i]) * b0 + X_mis[i] * b1;
    
    // but we do not have the response so marginalise over the possibilities
    target += log_mix(inv_logit(eta), 
                      bernoulli_logit_lpmf(1 | eta), 
                      bernoulli_logit_lpmf(0 | eta));
  }
}
generated quantities {
  
  // gets called once per draw
  
  /*
  predict the values for those unenrolled
  */
  real p0;
  real p1;
  real y_mis[N_mis];                   // response variable missing
  
  p0 = inv_logit(b0);
  p1 = inv_logit(b1);
  
  for(i in 1:N_mis) {
    real eta = (1 - X_mis[i])*b0 + X_mis[i]*b1;
    y_mis[i] = bernoulli_logit_rng(eta);
  }
}  

The Stan code is saved to the file logistic_003.stan.

I simulated data in R and incorporated it into three separate data structures: the first is for the situation where no missingness is present, the second assumes a complete-case analysis (i.e. drop the records with missing values), and the third is the setup that invokes the marginalization.

library(rstan)
library(doParallel)
library(foreach)
rstan::rstan_options(auto_write = TRUE)
Sys.setenv(LOCAL_CPPFLAGS = '-march=native')
options(mc.cores = parallel::detectCores())

library(dplyr)
library(tibble)

# GENERATE DATA

# generate data
set.seed(123)

ldat1 <- list()
ldat2 <- list()
ldat3 <- list()

nsim <- 2000
for(i in 1:nsim){
  N <- 250
  x <- rep(0:1, each = N/2)
  g0 <- 0.5
  g1 <- 0.8
  p <- (1-x)*g0 + x*g1
  y <- rbinom(N, 1, p)
  idx_ymis <- as.logical(rbinom(N, 1, 0.2))
  N_mis <- sum(idx_ymis)
  
  d <- tibble(y = y,
              idx_ymis = idx_ymis,
              x = x)
  
  # setup for no missing data analysis
  ld1 <- list(N_obs = N - 0,
              N_mis = 0,
              y_obs = d$y,
              X_obs = d$x,
              X_mis = numeric(0))
  
  # setup for complete case analysis
  ld2 <- list(N_obs = N - N_mis,
              N_mis = 0,
              y_obs = d$y[!d$idx_ymis],
              X_obs = d$x[!d$idx_ymis],
              X_mis = numeric(0))
  
  # set up for marginalized missing
  ld3 <- list(N_obs = N - N_mis,
              N_mis = N_mis,
              y_obs = d$y[!d$idx_ymis],
              X_obs = d$x[!d$idx_ymis],
              X_mis = d$x[d$idx_ymis])
  
  ldat1[[i]] <- ld1
  ldat2[[i]] <- ld2
  ldat3[[i]] <- ld3
}
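
For reference when reading the results later: the simulation draws y with probabilities g0 = 0.5 and g1 = 0.8 directly, so the true values the Stan model should recover are p0 = 0.5 and p1 = 0.8, and on the logit scale b0 = 0 and b1 = log(4), about 1.386. A quick check (my addition, not part of the original script):

# true values implied by the data-generating process above (sanity check)
c(p0 = 0.5, p1 = 0.8)
c(b0 = qlogis(0.5), b1 = qlogis(0.8))   # 0 and log(4), i.e. about 1.386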

I compiled the Stan model and then fit all the data sets. For brevity, the code below just fits the first data structure, but the attached files run the simulations across all the simulated data.

m1 <- rstan::stan_model("logistic_003.stan")

starttime <- Sys.time()
message("Start simulation at ", starttime)

# initiate clusters
debug = F
cl <- NA
if(!debug){
  cl <- makeCluster(4, outfile="")
  # cl <- makeCluster(3, outfile="")
  registerDoParallel(cl)
} else {
  registerDoSEQ()
}

# predictive probability
set.seed(66)

res1 <- foreach(i = 1:nsim,
                   .errorhandling = 'pass',
                   .packages=c("rstan")
                   #.options.snow=opts,
) %dopar%{
  
  options(mc.cores = parallel::detectCores())
  rstan_options(auto_write = TRUE)
  
  message(paste0("Iteration ", i))

  s1 <- rstan::sampling(object  = m1,
                        data    = ldat1[[i]],
                        chains  = 4,
                        thin    = 1,
                        iter    = 2000,
                        refresh = 2000,
                        control = list(max_treedepth = 15)
  )

  smry <- summary(s1, probs = c(0.025, 0.975), pars = c("b0", "b1", "p0", "p1"))$summary
  m <- as.data.frame(cbind(desc = 1, idx = i, smry))
  m$par <- rownames(m)
  return(m)
}

# ...
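# (sketch, my addition) the attached script presumably also stops the cluster
# and records the end time, which is used below when saving the results
if(!debug) stopCluster(cl)
endtime <- Sys.time()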
mres1 <- do.call(rbind, res1)
mres2 <- do.call(rbind, res2)
mres3 <- do.call(rbind, res3)
mres <- rbind(mres1, mres2, mres3)

lsav <- list(mres = mres, starttime = starttime, endtime = endtime)
saveRDS(lsav, "mres.RDS")

Then I plot the results.

library(tibble)
library(dplyr)
library(ggplot2)

d <- mres
rownames(d) <- NULL
d <- d %>%
  dplyr::mutate(desc = case_when(
    desc == 1 ~ "All Data",
    desc == 2 ~ "Missing CC",
    desc == 3 ~ "Missing Marg"
  ))

head(d)

d2 <- d %>% filter(par %in% c("b0", "b1")) %>%
  dplyr::group_by(desc, par) %>%
  dplyr::summarise(mu = mean(mean))

ggplot(d %>% filter(par %in% c("b0", "b1")), 
       aes(x = desc, y = mean))+
  geom_jitter(height = 0, width = 0.15, alpha = 0.2)+
  geom_hline(data = d2, aes(yintercept = mu, col = desc))+
  facet_grid(~par)

d2 <- d %>% filter(par %in% c("p0", "p1")) %>%
  dplyr::group_by(desc, par) %>%
  dplyr::summarise(mu = mean(mean))

ggplot(d %>% filter(par %in% c("p0", "p1")), 
       aes(x = desc, y = mean))+
  geom_jitter(height = 0, width = 0.1, alpha = 0.2)+
  geom_hline(data = d2, aes(yintercept = mu, col = desc))+
  facet_grid(~par)

which gives the distributions of the parameter estimates of interest in the figures below. The set of points on the LHS of each facet shows the parameter estimates using the full dataset, the middle set shows the complete-case analysis (which drops 20% of the data), and the RHS set shows the marginalization approach. The horizontal lines show the means of each.

[Figures res1 and res2: distributions of the estimates for b0/b1 (res1) and p0/p1 (res2) across the simulations.]

I can rationalize the increased variability, but I am curious as to why the estimates for b1 and p1 appear biased when the marginalization is invoked.

My first thought is that there might be something wrong in the Stan model, but at this stage I don't see it. Any advice would be appreciated.

logistic_003.stan (1.6 KB)
test_missing_data.R (4.3 KB)

2 Likes

I think there's a mistake in how the missing data is integrated out. I get super confused by this stuff, so I'll just show my work and you can check it yourself. I think the issue here is that the data is generated missing at random, and so we don't actually need to worry about the data that is missing.

If we have a posterior:

p(\theta, y_\text{miss} | y)

And we want to integrate out y_\text{miss}, then we want to do:

\int p(\theta, y_\text{miss} | y) dy_\text{miss}

We do our Bayes rule thing:

\frac{1}{p(y)} \int p(y | \theta, y_\text{miss})p(\theta, y_\text{miss})dy_\text{miss}

But that doesn’t really line up with our assumptions in this model. We assumed y and y_\text{miss} are independent:

\frac{1}{p(y)} \int p(y | \theta)p(y_\text{miss} | \theta) p(\theta)dy_\text{miss}

y_\text{miss} is just a Bernoulli so this is just:

\frac{1}{p(y)} p(y | \theta)p(y_\text{miss} = 0 | \theta) p(\theta) + \frac{1}{p(y)} p(y | \theta)p(y_\text{miss} = 1 | \theta) p(\theta)

This factors to:

\frac{1}{p(y)} p(y | \theta) p(\theta) \left( p(y_\text{miss} = 0 | \theta) + p(y_\text{miss} = 1 | \theta) \right)

And then just simplifies to:

\frac{p(y | \theta) p(\theta)}{p(y)}

So, long story short: since we're doing missing at random, we just do our inferences with the y we have, and it's the same as integrating out the missing data.

For the missingness to matter, the data we have should tell us something about the data we don’t have (y and y_\text{miss} would not be independent), otherwise we don’t learn anything. That’ll factor into the model somewhere and invalidate the algebra above.
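
A quick empirical illustration of this point (my own sketch, not from the original posts), using plain glm() on one data set generated the same way as in the simulation: the complete-case fit and the full-data fit agree up to sampling noise.

# simulate one data set as in the original script
set.seed(1)
N <- 250
x <- rep(0:1, each = N/2)
p <- (1 - x) * 0.5 + x * 0.8
y <- rbinom(N, 1, p)
mis <- as.logical(rbinom(N, 1, 0.2))   # outcomes missing completely at random

# logit(p0) and logit(p1) from all data vs. complete cases only
coef(glm(y ~ 0 + factor(x), family = binomial))
coef(glm(y[!mis] ~ 0 + factor(x[!mis]), family = binomial))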

6 Likes

Thanks for taking the time to write all this out, this is really helpful!

I hope it’s right lol.

2 Likes

Hi @maj684, welcome to the forum. Thanks for the well-written and well-documented question. I really want to help out when questions are so clearly stated.

In short, I believe the correct way to integrate out the discrete missingness is

target += log_mix(inv_logit(eta), 
                      bernoulli_logit_lpmf(1 | eta), 
                      bernoulli_logit_lpmf(0 | - eta))

I know you got this code from @bgoodri on Stack Overflow, and I am really not sure whether I am right or not. I'll also explain my working, which might make it easier to check.

From the data simulation, I understand that you have two subpopulations, one where x=0 and one where x=1, with associated parameters p_0 and p_1 respectively. The goal is to estimate these two parameters. Sometimes you have missing outcomes y, but for every observation you know whether p = p_0 or p = p_1, where logit(p) = \eta and logit(1-p) = -\eta in the code.

So what the log_mix code is saying is that, for each observation, there is a probability p that we end up in situation 1 (S1), where there is a probability p of getting y=1, and a probability 1 - p that we end up in situation 2 (S2), where there is a probability 1 - p of getting y=0. So,

pr(y_{missing} = 1 | S1) = p \text{ and } pr(y_{missing} = 1 | S2) = 1 - (1 - p) = p

pr(y_{missing} = 1) = p

I think that means that the whole mixture statement can actually be simplified to

target += bernoulli_logit_lpmf(1 | eta);

or

1 ~ bernoulli_logit(eta);

and you might be able to vectorize the whole statement.

This simplification will only work with the binomial distribution, I think. Incidentally, this is also how you generate the missing y in the generated quantities block, which gives me some confidence. It also reinforces Ben Bales' message that whether you incorporate this part in the model or not should not matter.


Some further modeling advice.

  • The priors for b0 and b1 are pretty wide. I quickly checked for a normal prior with a scale of 3, and it implies that p0 and p1 have a 45% probability of being smaller than .1 or higher than .9. A student-t will have an even higher probability (see the quick check after this list).
  • To really match your data simulation, this
eta = (1-X_obs[i]) * b0 + X_obs[i] * b1;
// update based on observed data
target += bernoulli_logit_lpmf(y_obs[i] | eta);

should be

eta = (1-X_obs[i]) * inv_logit(b0) + X_obs[i] * inv_logit(b1);
// update based on observed data
target += bernoulli_lpmf(y_obs[i] | eta);

It doesn't matter as long as x is 0 or 1, but it would make a difference for other values of x.
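
A quick way to check the first point about the priors (a sketch, my addition, using base R only):

# Normal(0, 3) prior on the logit scale: mass outside (0.1, 0.9) on the
# probability scale, roughly 0.46
pnorm(qlogis(0.1), 0, 3) + (1 - pnorm(qlogis(0.9), 0, 3))

# student_t(7, 0, 3), as in the model: heavier tails, so even more extreme mass
set.seed(1)
b <- 3 * rt(1e5, df = 7)
mean(plogis(b) < 0.1 | plogis(b) > 0.9)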

1 Like

@martinmodrak, do you mind checking my work here? If I am right, there is a slight mistake in this Cross Validated answer and we might want to set the record straight.

I think you’re stating that p(y,y^*\mid\theta)=p(y\mid\theta). Have I understood you right? If that equality is correct then we don’t need to marginalise out. But if it is the case that p(y,y^*\mid\theta)\ne p(y\mid\theta) then we must marginalise out. Given y=(y_1,...,y_n) and y^*=(y^*_1,...,y^*_m), it seems easy to show that the equality cannot be correct since p(y,y^*\mid\theta)=p(y\mid\theta)p(y^*\mid\theta) and p(y^*\mid\theta)\ne 1.

Forgive me if I’m being slow here, but I suspect that:

Let p=logit^{-1}(\eta) then bernoulli\_logit\_lpmf(x \mid \eta) = log(bernoulli(x \mid p))

Since we are marginalising over x, we cannot change the conditional, so:

For x=1, then bernoulli\_logit\_lpmf(x=1 \mid \eta) = log(p)

For x=0, then bernoulli\_logit\_lpmf(x=0 \mid \eta) = log(1-p)

Hence:

log\_mix(...) = log(p \times exp(log(p))+ (1-p) \times exp(log(1-p)))

= log(p^2 + (1-p)^2)

And nothing cancels out, so the marginalisation was correct as per the OP.
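
A quick numeric check of that algebra in R (my sketch; qlogis and dbinom stand in for Stan's inv_logit and bernoulli_logit_lpmf):

p   <- 0.8
eta <- qlogis(p)                       # eta = logit(p), as in the Stan code
lp1 <- dbinom(1, 1, p, log = TRUE)     # bernoulli_logit_lpmf(1 | eta) = log(p)
lp0 <- dbinom(0, 1, p, log = TRUE)     # bernoulli_logit_lpmf(0 | eta) = log(1 - p)
log(p * exp(lp1) + (1 - p) * exp(lp0)) # the log_mix term
log(p^2 + (1 - p)^2)                   # same value, and clearly not zero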

I’m sure I’ve made some glaring error somewhere. Please point it out!

I'm assuming p(y, y^* | \theta) = p(y | \theta) p(y^* | \theta), where y^* is y_\text{miss}. So the two bits of data are conditionally independent given \theta.

But are you further saying that y_\text{miss} can just be ignored and does not need to be marginalised out of the likelihood because it makes no difference, i.e. that p(y, y^* | \theta) = p(y | \theta)?

I am in no way an authority on this, but thanks @stijn for the trust :-)

I believe @bbbales2 is correct that if you assume independence of y and y_{miss} given \theta (i.e. p(y | \theta, y_{miss}) = p(y | \theta)), you can safely just run the model dropping the unobserved data and impute it by making predictions from the model. I have nothing to add to that.

Further, this independence has to hold unless the model is altered significantly: the model assumes y depends on \theta and nothing else. If the independence didn't hold, we would have to specify a formula for p(y | \theta, y_{miss}) that actually involves y_{miss}.

So where does the discrepancy between the models by @maj684 come from? As others did, I'd rather show my work and be checked, because I am also very aware of how easy it is to mess this up.

EDIT: The following is likely a bit misleading; one does not, in fact, need explicit parameters for the marginalization.

The simplest way to marginalize is to treat the marginalized values as explicit parameters in the model. If that were the case, we would have:

parameters {
  real b0;
  real b1;
  vector<lower=0,upper=1>[N_mis] y_miss_1; //probability that y_miss is 1;
}

and then within the for loop, we’ll have:

 target += log_mix(y_miss_1[i], 
                      bernoulli_logit_lpmf(1 | eta), 
                      bernoulli_logit_lpmf(0 | eta));

Using this model, the inferences become identical between “Missing CC” and “Missing Marg” (run with smaller N and nsim as the explicit marginalization easily becomes computationally expensive).


This IMHO illuminates what is wrong with the original model: it doesn't allow the flexibility of those additional parameters. It IMHO implies that p(y_{miss} = 1 | x_{miss} = 1) and p(y_{miss} = 1 | x_{miss} = 0) are correlated in a weird way (I can't really make out the exact maths around this).

This is IMHO validated by the fact that if you fix N_mis at a chosen value (and randomly choose which observations are missing), the discrepancy between “Missing CC” and “Missing Marg” grows as N_mis increases.
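
One way to see this numerically (my own sketch, building on @emiruz's algebra above): the original log_mix statement adds log(p^2 + (1-p)^2) to the target for every missing row. That term has its minimum at p = 0.5 and increases towards p = 0 or p = 1, so for the x = 1 group (true p1 = 0.8) it pulls the estimate upwards, and the pull grows with the number of missing rows, which lines up with bias showing up in b1 and p1 but not in b0 and p0.

# per-missing-observation contribution of the original log_mix statement,
# as a function of the underlying probability p
p <- seq(0.01, 0.99, by = 0.01)
extra <- log(p^2 + (1 - p)^2)
plot(p, extra, type = "l",
     xlab = "p", ylab = "log_mix contribution per missing row")
abline(v = c(0.5, 0.8), lty = 2)   # true p0 and p1 in the simulation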

I hope this illuminates more than confuses :-) I just spent too much time with this :-)


I also agree with @emiruz that @stijn’s proposed formula is suspicious, although I can’t immediately say why.

Finally, a small LaTeX tip: you can get IMHO better typesetting by writing logarithms and other common functions as $\log(x)$, which renders as \log(x) (here \log is a special command defined in LaTeX).

2 Likes

@emiruz \int p(y, y^* | \theta) dy^* = p(y | \theta) is always true, but it is possible to actually compute it in this case because y and y^* are independent (in general there is no reason for that integral to be easy to compute).
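
Writing that step out under the independence assumption (my addition, just to make it explicit):

\int p(y, y^* | \theta) dy^* = \int p(y | \theta) p(y^* | \theta) dy^* = p(y | \theta) \int p(y^* | \theta) dy^* = p(y | \theta)

since p(y^* | \theta) sums to 1 over the possible values of y^*.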

1 Like