This example is heavily inspired from the inspiring (redundant word) rethinking book. Let us assume two predictor variables and one gaussian outcome. The two predictors are co-related by the causal effects of un unobserved factor. Moreover, the two variables have missing values. A dag for this model could be:
library(ggdag)
dagify(X_obs ~ X + X_R,
Y_obs ~ Y + Y_R,
X ~ U,
Y ~ U,
X_R ~ Y,
Y_R ~ X,
O ~ X + Y)
To simulate the data in R for example:
library(rethinking)
library(tidyverse)
set.seed(9)
N <- 100L
mu <- 5
U <- rbinom(N, size = 2, c(.3, .4, .3))
b_x <- (-3)
b_y <- -5
U_x <- 3
U_y <- 2
k_U <- c(0, U_x, U_y)
k_x <- -.5
k_y <- -1
r_x <- 0.2
r_y <- 0.2
## Using unobserved U to sample x and y
cat_x <- U %>% map_dbl(function(u){
if(u == 1){
U_x
} else {
0
}
})
cat_y <- U %>% map_dbl(function(u){
if(u == 2){
U_y
} else {
0
}
})
x <- rbern( N , inv_logit(k_x + cat_x ) )
y <- rbern( N, inv_logit(k_y + cat_y ) )
R_x <- rbern( N , r_x )
R_y <- rbern( N, r_y )
x_obs <- x
x_obs[R_x==1] <- (-9L)
y_obs <- y
y_obs[R_y==1] <- (-9L)
sigma <- 0.5
simdata <- tibble(id = 1:N,
eta = mu + b_x * x + b_y * y,
outcome = rnorm(N, eta, sigma))
Now the model in Stan:
data {
int N;
int R_x[N]; // is remove R_ if remove R_ = 1 otherwise R_= 0
int R_y[N];
int x[N]; // binary variables with -99 in missing values
int y[N];
vector[N] outcome; // outcome variable assume a gaussian outcome
}
transformed data {
int x_id[N];
int y_id[N];
for(n in 1:N){
x_id[n] = x[n] + 1;
y_id[n] = y[n] + 1;
}
}
parameters {
real mu; // intercept
real<lower=0> sigma;
real b_x; // estimated effect of x
real b_y; // estimated effect of y
real theta_raw_x; // intercept for imputation model
real theta_raw_y; //
real beta_y_x; // beta y on x in imputation model
real beta_x_y; // beta x on y in imputation model
}
transformed parameters {
real<lower = 0, upper = 1> theta_x_y0 = inv_logit(theta_raw_x);
real<lower = 0, upper = 1> theta_x_y1 = inv_logit(theta_raw_x + beta_y_x);
real<lower = 0, upper = 1> theta_y_x0 = inv_logit(theta_raw_y);
real<lower = 0, upper = 1> theta_y_x1 = inv_logit(theta_raw_y + beta_x_y);
vector<lower = 0, upper = 1>[2] theta_x = [theta_x_y0, theta_x_y1]';
vector<lower = 0, upper = 1>[2] theta_y = [theta_y_x0, theta_y_x1]';
}
model {
// priors
// Benoullli missing model, default priors
target += cauchy_lpdf (theta_raw_x | 0.0, 10.0);
target += cauchy_lpdf (beta_y_x | 0.0, 2.5);
target += cauchy_lpdf (theta_raw_y | 0.0, 10.0);
target += cauchy_lpdf (beta_x_y | 0.0, 2.5);
// priors for outcome model
target += normal_lpdf (mu | 0.0, 10.0);
target += normal_lpdf (b_x | 0.0, 2.5);
target += normal_lpdf (b_y | 0.0, 2.5);
target += cauchy_lpdf (sigma | 0.0, 1.0);
// likelihood for imputation (missing sub-model)
for ( i in 1:N ) {
if ( R_x[i] == 1 && R_y[i] == 1) {
// do nothing
} else if ( R_x[i] == 1 && R_y[i] == 0) {
// missing model
target += log_sum_exp(
log ( theta_x[y_id[i]] ) +
bernoulli_logit_lpmf (y[i] | theta_raw_y),
log (1 - theta_x[y_id[i]]) +
bernoulli_logit_lpmf (y[i] | theta_raw_y + beta_x_y) );
} else if ( R_x[i] == 0 && R_y[i] == 1) {
// missing model
target += log_sum_exp(
log (theta_y[x_id[i]]) +
bernoulli_logit_lpmf (x[i] | theta_raw_x),
log (1 - theta_y[x_id[i]]) +
bernoulli_logit_lpmf (x[i] | theta_raw_x + beta_y_x) );
} else if ( R_x[i] == 0 && R_y[i] == 0 ) { // if x is not missing, calculates eta.
target += bernoulli_logit_lpmf (x[i] | theta_raw_x + beta_y_x * y[i]);
target += bernoulli_logit_lpmf (y[i] | theta_raw_y + beta_x_y * x[i]);
} // close if else missing
} // close for loop individuals
// likelihood contribution for outcome model (splitted by missing and non-missing)
for ( i in 1:N ) { // for each individual
if ( R_x[i] == 1 && R_y[i] == 1) { // if x and y are missing
target += log_sum_exp(
[log (theta_x[2]) + log (theta_y[2]) +
normal_lpdf (outcome[i] | mu + b_x * 1 + b_y * 1, sigma),
log (theta_x[1]) + log( 1 - theta_y[2]) +
normal_lpdf (outcome[i] | mu + b_x * 1 + b_y * 0, sigma),
log (1 - theta_x[2]) + log (theta_y[1]) +
normal_lpdf (outcome[i] | mu + b_x * 0 + b_y * 1, sigma),
log (1 - theta_x[1]) + log (1 - theta_y[1]) +
normal_lpdf (outcome[i] | mu + b_x * 0 + b_y * 0, sigma)]' );
} else if ( R_x[i] == 1 && R_y[i] == 0) {
target += log_sum_exp(
log(theta_x[y_id[i]]) +
normal_lpdf (outcome[i] | mu + b_x * 1 + b_y * y[i], sigma),
log(1 - theta_x[y_id[i]]) +
normal_lpdf (outcome[i] | mu + b_x * 0 + b_y * y[i], sigma) );
} else if ( R_x[i] == 0 && R_y[i] == 1) {
target += log_sum_exp(
log(theta_y[x_id[i]]) +
normal_lpdf (outcome[i] | mu + b_x * x[i] + b_y * 1, sigma),
log(1 - theta_y[x_id[i]]) +
normal_lpdf (outcome[i] | mu + b_x * x[i] + b_y * 0, sigma)) ;
} else if ( R_x[i] == 0 && R_y[i] == 0 ) { // if x is not missing, calculates eta.
target += normal_lpdf (outcome[i] | mu + b_x * x[i] + b_y * y[i], sigma);
} else {
reject("Value of R_ not recognised. Is there something wrong with the R_ (missing variables)? (events)")
} // close if else missing
} // close for loop individuals
}
All this goes well, my question then is this: there is clearly a pattern emerging here, well it is the classical forking path pattern re-emerging. Because the goal is to use this techniques with many more covariates, maybe 10? The combinations explode to 100. This will be hard to code bug-prone. Is there any better way to do this currently in Stan? In other words a more compact way to create this forking path? In case that there is no better way than hard coding the forking path would you imagine a better way in the future with new Stan releases?
Thanks you,