I’ve been working with multivariate probit models recently, and it occurred to me that it would be nice to be able to handle missing outcomes.
Assuming missing values are coded as -1, and using @bgoodri's parametrization, would something like this make sense, or is it completely wrong?
real multiprobit_lpmf(
    int[] y,        // response data - integer array of 1s, 0s and -1s for missing data
    real[] mu,      // Intercept
    real[] u,       // nuisance that absorbs inequality constraints
    int K,          // response dimension size
    matrix L_Omega  // lower cholesky of correlation matrix
) {
  vector[K] z;      // latent continuous state
  vector[K] logLik; // log likelihood
  real prev;
  prev = 0;
  for (k in 1:K) {
    if (y[k] == -1) { // missing outcome: no likelihood contribution (z set to 0)
      z[k] = 0;
      logLik[k] = 0;
    }
    else {
      real bound; // threshold at which utility = 0
      bound = approx_Phi( -(mu[k] + prev) / L_Omega[k,k] );
      if (y[k] == 1) {
        real t;
        t = bound + (1 - bound) * u[k];
        z[k] = approx_inv_Phi(t); // implies utility is positive
        logLik[k] = log1m(bound); // Jacobian adjustment
      }
      else {
        real t;
        t = bound * u[k];
        z[k] = approx_inv_Phi(t); // implies utility is negative
        logLik[k] = log(bound);   // Jacobian adjustment
      }
    }
    if (k < K) prev = L_Omega[k+1,1:k] * head(z, k);
    // Jacobian adjustments imply z is truncated standard normal
    // thus utility --- mu + L_Omega * z --- is truncated multivariate normal
  }
  return sum(logLik);
}
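For intuition, the single-outcome version of the same transform can be written as a minimal R sketch (using exact pnorm/qnorm instead of the logistic approximation; mu = -0.8 is just an arbitrary example value): with u uniform on (0, 1), the mapping turns u into a standard normal z truncated to the side implied by y, and log1m(bound) / log(bound) are the matching log-probabilities.

## one-dimensional sketch of the transform above, with exact Phi / inv_Phi
mu <- -0.8                               # example intercept (assumed value)
u  <- runif(1e5)                         # the "nuisance" uniforms

bound <- pnorm(-mu)                      # P(utility = mu + z < 0)
z_pos <- qnorm(bound + (1 - bound) * u)  # y == 1: z truncated so that mu + z > 0
z_neg <- qnorm(bound * u)                # y == 0: z truncated so that mu + z < 0

stopifnot(all(mu + z_pos > 0), all(mu + z_neg < 0))
log1p(-bound)  # log-likelihood contribution for y == 1 (log1m(bound) in Stan)
log(bound)     # log-likelihood contribution for y == 0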
I simulated some data (N = 100, true intercepts -1 and 1, latent correlation 0.5) and got pretty reasonable results:
## packages assumed for this script: mvtnorm (rmvnorm), dplyr (mutate, across), rstan (sampling);
## model_1 is the compiled Stan model listed at the end, e.g. model_1 <- stan_model("multiprobit.stan")
library(mvtnorm)
library(dplyr)
library(rstan)

set.seed(1234)
y <- mutate(as.data.frame(rmvnorm(100, c(-1,1), matrix(c(1,0.5,0.5,1), nrow = 2))>0)
            , across(everything(), as.numeric))
y2 <- y
y2[1:3,1] <- -1
y2[20:25,2] <- -1
## y2[5:6,2] <- -1
data_test_1 <- list(N = 100
, K = 2
, y = y)
data_test_2 <- list(N = 100
, K = 2
, y = y2)
samples_1 <- sampling(model_1
, data = data_test_1
, cores = 4
, chains = 4
, iter = 1000
, warmup = 500)
samples_2 <- sampling(model_1
, data = data_test_2
, cores = 4
, chains = 4
, iter = 1000
, warmup = 500)
Gets me:
print(samples_1, pars = c("Intercept", "Omega"))
Inference for Stan model: anon_model.
4 chains, each with iter=1000; warmup=500; thin=1;
post-warmup draws per chain=500, total post-warmup draws=2000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
Intercept[1] -0.82 0.00 0.15 -1.13 -0.92 -0.82 -0.73 -0.54 2506 1.00
Intercept[2] 0.88 0.00 0.15 0.60 0.78 0.87 0.97 1.18 1798 1.00
Omega[1,1] 1.00 NaN 0.00 1.00 1.00 1.00 1.00 1.00 NaN NaN
Omega[1,2] 0.59 0.01 0.20 0.16 0.46 0.60 0.74 0.93 471 1.01
Omega[2,1] 0.59 0.01 0.20 0.16 0.46 0.60 0.74 0.93 471 1.01
Omega[2,2] 1.00 0.00 0.00 1.00 1.00 1.00 1.00 1.00 129 1.00
> print(samples_2, pars = c("Intercept", "Omega"))
Inference for Stan model: anon_model.
4 chains, each with iter=1000; warmup=500; thin=1;
post-warmup draws per chain=500, total post-warmup draws=2000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
Intercept[1] -0.80 0.00 0.14 -1.08 -0.90 -0.80 -0.71 -0.53 4794 1
Intercept[2] 0.89 0.00 0.16 0.59 0.79 0.89 1.00 1.21 4359 1
Omega[1,1] 1.00 NaN 0.00 1.00 1.00 1.00 1.00 1.00 NaN NaN
Omega[1,2] 0.52 0.01 0.20 0.10 0.38 0.54 0.66 0.84 951 1
Omega[2,1] 0.52 0.01 0.20 0.10 0.38 0.54 0.66 0.84 951 1
Omega[2,2] 1.00 0.00 0.00 1.00 1.00 1.00 1.00 1.00 550 1
But then again, maybe this is just a fluke?
–
The whole model:
functions {
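  // inv_logit(1.702 * x) is the usual logistic approximation to the standard
  // normal CDF Phi(x) (max absolute error roughly 0.01); approx_inv_Phi is its inverse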
  real approx_Phi(real x) {
    return inv_logit(x * 1.702);
  }
  real approx_inv_Phi(real x) {
    return logit(x) / 1.702;
  }
  real multiprobit_lpmf(
      int[] y,        // response data - integer array of 1s, 0s and -1s for missing data
      real[] mu,      // Intercept
      real[] u,       // nuisance that absorbs inequality constraints
      int K,          // response dimension size
      matrix L_Omega  // lower cholesky of correlation matrix
  ) {
    vector[K] z;      // latent continuous state
    vector[K] logLik; // log likelihood
    real prev;
    /* mu = Intercept + beta * x; */
    prev = 0;
    for (k in 1:K) { // Phi and inv_Phi may overflow and / or be numerically inaccurate
      if (y[k] == -1) { // missing outcome: no likelihood contribution (z set to 0)
        z[k] = 0;
        logLik[k] = 0;
      }
      else {
        real bound; // threshold at which utility = 0
        bound = approx_Phi( -(mu[k] + prev) / L_Omega[k,k] );
        if (y[k] == 1) {
          real t;
          t = bound + (1 - bound) * u[k];
          z[k] = approx_inv_Phi(t); // implies utility is positive
          logLik[k] = log1m(bound); // Jacobian adjustment
        }
        else {
          real t;
          t = bound * u[k];
          z[k] = approx_inv_Phi(t); // implies utility is negative
          logLik[k] = log(bound);   // Jacobian adjustment
        }
      }
      if (k < K) prev = L_Omega[k+1,1:k] * head(z, k);
      // Jacobian adjustments imply z is truncated standard normal
      // thus utility --- mu + L_Omega * z --- is truncated multivariate normal
    }
    return sum(logLik);
  }
}
data {
  int<lower=1> K;                // response dimension size
  int<lower=0> N;                // sample size
  int<lower=-1,upper=1> y[N, K]; // outcomes (-1 = missing)
}
parameters {
  vector[K] Intercept;
  cholesky_factor_corr[K] L_Omega;
  real<lower=0,upper=1> u[N,K]; // nuisance that absorbs inequality constraints
}
transformed parameters {
  real mu[N, K];
  for (k in 1:K) {
    mu[, k] = to_array_1d(Intercept[k] * rep_vector(1, N)); // intercept-only linear predictor, broadcast to all N rows
  }
}
model {
  L_Omega ~ lkj_corr_cholesky(2);
  Intercept ~ std_normal();
  // implicit: u is iid standard uniform a priori
  { // likelihood
    for (n in 1:N) {
      target += multiprobit_lpmf(y[n] | mu[n], u[n], K, L_Omega);
    }
  }
}
generated quantities {
  corr_matrix[K] Omega = multiply_lower_tri_self_transpose(L_Omega);
  vector[K] preds[N]; // posterior predictive draws of new binary outcomes
  for (n in 1:N) {
    {
      vector[K] preds_tmp = multi_normal_cholesky_rng(to_vector(mu[n]), L_Omega);
      for (k in 1:K)
        if (preds_tmp[k] > 0)
          preds[n,k] = 1;
        else
          preds[n,k] = 0;
    }
  }
}