I[1] have coded up the bivariate probit with sample selection (“Heckprobit” as it is known on STATA - see here and here for examples) in Stan. See the Stan implementation below, and attached are the two datasets from STATA and the R code to fit these. The estimation on these is a perfect match.
The model is – unsurprisingly perhaps – rather slow to fit. Most annoyingly it can easily suffer from stuck chains etc.
I didn’t find any pre-existing Stan implementation of this on the forum, but if anyone has links or ideas I’d appreciate any hint.
The bivariate CDF is from @James_Savage (here) – if there are updated implementations of this I’d love to hear;
I’m looking for feedback on how to improve this – I’m sure there are lots of better ways to code this up.
Thanks !
functions {
// James Savage binormal_cdf
real binormal_cdf_js(real z1, real z2, real rho) {
if (z1 != 0 || z2 != 0) {
real denom = abs(rho) < 1.0 ? sqrt((1 + rho) * (1 - rho)) : not_a_number();
real a1 = (z2 / z1 - rho) / denom;
real a2 = (z1 / z2 - rho) / denom;
real product = z1 * z2;
real delta = product < 0 || (product == 0 && (z1 + z2) < 0);
return 0.5 * (Phi(z1) + Phi(z2) - delta) - owens_t(z1, a1) - owens_t(z2, a2);
}
return 0.25 + asin(rho) / (2 * pi());
}
}
data {
int<lower=1> N;
int<lower=1> P_x;
int<lower=1> P_z;
matrix[N, P_x] X; // no intercept column
matrix[N, P_z] Z; // no intercept column
array[N] int<lower=0, upper=1> s;
array[N] int<lower=0, upper=1> y;
}
transformed data {
// Compute column means/SDs and standardized matrices
vector[P_x] x_mean;
vector[P_x] x_sd;
vector[P_z] z_mean;
vector[P_z] z_sd;
matrix[N, P_x] Xs;
matrix[N, P_z] Zs;
for (j in 1:P_x) {
vector[N] colx;
for (i in 1:N) colx[i] = X[i, j];
x_mean[j] = mean(colx);
x_sd[j] = sd(colx);
// guard against zero-variance
if (x_sd[j] <= 0) x_sd[j] = 1;
for (i in 1:N) Xs[i, j] = (X[i, j] - x_mean[j]) / x_sd[j];
}
for (j in 1:P_z) {
vector[N] colz;
for (i in 1:N) colz[i] = Z[i, j];
z_mean[j] = mean(colz);
z_sd[j] = sd(colz);
if (z_sd[j] <= 0) z_sd[j] = 1;
for (i in 1:N) Zs[i, j] = (Z[i, j] - z_mean[j]) / z_sd[j];
}
}
parameters {
// Parameters on the *standardized* scale
real a_y_std;
real a_s_std;
vector[P_x] beta_std;
vector[P_z] gamma_std;
real eta_rho; // unconstrained -> rho via tanh
}
transformed parameters {
// Working linear predictors on standardized scale
vector[N] mu_y = a_y_std + Xs * beta_std;
vector[N] mu_s = a_s_std + Zs * gamma_std;
real rho = tanh(eta_rho);
// ---- Natural-scale coefficients for reporting ----
vector[P_x] beta_nat;
vector[P_z] gamma_nat;
real a_y_nat;
real a_s_nat;
for (j in 1:P_x) beta_nat[j] = beta_std[j] / x_sd[j];
for (j in 1:P_z) gamma_nat[j] = gamma_std[j] / z_sd[j];
a_y_nat = a_y_std - dot_product(beta_nat, x_mean);
a_s_nat = a_s_std - dot_product(gamma_nat, z_mean);
}
model {
// Priors on standardized scale
a_y_std ~ normal(0, 5);
a_s_std ~ normal(0, 5);
beta_std ~ normal(0, 1);
gamma_std ~ normal(0, 1);
eta_rho ~ normal(0, 1.5);
// Likelihood via CDFs
for (i in 1:N) {
real Phis = Phi(mu_s[i]); // exact Φ
if (s[i] == 0) {
target += log1m(Phis); // log{1 - Φ(zγ)}
} else {
// s=1
if (y[i] == 1) {
target += log( binormal_cdf_js( mu_y[i], mu_s[i], rho) ); // Φ2(xβ, zγ; ρ)
} else {
target += log( binormal_cdf_js( -mu_y[i], mu_s[i], -rho) ); // Φ2(-xβ, zγ; -ρ)
}
}
}
}
generated quantities {
vector[N] log_lik;
array[N] int<lower=0, upper=1> s_rep;
array[N] int<lower=0, upper=1> y_rep;
real ppc_rate_s_rep;
real ppc_rate_y_given_s1_rep;
real ppc_p11_rep;
real ppc_p01_rep;
real ppc_p10_rep;
real ppc_p00_rep;
real athrho = atanh(rho); // Stata’s /athrho
// RNG setup
matrix[2,2] L;
{
matrix[2,2] Omega;
Omega[1,1] = 1; Omega[1,2] = rho;
Omega[2,1] = rho; Omega[2,2] = 1;
L = cholesky_decompose(Omega);
}
int n_s1 = 0;
int n_y1_s1 = 0;
int c11 = 0; int c01 = 0; int c10 = 0; int c00 = 0;
for (i in 1:N) {
real Phis = Phi(mu_s[i]);
real Phi2 = binormal_cdf_js(mu_y[i], mu_s[i], rho);
if (s[i] == 0) log_lik[i] = bernoulli_lpmf(0 | Phis);
else if (y[i] == 1) log_lik[i] = log(Phi2);
else log_lik[i] = log_diff_exp(log(Phis), log(Phi2));
// Posterior predictive via latent normals
{
vector[2] z;
vector[2] eu;
z[1] = normal_rng(0, 1);
z[2] = normal_rng(0, 1);
eu = L * z;
s_rep[i] = (a_s_std + dot_product(row(Zs, i), gamma_std) + eu[2]) > 0;
y_rep[i] = (a_y_std + dot_product(row(Xs, i), beta_std) + eu[1]) > 0;
if (s_rep[i] == 1) {
n_s1 += 1;
if (y_rep[i] == 1) { n_y1_s1 += 1; c11 += 1; } else { c01 += 1; }
} else {
if (y_rep[i] == 1) c10 += 1; else c00 += 1;
}
}
}
ppc_rate_s_rep = n_s1 / (1.0 * N);
ppc_rate_y_given_s1_rep = n_s1 > 0 ? (n_y1_s1 / (1.0 * n_s1)) : not_a_number();
ppc_p11_rep = c11 / (1.0 * N);
ppc_p01_rep = c01 / (1.0 * N);
ppc_p10_rep = c10 / (1.0 * N);
ppc_p00_rep = c00 / (1.0 * N);
}
school.csv (6.3 KB)
heartsm.csv (20.4 KB)
heckprobit_canon.R (11.7 KB)
[1] With significant help form gpt-5.
1 ↩︎