I[1] have coded up the bivariate probit with sample selection ("heckprobit", as it is known in Stata; see here and here for examples) in Stan. The Stan implementation is below, and attached are the two datasets from Stata and the R code to fit them. On both datasets the estimates are a perfect match to Stata's.
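For reference, the per-observation log likelihood the model below implements (mirroring the comments in the model block) is

$$
\log L_i =
\begin{cases}
\log\{1 - \Phi(z_i'\gamma)\} & s_i = 0,\\
\log \Phi_2(x_i'\beta,\ z_i'\gamma;\ \rho) & s_i = 1,\ y_i = 1,\\
\log \Phi_2(-x_i'\beta,\ z_i'\gamma;\ -\rho) & s_i = 1,\ y_i = 0,
\end{cases}
$$

where $\Phi$ is the standard normal CDF and $\Phi_2(\cdot,\cdot;\rho)$ is the bivariate standard normal CDF with correlation $\rho$.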
The model is, perhaps unsurprisingly, rather slow to fit. Most annoyingly, it can easily suffer from stuck chains and the like.
I didn't find any pre-existing Stan implementation of this on the forum, but if anyone has links or ideas I'd appreciate any hints.
The bivariate CDF is from @James_Savage (here); if there are updated implementations of it I'd love to hear about them.
I'm looking for feedback on how to improve this; I'm sure there are lots of better ways to code it up.
Thanks!
functions {
  // Bivariate standard normal CDF Φ2(z1, z2; ρ), after @James_Savage.
  // Uses the Owen's T identity
  //   Φ2(z1, z2; ρ) = {Φ(z1) + Φ(z2) - δ}/2 - T(z1, a1) - T(z2, a2),
  // with a_i = (z_j/z_i - ρ)/sqrt(1 - ρ^2) and δ = 1 if z1*z2 < 0
  // (or z1*z2 = 0 and z1 + z2 < 0), else δ = 0.
  real binormal_cdf_js(real z1, real z2, real rho) {
    if (z1 != 0 || z2 != 0) {
      real denom = abs(rho) < 1.0 ? sqrt((1 + rho) * (1 - rho)) : not_a_number();
      real a1 = (z2 / z1 - rho) / denom;
      real a2 = (z1 / z2 - rho) / denom;
      real product = z1 * z2;
      real delta = product < 0 || (product == 0 && (z1 + z2) < 0);
      return 0.5 * (Phi(z1) + Phi(z2) - delta) - owens_t(z1, a1) - owens_t(z2, a2);
    }
    // z1 = z2 = 0: closed form Φ2(0, 0; ρ) = 1/4 + asin(ρ)/(2π)
    return 0.25 + asin(rho) / (2 * pi());
  }
}
data {
  int<lower=1> N;
  int<lower=1> P_x;
  int<lower=1> P_z;
  matrix[N, P_x] X;  // no intercept column
  matrix[N, P_z] Z;  // no intercept column
  array[N] int<lower=0, upper=1> s;
  array[N] int<lower=0, upper=1> y;
}
transformed data {
  // Compute column means/SDs and standardized matrices
  vector[P_x] x_mean;
  vector[P_x] x_sd;
  vector[P_z] z_mean;
  vector[P_z] z_sd;
  matrix[N, P_x] Xs;
  matrix[N, P_z] Zs;
  
  for (j in 1:P_x) {
    x_mean[j] = mean(col(X, j));
    x_sd[j]   = sd(col(X, j));
    // guard against zero-variance columns
    if (x_sd[j] <= 0) x_sd[j] = 1;
    Xs[:, j] = (col(X, j) - x_mean[j]) / x_sd[j];
  }
  for (j in 1:P_z) {
    z_mean[j] = mean(col(Z, j));
    z_sd[j]   = sd(col(Z, j));
    if (z_sd[j] <= 0) z_sd[j] = 1;
    Zs[:, j] = (col(Z, j) - z_mean[j]) / z_sd[j];
  }
}
parameters {
  // Parameters on the *standardized* scale
  real a_y_std;
  real a_s_std;
  vector[P_x] beta_std;
  vector[P_z] gamma_std;
  
  real eta_rho;   // unconstrained -> rho via tanh
}
transformed parameters {
  // Working linear predictors on standardized scale
  vector[N] mu_y = a_y_std + Xs * beta_std;
  vector[N] mu_s = a_s_std + Zs * gamma_std;
  real rho = tanh(eta_rho);
  
  // ---- Natural-scale coefficients for reporting ----
  vector[P_x] beta_nat;
  vector[P_z] gamma_nat;
  real a_y_nat;
  real a_s_nat;
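  // Back-transform: if x_std = (x - m) / s, then beta_std * x_std
  // = (beta_std / s) * x - (beta_std / s) * m, so beta_nat = beta_std / s
  // and the natural-scale intercept absorbs the -dot_product(beta_nat, x_mean)
  // shift (and analogously for gamma / z).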
  
  for (j in 1:P_x) beta_nat[j] = beta_std[j] / x_sd[j];
  for (j in 1:P_z) gamma_nat[j] = gamma_std[j] / z_sd[j];
  
  a_y_nat = a_y_std - dot_product(beta_nat, x_mean);
  a_s_nat = a_s_std - dot_product(gamma_nat, z_mean);
}
model {
  // Priors on standardized scale
  a_y_std   ~ normal(0, 5);
  a_s_std   ~ normal(0, 5);
  beta_std  ~ normal(0, 1);
  gamma_std ~ normal(0, 1);
  eta_rho   ~ normal(0, 1.5);
  
  // Likelihood via CDFs
  for (i in 1:N) {
    real Phis = Phi(mu_s[i]);  // Φ(zγ) via the exact normal CDF (not Phi_approx)
    if (s[i] == 0) {
      target += log1m(Phis);   // log{1 - Φ(zγ)}
    } else {
      // s=1
      if (y[i] == 1) {
        target += log( binormal_cdf_js(  mu_y[i],  mu_s[i],  rho) );   // Φ2(xβ, zγ;  ρ)
      } else {
        target += log( binormal_cdf_js( -mu_y[i],  mu_s[i], -rho) );   // Φ2(-xβ, zγ; -ρ)
      }
    }
  }
}
generated quantities {
  vector[N] log_lik;
  array[N] int<lower=0, upper=1> s_rep;
  array[N] int<lower=0, upper=1> y_rep;
  
  real ppc_rate_s_rep;
  real ppc_rate_y_given_s1_rep;
  real ppc_p11_rep;
  real ppc_p01_rep;
  real ppc_p10_rep;
  real ppc_p00_rep;
  real athrho = atanh(rho);   // Stata’s /athrho
  // RNG setup: Cholesky factor of the 2x2 latent-error correlation matrix
  matrix[2, 2] L;
  {
    matrix[2,2] Omega;
    Omega[1,1] = 1;    Omega[1,2] = rho;
    Omega[2,1] = rho;  Omega[2,2] = 1;
    L = cholesky_decompose(Omega);
  }
  
  int n_s1 = 0;
  int n_y1_s1 = 0;
  int c11 = 0; int c01 = 0; int c10 = 0; int c00 = 0;
  
  for (i in 1:N) {
    real Phis = Phi(mu_s[i]);
    real Phi2 = binormal_cdf_js(mu_y[i], mu_s[i], rho);
    if (s[i] == 0)       log_lik[i] = log1m(Phis);                          // log{1 - Φ(zγ)}
    else if (y[i] == 1)  log_lik[i] = log(Phi2);                            // log Φ2(xβ, zγ; ρ)
    else                 log_lik[i] = log_diff_exp(log(Phis), log(Phi2));   // log{Φ(zγ) - Φ2(xβ, zγ; ρ)} = log Φ2(-xβ, zγ; -ρ)
    
    // Posterior predictive via latent normals
    {
      vector[2] z;
      vector[2] eu;
      z[1] = normal_rng(0, 1);
      z[2] = normal_rng(0, 1);
      eu   = L * z;
      s_rep[i] = (a_s_std + dot_product(row(Zs, i), gamma_std) + eu[2]) > 0;
      y_rep[i] = (a_y_std + dot_product(row(Xs, i), beta_std) + eu[1]) > 0;
      if (s_rep[i] == 1) {
        n_s1 += 1;
        if (y_rep[i] == 1) { n_y1_s1 += 1; c11 += 1; } else { c01 += 1; }
      } else {
        if (y_rep[i] == 1) c10 += 1; else c00 += 1;
      }
    }
  }
  
  ppc_rate_s_rep = n_s1 / (1.0 * N);
  ppc_rate_y_given_s1_rep = n_s1 > 0 ? (n_y1_s1 / (1.0 * n_s1)) : not_a_number();
  ppc_p11_rep = c11 / (1.0 * N);
  ppc_p01_rep = c01 / (1.0 * N);
  ppc_p10_rep = c10 / (1.0 * N);
  ppc_p00_rep = c00 / (1.0 * N);
}
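For completeness, here is a minimal sketch of how this can be fit from R with cmdstanr. The column names, formulas, and the file name heckprobit.stan are illustrative placeholders only; the attached heckprobit_canon.R has the actual fitting code.

library(cmdstanr)

# Illustrative placeholders: the real variable names live in the attached
# heckprobit_canon.R and the two CSVs.
d <- read.csv("school.csv")

X <- model.matrix(~ 0 + x1 + x2, data = d)       # outcome covariates, no intercept column
Z <- model.matrix(~ 0 + x1 + x2 + z1, data = d)  # selection covariates, no intercept column

stan_data <- list(
  N   = nrow(d),
  P_x = ncol(X),
  P_z = ncol(Z),
  X   = X,
  Z   = Z,
  s   = d$s,                       # selection indicator (0/1)
  y   = ifelse(d$s == 1, d$y, 0L)  # y must be 0/1 everywhere; it only enters the likelihood when s == 1
)

mod <- cmdstan_model("heckprobit.stan")
fit <- mod$sample(data = stan_data, chains = 4, parallel_chains = 4)

# Natural-scale coefficients and the correlation, comparable to Stata's heckprobit output
fit$summary(c("a_y_nat", "beta_nat", "a_s_nat", "gamma_nat", "rho", "athrho"))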
school.csv (6.3 KB)
heartsm.csv (20.4 KB)
heckprobit_canon.R (11.7 KB)
[1] With significant help from gpt-5.