Heckman selection model code + simulation

Here’s a Stan program that implements the Heckman likelihood and fits the program to fake data. It recovers the parameters well. I haven’t yet fitted it to repeated simulated datasets though.
EDIT: The code below has an error in the likelihood, which @RachaelMeager caught in her post below (link). Follow the link for the explanation, the correct code, and an extension!

The likelihood contribution for observed units below should be:

log(Phi(sqrt(1 - rho^2)^(-1)*(mu_y2[n_pos[n]] 
                               + rho / sd1
                               * (y1[n] - mu_y1[n]))));

but is erroneously coded as:

log(Phi(sqrt(1 - rho^2)^(-1)*(mu_y2[n_pos[n]] 
                               + rho / (sd1 * sqrt(1 - rho^2))
                               * (y1[n] - mu_y1[n]))));
data {
  int<lower=1> N;
  int<lower=1> N_neg;
  int<lower=1> N_pos;
  int<lower=1> D;
  vector[N_pos] y1;
  int<lower=0,upper=1> y2[N];
  matrix[N_pos,D] X1;
  matrix[N,D] X2;
}
transformed data {
  int<lower=1,upper=N> n_pos[N_pos];
  int<lower=0,upper=N> n_neg[N_neg];
  {
    int i;
    int j;
    i = 1;
    j = 1;
    for (n in 1:N) {
      if (y2[n] == 1) {
        n_pos[i] = n;
        i = i + 1;
      } else {
        n_neg[j] = n;
        j = j + 1;
      }
    }
  }
}
parameters {
  real<lower=-1,upper=1> rho;
  vector[D] b1;
  vector[D] b2;
  real<lower=0> sd1;
}
model {
  vector[N] mu_y2;
  vector[N_pos] mu_y1;

  b1 ~ normal(0, 1);
  b2 ~ normal(0, 1);
  sd1 ~ normal(0, 1);

  mu_y2 = X2 * b2;
  mu_y1 = X1 * b1;

  for (n in 1:N_neg) {
    target += (log(Phi(-mu_y2[n_neg[n]])));
  }
  for (n in 1:N_pos) {
    target += log(Phi(sqrt(1 - rho^2)^(-1)*(mu_y2[n_pos[n]] 
                               + rho / (sd1 * sqrt(1 - rho^2))
                               * (y1[n] - mu_y1[n]))));
  }
  y1 ~ normal(mu_y1, sd1);
}
set.seed(1234)
library(MASS)
library(rstan)
library(bayesplot)
set.seed(12222)
n <- 1000
d <- 5

sig1 <- abs(rnorm(1))
sig2 <- 1
rho <- 2*runif(1)-1
Sigma <- diag(c(sig1,sig2)) %*% matrix(c(1,rho,rho,1),2,2) %*% diag(c(sig1, sig2))

X <- matrix(rnorm(n * 3 * d),n,3 * d)
X1 <- X[,1:d]
X2 <- X[,(d+1):(2 * d)]
X3 <- X[,(2 * d+1):(3 * d)]

b1 <- rnorm(d)
b2 <- rnorm(d)

noise <- mvrnorm(n = n, mu = c(0, 0), Sigma = Sigma)

y2 <- (X2 %*% b2 + noise[,2]) >= 0
y1 <- (X1 %*% b1 + noise[,1])
y1_sel <- y1[y2]
X1_sel <- X1[y2,] 

stan_data <- list(y2 = y2[,1], y1 = y1_sel, X1 = X1_sel, 
                  X2 = X2, N = n, D = d, N_pos = sum(y2[,1]), 
                  N_neg = as.integer(length(y2[,1]) - sum(y2[,1])))
heck <- stan_model(file='stan_code/heck.stan')
fit <- sampling(heck, data = stan_data, iter = 2000,chains=4,cores = 4)
draws <- as.matrix(fit, pars = c('rho','b1','b2','sd1')) 
true <- c(rho, b1, b2, sig1)

bayesplot::mcmc_recover_intervals(draws, true)

fake_data_heck.R (967 Bytes)

6 Likes