Here’s a Stan program that implements the Heckman likelihood and fits the program to fake data. It recovers the parameters well. I haven’t yet fitted it to repeated simulated datasets though.
EDIT: The code below has an error in the likelihood, which @RachaelMeager caught in her post below (link). Follow the link for the explanation, the correct code, and an extension!
The likelihood contribution for observed units below should be:
log(Phi(sqrt(1 - rho^2)^(-1)*(mu_y2[n_pos[n]]
+ rho / sd1
* (y1[n] - mu_y1[n]))));
but is erroneously coded as:
log(Phi(sqrt(1 - rho^2)^(-1)*(mu_y2[n_pos[n]]
+ rho / (sd1 * sqrt(1 - rho^2))
* (y1[n] - mu_y1[n]))));
data {
int<lower=1> N;
int<lower=1> N_neg;
int<lower=1> N_pos;
int<lower=1> D;
vector[N_pos] y1;
int<lower=0,upper=1> y2[N];
matrix[N_pos,D] X1;
matrix[N,D] X2;
}
transformed data {
int<lower=1,upper=N> n_pos[N_pos];
int<lower=0,upper=N> n_neg[N_neg];
{
int i;
int j;
i = 1;
j = 1;
for (n in 1:N) {
if (y2[n] == 1) {
n_pos[i] = n;
i = i + 1;
} else {
n_neg[j] = n;
j = j + 1;
}
}
}
}
parameters {
real<lower=-1,upper=1> rho;
vector[D] b1;
vector[D] b2;
real<lower=0> sd1;
}
model {
vector[N] mu_y2;
vector[N_pos] mu_y1;
b1 ~ normal(0, 1);
b2 ~ normal(0, 1);
sd1 ~ normal(0, 1);
mu_y2 = X2 * b2;
mu_y1 = X1 * b1;
for (n in 1:N_neg) {
target += (log(Phi(-mu_y2[n_neg[n]])));
}
for (n in 1:N_pos) {
target += log(Phi(sqrt(1 - rho^2)^(-1)*(mu_y2[n_pos[n]]
+ rho / (sd1 * sqrt(1 - rho^2))
* (y1[n] - mu_y1[n]))));
}
y1 ~ normal(mu_y1, sd1);
}
set.seed(1234)
library(MASS)
library(rstan)
library(bayesplot)
set.seed(12222)
n <- 1000
d <- 5
sig1 <- abs(rnorm(1))
sig2 <- 1
rho <- 2*runif(1)-1
Sigma <- diag(c(sig1,sig2)) %*% matrix(c(1,rho,rho,1),2,2) %*% diag(c(sig1, sig2))
X <- matrix(rnorm(n * 3 * d),n,3 * d)
X1 <- X[,1:d]
X2 <- X[,(d+1):(2 * d)]
X3 <- X[,(2 * d+1):(3 * d)]
b1 <- rnorm(d)
b2 <- rnorm(d)
noise <- mvrnorm(n = n, mu = c(0, 0), Sigma = Sigma)
y2 <- (X2 %*% b2 + noise[,2]) >= 0
y1 <- (X1 %*% b1 + noise[,1])
y1_sel <- y1[y2]
X1_sel <- X1[y2,]
stan_data <- list(y2 = y2[,1], y1 = y1_sel, X1 = X1_sel,
X2 = X2, N = n, D = d, N_pos = sum(y2[,1]),
N_neg = as.integer(length(y2[,1]) - sum(y2[,1])))
heck <- stan_model(file='stan_code/heck.stan')
fit <- sampling(heck, data = stan_data, iter = 2000,chains=4,cores = 4)
draws <- as.matrix(fit, pars = c('rho','b1','b2','sd1'))
true <- c(rho, b1, b2, sig1)
bayesplot::mcmc_recover_intervals(draws, true)
fake_data_heck.R (967 Bytes)