Hi all –
I’ve put together a little Heckman selection type model. It seems to run OK, but in simulation I’m finding the correlation parameter rho
is biased towards 1. @bgoodri or @jonah (or others) might have some ideas on what to do.
The Stan code:
// saved as heckman_correction.stan
functions {
vector inverse_mills(vector z) {
vector[rows(z)] out;
for(i in 1:rows(z)) {
out[i] = exp(normal_lpdf(z[i] | 0, 1)) / (Phi(z[i])); // yes I know but I pass it positive z rather than negative...
}
return(out);
}
}
data {
int N; // number of observations
int P; // number of covariates
int P2; // number of instruments
vector[N] Y; // income or log of income
vector<lower = 0, upper = 1>[N] participation; // participation in the workforce
matrix[N, P] X; // covariates
matrix[N, P2] Z_raw; // instruments
int estimate_model;
}
transformed data {
matrix[N, P + P2] Z = append_col(X, Z_raw);
}
parameters {
real alpha;
real alpha_1;
vector[P] beta;
vector[P + P2] gamma;
real<lower = 0> sigma_u;
real<lower = -1, upper = 1> rho;
}
transformed parameters {
vector[N] p = Phi(alpha_1 + Z * gamma);
}
model {
// priors
alpha ~ student_t(3, 0, 2);
alpha_1 ~ student_t(3, 0, 2);
beta ~ normal(0, 1);
gamma ~ normal(0, 1);
sigma_u ~ inv_gamma(1.5, 2);
rho ~ normal(0, .5);
// log likelihood for selection model
target += participation' * log(p) + (1 - participation)' * log(1 - p);
// log likelihood for outcome model
for(n in 1:N) {
if(participation[n] == 1.0) {
Y[n] ~ normal(alpha + X[n] * beta + sigma_u * rho * inverse_mills(rep_vector(p[n],1) ), sigma_u);
}
}
}
And some R code to simulate and run it
library(rstan); library(tidyverse)
# Simulate some fake data
N <- 500
P <- 5
P2 <- 5
X = matrix(rnorm(N*P, 0, .5), N, P)
Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2)
gamma <- rnorm(P + P2)
Z <- cbind(X, Z_raw)
alpha_1 <- rnorm(1)
alpha <- rnorm(1)
beta <- rnorm(P)
sigma_u <- abs(rnorm(1))
# Draw the structural shocks
structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2))
# Simulate reporting a salary (participation) and the wage
participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0)
wage <- participation * (alpha + X %*% beta + structural_shocks[,2])
# Record the actual probabilities
actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma)
# Create the data list
data_list <- list(N = N,
P = P,
P2 = P2,
X = X, Z_raw = Z_raw,
Y = as.numeric(wage),
participation = participation,
estimate_model = 1)
# Compile the model
compiled_model <- stan_model("heckman_correction.stan")
# Sample
fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4)
fit_sim
confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T)
confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T)
data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"),
values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>%
unnest() %>%
mutate(estimates = confints$estimate,
lower = confints$conf.low,
upper = confints$conf.high) %>%
ggplot(aes(x = values, y = estimates, colour = parameter)) +
geom_point() +
geom_linerange(aes(ymin = lower, ymax = upper)) +
geom_abline(aes(intercept= 0, slope = 1))
data_frame(parameter = "p",
values = list(actual_p)) %>%
unnest() %>%
mutate(estimates = confints_p$estimate,
lower = confints_p$conf.low,
upper = confints_p$conf.high,
misestimated = values < lower | values > upper) %>%
ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) +
geom_point() +
scale_color_manual(values = c("black", "red")) +
guides(alpha = F) +
geom_linerange(aes(ymin = lower, ymax = upper)) +
geom_abline(aes(intercept= 0, slope = 1))
Which gives plots like these and
Thanks!