Dear Stan-users,
I am in the process of developing a simple heckman selection model (aka Tobit Type 2 model) using the Wooldridge data set “mroz” (Wooldridge, 2002, page 565, example 17.6). The data set contains the wages for married females and a set of covariates. However, we only observe wages for 428 women who choose to work. We do not observe the wages for the 325 women who are not working.
The heckman model works with a probit selection equation to capture whether or not a woman is working (whether or not wage is observed) and an outcome equation to analyze the influence of covariates on wage.
My code works with a latent series that links selection and outcome (wage) equation. Furthermore, it splits the estimation of the outcome equation for observed and unobserved wages (missing values).
For the outcome equation, I am able to estimate coefficients that are similar to the estimates from a frequentist model based on the mroz data set. Also, my code can recover the parameters in the outcome equation from simulated data quite well.
For the selection equation, all of my estimates are off.
I would greatly appreciate input on the following questions:
- Do you agree with the general approach that I chose?
- Where’s the mistake that causes the estimates in the selection (probit equation) to be biased?
Below is the code for the model and a R script that automatically loads and analyzes the data set. The R script also includes a simulation for a tobit type 2 selection model to show that my results are not an artifact of the mroz data set.
Any help would be greatly appreciated!
Wooldridge, J. (2002): Econonometric analysis of cross section and panel data, MIT Press.
######### heckman selection model with mroz dataset###########################
library(MASS)
library(rstan)
library(shinystan)
library(foreign)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
set.seed(20151204)
rm(list=ls())
###########simulated data###########
####number of observations
N <- 700
####correlated error terms
Sigma <- matrix(c(1, .5, .5, 1), nrow=2, ncol=2)
errors <- mvrnorm(n=N, mu=c(0, 0), Sigma=Sigma)
e1 <- errors[,1]
e2 <- errors[,2]
####variables for outcome eqn
x0a <- rnorm(N,0,1)
x0b <- rnorm(N,0,1)
x1 <- rnorm(N,0,1)
x2 <- rnorm(N,0,1)
####selection eqn
z_star <- 1*x0a+1*x0b+1*x1+1*x2+e2
####selection variable
z = as.numeric(pnorm(z_star)>.25)
####outcome variable and eqn
y <- ifelse(z==1,1+1*x1+1*x2+e2,0)
####frequentist results
ols <- lm(y ~ x1 + x2, subset=z==1)
summary(ols)
library(sampleSelection)
heckit.ml <- heckit(selection = z ~ x0a + x0b +x1 + x2, outcome = y ~ x1 + x2, method = "ml")
summary(heckit.ml)
#### create intercepts
intercept <- rep(1, N)
#### independent variables that go into selection eqn
xa <- matrix(c(intercept, x0a,x0b, x1, x2), nrow=N, ncol=5)
nxa = ncol(xa)
#### independent variables that go into outcome eqn
xb <- matrix(c(intercept, x1, x2), nrow=N, ncol=3)
nxb = ncol(xb)
################# prep data for Stan######################
dat <- list(
lwage = y,
xa = xa,
xb = xb,
nxa = nxa,
nxb = nxb,
N = N,
P = 1,
observed = z
)
############# load mroz data ############
#mydata <- read.dta("http://www.uam.es/personal_pdi/economicas/rsmanga/docs/mroz.dta")
#######prepare data########
####dependent variable for outcome eqn
#lwage <- mydata$lwage
#replace unobserved values with 0
#lwage[is.na(lwage)]<-0
####independent variables for outcome eqn
#educ <- mydata$educ
#exper <- mydata$exper
#expersq <- mydata$expersq
####dependent variable for selection eqn
#observed <- mydata$inlf
####additional independent variables for selection eqn
#nwifeinc <- mydata$nwifeinc
#age <- mydata$age
#kidslt6 <- mydata$kidslt6
#kidsge6 <- mydata$kidsge6
####total number of obs
#N <- length(lwage)
#### create intercepts
#intercept <- rep(1, N)
#### independent variables that go into selection eqn
#xa <- matrix(c(intercept, nwifeinc, educ, exper, expersq, age, kidslt6, kidsge6), nrow=N, ncol=8)
#nxa = ncol(xa)
#### independent variables that go into outcome eqn
#xb <- matrix(c(intercept, educ, exper, expersq), nrow=N, ncol=4)
#nxb = ncol(xb)
################# prep data for Stan######################
#dat <- list(
# lwage = lwage,
# xa = xa,
# xb = xb,
# nxa = nxa,
# nxb = nxb,
# N = N,
# P = 1,
# observed = observed
#)
mroz_heckman <- '
data {
int N; // number of obs
int P; // number of continuous outcomes
int observed[N]; // dependent variable for selection eqn
int<lower=1> nxa; // number of variables in selection eqn
int<lower=1> nxb; // number of variables in outcome eqn
vector[N] lwage; // dependent variable for outcome eqn
vector[nxa] xa[N]; // variables for selection eqn
vector[nxb] xb[N]; // variables for outcome eqn
}
parameters{
vector[N] latent_series; // latent series to link selection and outcome eqn
vector[N] wage_mis; // variable capturing missing lwages
row_vector[nxa] alpha; // coefficients for selection eqn
row_vector[nxb] beta; // coefficients for outcome eqn
vector<lower=0>[P] scale;
corr_matrix[P+1] Omega;
}
transformed parameters {
matrix[N, P+1] Y2;
matrix[N, P+1] Y2_m;
vector[P+1] scales;
Y2 = append_col(lwage, latent_series);
Y2_m = append_col(wage_mis, latent_series);
scales[1]=scale[1];
scales[2] = 1.0; // set to one because of probit
}
model{
vector[P+1] mu[N];
scale ~ student_t(4,0,1);
Omega ~ lkj_corr(4);
for(n in 1:N) {
mu[n,1] = beta*xb[n]; // for outcome eqn
mu[n,2] = alpha*xa[n]; // for latent series that later goes into selection eqn
}
for(n in 1:N) {
if( observed[n] == 1 ){
Y2[n] ~ multi_normal(mu[n], quad_form_diag(Omega, scales)); // linear model for outcome eqn and latent series for observed lwages
}
else{
Y2_m[n] ~ multi_normal(mu[n], quad_form_diag(Omega, scales)); // linear model for outcome eqn and latent series for missing lwages
}
observed[n] ~ bernoulli(Phi_approx(latent_series[n])); // probit for selection
}
}
'
##########fit model#######
fit <- stan(model_code = mroz_heckman, data=dat, iter = 1000 , warmup = 500 , init="random" , chains = 2 )
######inspect coefficients######
print(fit, pars = c("alpha", "beta"))
shiny <- launch_shinystan(fit)
########compare to frequentist results#######
ols <- lm(lwage ~ educ + exper + expersq, subset=observed==1)
summary(ols)
library(sampleSelection)
heckit.ml <- heckit(selection = observed ~ nwifeinc +educ + exper + expersq + age + kidslt6 + kidsge6, outcome = lwage ~ educ + exper + expersq, method = "ml")
summary(heckit.ml)