Hello, new to Bayesian statistics. I want to do an IV estimation. Roughly, I have two variables, an endogenous variable and an outcome variable. To estimate the direct effect of the endogenous variable on the outcome variable, I want to use an instrument for the endogenous variable. I also have a set of control variables.
In addition, the outcome variable is the mean of some number of observations (sometimes 0, in which case the mean is undefined), so I need to weigh the second stage.
Ideally, I’d add fixed effects by country and year for both stages. However, since there is little within-country variation for the second stage, I’ll have to use random effects by country for the second stage.
Is this model roughly right?
data {
int<lower=1> number_of_rows;
int<lower=1> number_of_controls;
int<lower=1> number_of_countries;
int<lower=1> number_of_years;
vector[number_of_rows] endogenous_variable;
vector[number_of_rows] instrument;
vector[number_of_rows] outcome_means;
matrix[number_of_rows, number_of_controls] controls;
array[number_of_rows] int<lower=0> outcome_observations_per_row;
array[number_of_rows] int<lower=1, upper=number_of_countries>
country_index_of_row;
array[number_of_rows] int<lower=1, upper=number_of_years>
year_index_of_row;
}
parameters {
real stage_1_intercept;
real instrument_coefficient;
vector[number_of_controls] stage_1_control_coefficients;
real<lower=0> stage_1_residual_standard_deviation;
sum_to_zero_vector[number_of_years] stage_1_year_effects;
sum_to_zero_vector[number_of_countries] stage_1_country_effects;
real stage_2_intercept;
real endogenous_variable_coefficient;
vector[number_of_controls] stage_2_control_coefficients;
real<lower=0> stage_2_residual_standard_deviation;
sum_to_zero_vector[number_of_years] stage_2_year_effects;
sum_to_zero_vector[number_of_countries] stage_2_z_country_effects;
real<lower=0> stage_2_country_effects_standard_deviation;
cholesky_factor_corr[2] lower_residual_correlation_matrix;
}
transformed parameters {
vector[number_of_countries] stage_2_country_effects =
stage_2_z_country_effects *
stage_2_country_effects_standard_deviation;
}
model {
stage_1_intercept ~ std_normal();
instrument_coefficient ~ std_normal();
stage_1_control_coefficients ~ std_normal();
stage_1_residual_standard_deviation ~ exponential(1);
stage_1_year_effects ~ normal(0, 5);
stage_1_country_effects ~ normal(0, 5);
stage_2_intercept ~ std_normal();
endogenous_variable_coefficient ~ std_normal();
stage_2_control_coefficients ~ std_normal();
stage_2_residual_standard_deviation ~ exponential(1);
stage_2_year_effects ~ normal(0, 5);
stage_2_z_country_effects ~ std_normal();
stage_2_country_effects_standard_deviation ~ exponential(1);
lower_residual_correlation_matrix ~ lkj_corr_cholesky(2);
for (row_number in 1:number_of_rows) {
int outcome_observations = outcome_observations_per_row[row_number];
int country_index = country_index_of_row[row_number];
int year_index = year_index_of_row[row_number];
real endogenous_variable_prediction =
stage_1_intercept
+ controls[row_number, :] * stage_1_control_coefficients
+ instrument_coefficient * instrument[row_number]
+ stage_1_year_effects[year_index]
+ stage_1_country_effects[country_index]
;
if (outcome_observations > 0) {
[endogenous_variable[row_number], outcome_means[row_number]] ~
multi_normal_cholesky(
[
endogenous_variable_prediction,
stage_2_intercept
+ controls[row_number, :] * stage_2_control_coefficients
+ endogenous_variable_coefficient * endogenous_variable[row_number]
+ stage_2_year_effects[year_index]
+ stage_2_country_effects[country_index]
],
diag_pre_multiply(
[
stage_1_residual_standard_deviation,
stage_2_residual_standard_deviation / sqrt(outcome_observations)
],
lower_residual_correlation_matrix)
);
} else {
endogenous_variable[row_number] ~
normal(
endogenous_variable_prediction,
stage_1_residual_standard_deviation
);
}
}
}