Bayesian IV model help

Hello, new to Bayesian statistics. I want to do an IV estimation. Roughly, I have two variables, an endogenous variable and an outcome variable. To estimate the direct effect of the endogenous variable on the outcome variable, I want to use an instrument for the endogenous variable. I also have a set of control variables.

In addition, the outcome variable is the mean of some number of observations (sometimes 0, in which case the mean is undefined), so I need to weigh the second stage.

Ideally, I’d add fixed effects by country and year for both stages. However, since there is little within-country variation for the second stage, I’ll have to use random effects by country for the second stage.

Is this model roughly right?

data {
  int<lower=1> number_of_rows;
  int<lower=1> number_of_controls;
  int<lower=1> number_of_countries;
  int<lower=1> number_of_years;

  vector[number_of_rows] endogenous_variable;
  vector[number_of_rows] instrument;
  vector[number_of_rows] outcome_means;
  matrix[number_of_rows, number_of_controls] controls;
  array[number_of_rows] int<lower=0> outcome_observations_per_row;
  array[number_of_rows] int<lower=1, upper=number_of_countries> 
    country_index_of_row;
  array[number_of_rows] int<lower=1, upper=number_of_years> 
    year_index_of_row;
}

parameters {
  real stage_1_intercept;
  real instrument_coefficient;
  vector[number_of_controls] stage_1_control_coefficients;
  real<lower=0> stage_1_residual_standard_deviation;

  sum_to_zero_vector[number_of_years] stage_1_year_effects;
  sum_to_zero_vector[number_of_countries] stage_1_country_effects;

  real stage_2_intercept;
  real endogenous_variable_coefficient;
  vector[number_of_controls] stage_2_control_coefficients;
  real<lower=0> stage_2_residual_standard_deviation;

  sum_to_zero_vector[number_of_years] stage_2_year_effects;

  sum_to_zero_vector[number_of_countries] stage_2_z_country_effects;
  real<lower=0> stage_2_country_effects_standard_deviation;

  cholesky_factor_corr[2] lower_residual_correlation_matrix;
}

transformed parameters {
  vector[number_of_countries] stage_2_country_effects =
    stage_2_z_country_effects * 
      stage_2_country_effects_standard_deviation;
}

model {
  stage_1_intercept ~ std_normal();
  instrument_coefficient  ~ std_normal();
  stage_1_control_coefficients  ~ std_normal();
  stage_1_residual_standard_deviation ~ exponential(1);

  stage_1_year_effects ~ normal(0, 5);
  stage_1_country_effects ~ normal(0, 5);

  stage_2_intercept ~ std_normal();
  endogenous_variable_coefficient  ~ std_normal();
  stage_2_control_coefficients  ~ std_normal();
  stage_2_residual_standard_deviation ~ exponential(1);

  stage_2_year_effects ~ normal(0, 5);

  stage_2_z_country_effects ~ std_normal();
  stage_2_country_effects_standard_deviation ~ exponential(1);

  lower_residual_correlation_matrix ~ lkj_corr_cholesky(2);

  for (row_number in 1:number_of_rows) {
    int outcome_observations = outcome_observations_per_row[row_number];
    int country_index = country_index_of_row[row_number];
    int year_index = year_index_of_row[row_number];
    real endogenous_variable_prediction = 
      stage_1_intercept 
      + controls[row_number, :] * stage_1_control_coefficients
      + instrument_coefficient * instrument[row_number]
      + stage_1_year_effects[year_index]
      + stage_1_country_effects[country_index]
    ;

    if (outcome_observations > 0) {
      [endogenous_variable[row_number], outcome_means[row_number]] ~ 
        multi_normal_cholesky(
          [
            endogenous_variable_prediction,
            stage_2_intercept 
              + controls[row_number, :] * stage_2_control_coefficients
              + endogenous_variable_coefficient * endogenous_variable[row_number]
              + stage_2_year_effects[year_index]
              + stage_2_country_effects[country_index]
          ],
          diag_pre_multiply(
            [
              stage_1_residual_standard_deviation,
              stage_2_residual_standard_deviation / sqrt(outcome_observations)
            ],
            lower_residual_correlation_matrix)
        );
    } else {
      endogenous_variable[row_number] ~ 
        normal(
          endogenous_variable_prediction, 
          stage_1_residual_standard_deviation
        );
    }
  }
}

@James_Savage I saw your previous post on Bayeisan IV here and wondered if you had any advice

The r plm package by default uses a method from this paper to estimate an IV model with random effects: Full Information Estimations of a System of Simultaneous Equations with Error Component Structure | Econometric Theory | Cambridge Core, if that is of any help

I’ve settled on something more like this:

data {
  int<lower=1> number_of_rows;
  int<lower=1> number_of_controls;
  int<lower=1> number_of_countries;
  int<lower=1> number_of_years;
  int<lower=1> number_of_rows_with_prices;
  int<lower=1> number_of_rows_without_prices;

  vector[number_of_rows] quantities;
  vector[number_of_rows] incomes;
  vector[number_of_rows] price_means;
  matrix[number_of_rows, number_of_controls] controls;
  array[number_of_rows] int<lower=1, upper=number_of_countries> 
    country_indices;
  array[number_of_rows] int<lower=1, upper=number_of_years> 
    year_indices;
  array[number_of_rows_with_prices] int<lower=1, upper=number_of_rows> 
    row_numbers_with_prices;
  array[number_of_rows_without_prices] int<lower=1, upper=number_of_rows> 
    row_numbers_without_prices;
  array[number_of_rows_with_prices] int<lower=0> price_observations;
}

transformed data {
  vector[2] two_zeroes = rep_vector(0, 2);
  vector[number_of_rows] all_zeroes = rep_vector(0, number_of_rows);
}

parameters {
  // marginal benefit is column 1
  // marginal cost is column 2

  row_vector[2] intercepts;
  vector[number_of_controls] control_coefficients;
  real income_coefficient;
  row_vector[2] quantity_coefficients;

  row_vector[2] country_effects_standard_deviations;
  sum_to_zero_vector[number_of_years] marginal_benefit_year_effects;
  sum_to_zero_vector[number_of_years] marginal_cost_year_effects;

  cholesky_factor_corr[2] lower_country_effects_correlation_matrix;
  cholesky_factor_corr[2] lower_residual_correlation_matrix;

  vector[number_of_rows_with_prices] price_errors;
  real<lower=0> price_standard_deviation;
  row_vector[2] residual_standard_deviations;

  matrix[number_of_countries, 2] z_country_effects;
}

transformed parameters {
  vector[number_of_rows_with_prices] price_mean_standard_deviations =
    price_standard_deviation / 
      sqrt(price_observations);

  matrix[number_of_rows, 2] marginal_benefit_and_cost_predictions = 
    rep_matrix(intercepts, number_of_rows) +
    append_col(
      all_zeroes,
      controls * control_coefficients
    ) +
    quantities * quantity_coefficients +
    append_col(
      incomes * income_coefficient,
      all_zeroes
    ) +
    append_col(
      marginal_benefit_year_effects[year_indices],
      marginal_cost_year_effects[year_indices]
    ) +
    diag_post_multiply(
      z_country_effects[country_indices, :],
      country_effects_standard_deviations
    );

  vector[number_of_rows_with_prices] true_prices =
    price_means[row_numbers_with_prices] - price_errors;
  
  matrix[number_of_rows_with_prices, 2] marginal_benefit_and_cost_errors = 
    append_col(
      true_prices,
      true_prices
    ) - marginal_benefit_and_cost_predictions[row_numbers_with_prices, :];

  matrix[2,2] lower_residual_matrix =
    diag_pre_multiply(
      residual_standard_deviations,
      lower_residual_correlation_matrix
    );
  
  vector[number_of_rows_without_prices] error_differences =
    marginal_benefit_and_cost_predictions[row_numbers_without_prices, 2] - 
    marginal_benefit_and_cost_predictions[row_numbers_without_prices, 1];

  real marginal_benefit_residual_standard_deviation = 
    residual_standard_deviations[1];
  
  real marginal_cost_residual_standard_deviation = 
    residual_standard_deviations[2];
  
  matrix[2, 2] residual_correlation_matrix =
    multiply_lower_tri_self_transpose(lower_residual_correlation_matrix);
  
  real difference_standard_deviation = 
    sqrt(fmax(
      marginal_benefit_residual_standard_deviation^2 +
        marginal_cost_residual_standard_deviation^2 -
        2 * residual_correlation_matrix[2, 1] *
          marginal_benefit_residual_standard_deviation *
          marginal_cost_residual_standard_deviation,
      1e-9
    ));
}

model {
  intercepts ~ std_normal();
  to_vector(control_coefficients) ~ std_normal();
  income_coefficient ~ std_normal();
  quantity_coefficients ~ std_normal();

  country_effects_standard_deviations ~ exponential(1);
  marginal_benefit_year_effects ~ std_normal();
  marginal_cost_year_effects ~ std_normal();

  lower_country_effects_correlation_matrix ~ lkj_corr_cholesky(2.0);
  lower_residual_correlation_matrix ~ lkj_corr_cholesky(2.0);

  price_standard_deviation ~ exponential(1);

  residual_standard_deviations ~ exponential(1);

  price_errors ~ normal(0, price_mean_standard_deviations);

  for (country_index in 1:number_of_countries) {
    z_country_effects[country_index, :] ~
      multi_normal_cholesky(
        two_zeroes,
        lower_country_effects_correlation_matrix
      );
  }

  for (row_number in 1:number_of_rows_with_prices) {
    marginal_benefit_and_cost_errors[row_number, :] ~ 
      multi_normal_cholesky(
        two_zeroes,
        lower_residual_matrix
      );
  }

  error_differences ~ 
    normal(0, difference_standard_deviation);
}