Error term correlations harm convergence

I am running a bivariate Heckman selection model inspired by the work of @rtrangucci. The model consists of four equations (two incidence equations for the binary DVs y1 and y3, and two conditional outcome equations for the continuous DVs y2 and y4) with correlated error terms across the four equations.
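
Concretely, stacking the four error terms (matching the Stan code below, where the two probit errors have unit variance for identification and $\sigma_1, \sigma_2$ correspond to sigs[1] and sigs[2]):

$$
(\varepsilon_1, \varepsilon_2, \varepsilon_3, \varepsilon_4)' \sim \mathcal{N}(0, \Sigma),
\qquad \Sigma = D\,\Omega\,D,
\qquad D = \operatorname{diag}(1, 1, \sigma_1, \sigma_2),
$$

where $\varepsilon_1, \varepsilon_2$ are the errors of the two incidence (probit) equations, $\varepsilon_3, \varepsilon_4$ the errors of the two conditional outcome equations, and $\Omega$ is the 4 x 4 error correlation matrix. Each continuous DV enters the likelihood only for observations where the corresponding binary DV equals 1.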

The model shows poor convergence on my real data (see plots below), so I started removing complexity and found that once I fix the 4 x 4 correlation matrix of the error terms (Omega) to the identity matrix, the model converges on my real data. So I think the error term correlations are causing the convergence issue. Conceptually, however, it makes a lot of sense to include the error term correlations.

  • How can I achieve convergence without fixing the correlation matrix to the identity matrix?
    All ideas are highly appreciated.

Note: I tried initializing the model with the MLE estimates obtained via the optimizing command, but this does not help the model converge.

When the model has to estimate the error term correlations, the sampler gets completely stuck:

[traceplot]

When I define the error term correlation matrix as the identity matrix, the sampler converges to a solution and keeps moving:

[traceplot]

Here is the full Stan model code I’m using on my real data; the simulated data files are attached below.

functions {
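  // Bivariate standard normal CDF Phi_2(z1, z2; rho) via Owen's T function:
  //   Phi_2(z1, z2; rho) = 0.5 * (Phi(z1) + Phi(z2) - delta)
  //                          - T(z1, (z2 - rho * z1) / (z1 * sqrt(1 - rho^2)))
  //                          - T(z2, (z1 - rho * z2) / (z2 * sqrt(1 - rho^2))),
  //   where delta = 1 if z1 * z2 < 0, or if z1 * z2 == 0 and z1 + z2 < 0, else 0;
  //   the special case z1 == z2 == 0 uses Phi_2(0, 0; rho) = 1/4 + asin(rho) / (2 * pi).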
  real binormal_cdf(real z1, real z2, real rho) {
    if (z1 != 0 || z2 != 0) {
      real denom = fabs(rho) < 1.0 ? sqrt((1 + rho) * (1 - rho)) : not_a_number();
      real a1 = (z2 / z1 - rho) / denom;
      real a2 = (z1 / z2 - rho) / denom;
      real product = z1 * z2;
      real delta = product < 0 || (product == 0 && (z1 + z2) < 0);
      return 0.5 * (Phi(z1) + Phi(z2) - delta) - owens_t(z1, a1) - owens_t(z2, a2);
    }
    return 0.25 + asin(rho) / (2 * pi());
  }
}

data {
  int<lower = 1> N;     // total number of observations
  int<lower = 1> n_00;  // # observations with neither continuous outcome observed
  int<lower = 1> n_01;  // # observations with only y2 observed
  int<lower = 1> n_10;  // # observations with only y4 observed
  int<lower = 1> n_11;  // # observations with both y2 and y4 observed

  vector[N] y1;  // binary DV (incidence indicator); not used directly below
  vector[N] y3;  // binary DV (incidence indicator); not used directly below

  vector[n_01] y2_01;        // y2 values for the "only y2 observed" cases
  vector[n_10] y4_10;        // y4 values for the "only y4 observed" cases
  matrix[2, n_11] y2_y4_11;  // row 1: y2, row 2: y4, for the "both observed" cases
}

parameters {
  vector<lower=0>[2] sigs;  // residual SDs of the two continuous outcomes
  corr_matrix[4] Omega;     // 4 x 4 error correlation matrix
  vector[4] alpha;          // intercepts of the four equations
}

model {
  // error SDs: fixed to 1 for the two probit equations, free for the two outcomes
  vector[4] full_sigs = [1, 1, sigs[1], sigs[2]]';

  alpha ~ normal(0, 5);
  sigs ~ normal(0, 10);
  Omega ~ lkj_corr(30.0);  // eta = 30 pulls the correlations strongly towards 0

  // full 4 x 4 error covariance matrix: diag(full_sigs) * Omega * diag(full_sigs)
  matrix[4, 4] Sigma_t = quad_form_diag(Omega, full_sigs);
    
  // partial correlation of the two probit errors given y2's error (error 3)
  real inv_cond_sd_13 = inv(sqrt(1 - Omega[1,3]^2));
  real inv_cond_sd_23 = inv(sqrt(1 - Omega[2,3]^2));
  real cond_corr_y2_y2 = (Omega[1,2] - Omega[1,3] * Omega[2,3]) * inv_cond_sd_23 * inv_cond_sd_13;

  // partial correlation of the two probit errors given y4's error (error 4)
  real inv_cond_sd_14 = inv(sqrt(1 - Omega[1,4]^2));
  real inv_cond_sd_24 = inv(sqrt(1 - Omega[2,4]^2));
  real cond_corr_y2_y4 = (Omega[1,2] - Omega[1,4] * Omega[2,4]) * inv_cond_sd_24 * inv_cond_sd_14;

  // conditional covariance of the two probit errors given both outcome errors:
  // Sigma_11 - Sigma_12 * inv(Sigma_22) * Sigma_21, via the Cholesky factor of Sigma_22
  matrix[2,2] Sigma_11 = Omega[1:2, 1:2];
  matrix[2,2] Sigma_21 = Sigma_t[3:4, 1:2];
  matrix[2,2] L_Sigma_22 = cholesky_decompose(Sigma_t[3:4, 3:4]);
  matrix[2,2] inv_L_22_S_21 = mdivide_left_tri_low(L_Sigma_22, Sigma_21);
  matrix[2,2] cond_S = Sigma_11 - inv_L_22_S_21' * inv_L_22_S_21;
  real inv_cond_sd_y1_y2_y4 = inv(sqrt(cond_S[1,1]));
  real inv_cond_sd_y3_y2_y4 = inv(sqrt(cond_S[2,2]));
  real cond_corr_y2_y2_y4 = cond_S[2,1] * inv_cond_sd_y1_y2_y4 * inv_cond_sd_y3_y2_y4;
    
  // pattern 00: neither continuous outcome observed;
  // joint probability that both incidence indicators equal 0
  for (n in 1:n_00) {
    target += log(binormal_cdf(-alpha[3], -alpha[1], Omega[1,2]));
  }
  // pattern 01: only y2 observed;
  // probability of the observed incidence pattern, conditional on the observed y2
  for (n in 1:n_01) {
    target += log(binormal_cdf(-(alpha[3]
                                 + Omega[2,3] / sigs[1] * (y2_01[n] - alpha[2])) * inv_cond_sd_23,
                               (alpha[1]
                                + Omega[1,3] / sigs[1] * (y2_01[n] - alpha[2])) * inv_cond_sd_13,
                               -cond_corr_y2_y2));
  }
  // pattern 10: only y4 observed;
  // probability of the observed incidence pattern, conditional on the observed y4
  for (n in 1:n_10) {
    target += log(binormal_cdf((alpha[3]
                                + Omega[2,4] / sigs[2] * (y4_10[n] - alpha[4])) * inv_cond_sd_24,
                               -(alpha[1]
                                 + Omega[1,4] / sigs[2] * (y4_10[n] - alpha[4])) * inv_cond_sd_14,
                               -cond_corr_y2_y4));
  }
  // pattern 11: both y2 and y4 observed;
  // probability that both incidence indicators equal 1, conditional on (y2, y4),
  // plus the bivariate normal kernel of (y2, y4)
  for (n in 1:n_11) {
    vector[2] y3_y3u_vec = y2_y4_11[, n] - [alpha[2], alpha[4]]';
    vector[2] div_L_y3_y3u = mdivide_left_tri_low(L_Sigma_22, y3_y3u_vec);
    vector[2] cond_y3u = inv_L_22_S_21' * div_L_y3_y3u;
    target += log(binormal_cdf((alpha[3] + cond_y3u[2]) * inv_cond_sd_y3_y2_y4,
                               (alpha[1] + cond_y3u[1]) * inv_cond_sd_y1_y2_y4,
                               cond_corr_y2_y2_y4));
    target += -0.5 * dot_self(div_L_y3_y3u);
  }
  // log-determinant part of the bivariate normal density for the n_11 observations
  target += -n_11 * (log(L_Sigma_22[1,1]) + log(L_Sigma_22[2,2]));
  // marginal normal densities of the singly observed outcomes
  y4_10 ~ normal(alpha[4], sigs[2]);
  y2_01 ~ normal(alpha[2], sigs[1]);
}
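
For reference, the conditional quantities in the model block come from the standard multivariate normal conditioning formula. For example, conditioning the two probit errors on the error of an observed y2 gives

$$
\varepsilon_j \mid (\varepsilon_3 = y_2 - \alpha_2) \;\sim\;
\mathcal{N}\!\left(\frac{\rho_{j3}}{\sigma_1}\,(y_2 - \alpha_2),\; 1 - \rho_{j3}^2\right),
\qquad j = 1, 2,
$$

with $\rho_{j3}$ = Omega[j,3], which is where the Omega[j,3] / sigs[1] * (y2_01[n] - alpha[2]) shifts and the inv_cond_sd_* scalings come from; the pattern-11 terms use the analogous block conditioning formula via the Cholesky factor of Sigma_t[3:4, 3:4].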

For setting Omega to the identity matrix, I moved Omega from the parameters block to the transformed parameters block and defined:

transformed parameters {
  corr_matrix[4] Omega;
  for (j in 1:4) {
    for (k in 1:4) {
      if (j == k) {
        Omega[j, k] = 1;
      } else {
        Omega[j, k] = 0;
      }
    }
  }
}
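
(Equivalently, the whole loop can be replaced by a single declare-define; this is just a more compact way of writing the same identity matrix:)

transformed parameters {
  corr_matrix[4] Omega = diag_matrix(rep_vector(1, 4));  // Omega fixed to the identity
}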

simulated_data_validation_heck_multivar_het.R (6.6 KB)
heck_multivar_het_cor.stan (5.6 KB)