Categorical model divergences/priors

My current model is given below with the same plots I’ve been generating to diagnose problems. I’m clearly stuck…

I’ve been running some tests using optimize(). I started with the simplest model - population parameters only. This converged, so I progressively added hierarchy back into the model. It runs with b_a1, b_a2, b_a3, b_a4, and b_bc being hierarchical parameters. When I add b_b1tt, etc., it gives a line search failure error. I tried only adding hierarchy for one of the b_b#tt and a few other combinations. In theory, the model should run (if specified as I believe it to be). I’m working off a published model with the same structure - i.e., all random/hierarchical parameters and a multiplication by a common hierarchical parameter (in my model b_bc), which captures scale heterogeneity/cost. You cannot separate the cost parameter from the scale and its median should be fixed to 1.0 for identification. Others also use lognormal priors for these parameters.

Current data and R script:
mtc_equity-v13.R (2.3 KB)
time_cost_calculated.CSV (2.4 MB)

functions {
  int[] sequence(int start, int end) { 
    int seq[end - start + 1];
    for (n in 1:num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq; 
  } 

  real partial_log_lik_lpmf(int[] seq, int start, int end,
		     data int ncat, data int nvar, data int ntotvar, data int[] Y, data matrix x1, data matrix x2, data matrix x3, data matrix x4, data int[] J_1, data int[] J_2, 
                     vector b_a2, vector b_a3, vector b_a4, vector b_bc, vector b_b1tt, vector b_b2tt, vector b_b3tt, vector b_b4tt, matrix avail) {
                     
    real ptarget = 0;
    int N = end - start + 1;
    // Define matrices/vector for x, alpha, beta
    matrix[ncat,nvar] x;
    vector[ncat] alpha;
    matrix[nvar,ncat] beta;

    array[N] row_vector[ncat] a = rep_array(rep_row_vector(0,ncat), N);
      
    // initialize linear predictor term
    // Terms are btc, b1tt, b2tt, b3tt, b4tt

    array[N] row_vector[ntotvar] b = rep_array(rep_row_vector(0,ntotvar), N);

    for (n in 1:N) {
      int nn = n + start - 1;
      // add to linear predictor term
      a[n] += [0,b_a2[J_2[nn]], b_a3[J_2[nn]], b_a4[J_2[nn]]];
      // add to linear predictor term
      // Terms are btc, b1tt, b2tt, b3tt, b4tt
     // Median btc parameter fixed to 1.0 for identification
      b[n] += [b_bc[J_1[nn]], b_b1tt[J_1[nn]], b_b2tt[J_1[nn]], b_b3tt[J_1[nn]], b_b4tt[J_1[nn]]];
      // Each x and beta is a matrix with dimensions alts x variables
      // Our x will be the time/cost coming in as inputs
      x = to_matrix([x1[nn],x2[nn],x3[nn],x4[nn]]);

      // Our betas will be the hierarchical slope parameters (2 x 4)
     // Time and cost parameters multiplied by -1 because all have negative effect (positive lognormal transformed to negative lognormal)
      beta = -1*[b[n,1] * b[n,2:5], [b[n,1],0,0,b[n,1]]];
      // Our alphas will be the hierarchical intercept parameters. ln(avail[nn,i]] gives an availability measure
      alpha = [a[n,1]+log(avail[nn,1]), b[n,1] * a[n,2]+log(avail[nn,2]), b[n,1] * a[n,3]+log(avail[nn,3]), b[n,1] * a[n,4]+log(avail[nn,4])]';
     // Y f(alpha+betaX)
      ptarget += categorical_logit_glm_lupmf(Y[nn] | x, alpha, beta);
    }
    return ptarget;
  }
}

data {
  int<lower=1> N;  // total number of observations
  int<lower=2> ncat;  // number of categories
  int<lower=2> nvar;  // number of variables for each alternative
  int<lower=2> ntotvar;  // number of total variables
  int grainsize;  // grainsize for threading
  array[N] int Y;  // response variable
  // covariate matrix for alt1
  matrix[N,nvar] x1; // matrix[N,nvar] x1;
  // covariate matrix for alt2 
  matrix[N,nvar] x2; // matrix[N,nvar] x2;
  // covariate matrix for alt3 
  matrix[N,nvar] x3; // matrix[N,nvar] x3;
  // covariate matrix for alt4
  matrix[N,nvar] x4; // matrix[N,nvar] x4;
  // data for group-level effects of ID 1
  int<lower=1> N_1;  // number of grouping levels
  int<lower=1> M_1;  // number of coefficients per level
  array[N] int<lower=1> J_1;  // grouping indicator per observation
  // group-level predictor values
  int<lower=1> NC_1;  // number of group-level correlations
  // data for group-level effects of ID 2
  int<lower=1> N_2;  // number of grouping levels
  int<lower=1> M_2;  // number of coefficients per level
  array[N] int<lower=1> J_2;  // grouping indicator per observation
  matrix<lower=0,upper=1>[N,ncat] avail; // whether an alternative is available
  int prior_only;  // should the likelihood be ignored?
}
transformed data {
  int seq[N] = sequence(1, N);
}
parameters {
  real mu_b1tt; // population-level effects
  real mu_b2tt; // population-level effects
  real mu_b3tt; // population-level effects
  real mu_b4tt; // population-level effects
  real mu_a2; // population-level effects
  real mu_a3; // population-level effects
  real mu_a4; // population-level effects
  real<lower=0> sd1_bc;  // group-level standard deviations
  real<lower=0> sd1_b1tt;  // group-level standard deviations
  real<lower=0> sd1_b2tt;  // group-level standard deviations
  real<lower=0> sd1_b3tt;  // group-level standard deviations
  real<lower=0> sd1_b4tt;  // group-level standard deviations
  real<lower=0> sd2_a2;  // group-level standard deviations
  real<lower=0> sd2_a3;  // group-level standard deviations
  real<lower=0> sd2_a4;  // group-level standard deviations
  vector<offset=0, multiplier = sd1_bc>[N_1] log_b_bc;
  vector<offset=mu_b1tt, multiplier = sd1_b1tt>[N_1] log_b_b1tt;
  vector<offset=mu_b2tt, multiplier = sd1_b2tt>[N_1] log_b_b2tt;
  vector<offset=mu_b3tt, multiplier = sd1_b3tt>[N_1] log_b_b3tt;
  vector<offset=mu_b4tt, multiplier = sd1_b4tt>[N_1] log_b_b4tt;
  vector<offset=mu_a2, multiplier = sd2_a2>[N_2] b_a2;
  vector<offset=mu_a3, multiplier = sd2_a3>[N_2] b_a3;
  vector<offset=mu_a4, multiplier = sd2_a4>[N_2] b_a4;
}
// Transform normal to lognormal (all time and cost parameters should be single signed)
transformed parameters{
	vector<lower=0>[N_1] b_bc = exp(log_b_bc);
	vector<lower=0>[N_1] b_b1tt = exp(log_b_b1tt);
	vector<lower=0>[N_1] b_b2tt = exp(log_b_b2tt);
	vector<lower=0>[N_1] b_b3tt = exp(log_b_b3tt);
	vector<lower=0>[N_1] b_b4tt = exp(log_b_b4tt);
}
model {
  // likelihood including constants
  if (!prior_only) {
       target += reduce_sum(partial_log_lik_lpmf, seq,
                     grainsize, ncat, nvar, ntotvar, Y, x1, x2, x3, x4, J_1, J_2,
		        b_a2, b_a3, b_a4, b_bc, b_b1tt, b_b2tt, b_b3tt, b_b4tt, avail);

  }
  // priors
  mu_b1tt ~ normal(3,1); // Gives reasonable values of time (~$5-$75)
  mu_b2tt ~ normal(3,1);
  mu_b3tt ~ normal(3,1);
  mu_b4tt ~ normal(2,1);
  mu_a2 ~ normal(3,1);
  mu_a3 ~ normal(3,1);
  mu_a4 ~ normal(3,1);
  sd1_bc ~ std_normal();
  sd1_b1tt ~ normal(2,1);
  sd1_b2tt ~ normal(2,1);
  sd1_b3tt ~ normal(2,1);
  sd1_b4tt ~ normal(2,1);
  sd2_a2 ~ normal(1,5);
  sd2_a3 ~ normal(1,5);
  sd2_a4 ~ normal(1,5);
  b_a2 ~ normal(mu_a2, sd2_a2);
  b_a3 ~ normal(mu_a3, sd2_a3);
  b_a4 ~ normal(mu_a4, sd2_a4);
  log_b_bc ~ normal(0, sd1_bc);
  log_b_b1tt ~ normal(mu_b1tt, sd1_b1tt);
  log_b_b2tt ~ normal(mu_b2tt, sd1_b2tt);
  log_b_b3tt ~ normal(mu_b3tt, sd1_b3tt);
  log_b_b4tt ~ normal(mu_b4tt, sd1_b4tt);
}


mu_b#tt in the range of 4.0 are ok (a bit high) when combined with an sd of about 0-3.