Is there a way to accelerate this BetaBinomial and DirichletMultinomial model?

I created the following .stan file called model_tree. In essense it fits a BetaBinomial distribution when there are two categories, and a DirichletMultinomial when there are four categories. All the data that I provide are integers, since are counts, that is why I use array. They are of size 100 rows and 7 columns, i.e. 100 individuals, 7 years, which in my mind is farely small.

  int<lower=1> N;               // number of individuals (100)
  int<lower=1> TT;               // number of time points (7)
  array[N,TT] int<lower=0> n_1;       
  array[N,TT] int<lower=0> n_11;       
  array[N,TT] int<lower=0> n_111;       
  array[N,TT] int<lower=0> n_1111;       
  array[N,TT] int<lower=0> nn;     
  array[TT] int<lower=0> n;            
  array[N,TT] int<lower=0> n_med;
  array[N,TT] int<lower=0> n_ther;
  array[N,TT] int<lower=0> n_both;
  array[N,TT] int<lower=0> n_notr;
  array[N,TT] int<lower=0> nn_med;
  array[N,TT] int<lower=0> nn_ther;
  array[N,TT] int<lower=0> nn_both;
  array[N,TT] int<lower=0> nn_notr;
}

parameters {
 real<lower=0> a1;
 real<lower=0> b1;
 
  real gamma1;         
  real phi1;           
  real<lower=0> nu1;   


real<lower=0> a11;
 real<lower=0> b11;

 real gamma11;         
 real phi11;          
 real<lower=0> nu11;   
 
 real<lower=0> a111;
 real<lower=0> b111;

 real gamma111;        
 real phi111;           
 real<lower=0> nu111;   
 
 real<lower=0> a1111;
 real<lower=0> b1111;

 real gamma1111;         
 real phi1111;           
 real<lower=0> nu1111;   
 
 real<lower=0> c1;
 real<lower=0> c2;
 real<lower=0> c3;
 real<lower=0> c4;
 real<lower=0> nuc;   

 real gc1;        
 real phic1;           
 
 real gc2;         
 real phic2;           
 
 real gc3;        
 real phic3;          
 
 real gc4;        
 real phic4;           
 


 real<lower=0> d1;
 real<lower=0> d2;
 real<lower=0> d3;
 real<lower=0> d4;
 real<lower=0> nud;   

 real gd1;        
 real phid1;           
 
 real gd2;         
 real phid2;          
 
 real gd3;        
 real phid3;         
 
 real gd4;         
 real phid4;          
 
}

transformed parameters {
  vector<lower=0>[TT] alpha1;
  vector<lower=0>[TT] beta1;
  vector<lower=0,upper=1>[TT] mu1;
  alpha1[1] = a1;
  beta1[1] = b1;
  mu1[1] = a1/(a1+b1);
 vector<lower=0>[TT] alpha11;
 vector<lower=0>[TT] beta11;
 vector<lower=0,upper=1>[TT] mu11;

 alpha11[1] = a11;
 beta11[1] = b11;
 mu11[1] = a11/(a11+b11);
 vector<lower=0>[TT] alpha111;
 vector<lower=0>[TT] beta111;
 vector<lower=0,upper=1>[TT] mu111;

 alpha111[1] = a111;
 beta111[1] = b111;
 mu111[1] = a111/(a111+b111);

  vector<lower=0>[TT] alpha1111;
 vector<lower=0>[TT] beta1111;
 vector<lower=0,upper=1>[TT] mu1111;

 // Level 1
 alpha1111[1] = a1111;
 beta1111[1] = b1111;
 mu1111[1] = a1111/(a1111+b1111);


  vector<lower=0>[TT] ceta1;
  vector<lower=0>[TT] ceta2;
  vector<lower=0>[TT] ceta3;
  vector<lower=0>[TT] ceta4;
  vector<lower=0,upper=1>[TT] muceta1;
  vector<lower=0,upper=1>[TT] muceta2;
  vector<lower=0,upper=1>[TT] muceta3;
  vector<lower=0,upper=1>[TT] muceta4;

  ceta1[1] = c1;
  ceta2[1] = c2;
  ceta3[1] = c3;
  ceta4[1] = c4;
  muceta1[1] = c1/(c1+c2+c3+c4);
  muceta2[1] = c2/(c1+c2+c3+c4);
  muceta3[1] = c3/(c1+c2+c3+c4);
  muceta4[1] = c4/(c1+c2+c3+c4);

  vector<lower=0>[TT] deta1;
  vector<lower=0>[TT] deta2;
  vector<lower=0>[TT] deta3;
  vector<lower=0>[TT] deta4;
  vector<lower=0,upper=1>[TT] mudeta1;
  vector<lower=0,upper=1>[TT] mudeta2;
  vector<lower=0,upper=1>[TT] mudeta3;
  vector<lower=0,upper=1>[TT] mudeta4;

  deta1[1] = d1;
  deta2[1] = d2;
  deta3[1] = d3;
  deta4[1] = d4;
  mudeta1[1] = d1/(d1+d2+d3+d4);
  mudeta2[1] = d2/(d1+d2+d3+d4);
  mudeta3[1] = d3/(d1+d2+d3+d4);
  mudeta4[1] = d4/(d1+d2+d3+d4);
  for (i in 2:TT) {
       mu1[i] = inv_logit(gamma1 + phi1 * logit(alpha1[i-1] / (alpha1[i-1] + beta1[i-1])) );
    alpha1[i]  = mu1[i] * nu1;
    beta1[i]  = (1 - mu1[i]) * nu1;
     mu11[i] = inv_logit(gamma11 + phi11 * logit(alpha11[i-1] / (alpha11[i-1] + beta11[i-1])));
   alpha11[i]  = mu11[i] * nu11;
   beta11[i]  = (1 - mu11[i]) * nu11;
      mu111[i] = inv_logit(gamma111 + phi111 * logit(alpha111[i-1] / (alpha111[i-1] + beta111[i-1])));
   alpha111[i]  = mu111[i] * nu111;
   beta111[i]  = (1 - mu111[i]) * nu111;
   mu1111[i] = inv_logit(gamma1111 + phi1111 * logit(alpha1111[i-1] / (alpha1111[i-1] + beta1111[i-1])));
   alpha1111[i]  = mu1111[i] * nu1111;
   beta1111[i]  = (1 - mu1111[i]) * nu1111;
    
        muceta1[i] = inv_logit(gc1 + phic1 * logit(ceta1[i-1] / (ceta1[i-1] + ceta2[i-1] + ceta3[i-1] + ceta4[i-1])));
    muceta2[i] = inv_logit(gc2 + phic2 * logit(ceta2[i-1] / (ceta1[i-1] + ceta2[i-1] + ceta3[i-1] + ceta4[i-1])));
    muceta3[i] = inv_logit(gc3 + phic3 * logit(ceta3[i-1] / (ceta1[i-1] + ceta2[i-1] + ceta3[i-1] + ceta4[i-1])));
    muceta4[i] = inv_logit(gc4 + phic4 * logit(ceta4[i-1] / (ceta1[i-1] + ceta2[i-1] + ceta3[i-1] + ceta4[i-1])));

    ceta1[i]  = muceta1[i] * nuc;
    ceta2[i]  = muceta2[i] * nuc;
    ceta3[i]  = muceta3[i] * nuc;
    ceta4[i]  = muceta4[i] * nuc;
    mudeta1[i] = inv_logit(gd1 + phid1 * logit(deta1[i-1] / (deta1[i-1] + deta2[i-1] + deta3[i-1] + deta4[i-1])));
    mudeta2[i] = inv_logit(gd2 + phid2 * logit(deta2[i-1] / (deta1[i-1] + deta2[i-1] + deta3[i-1] + deta4[i-1])));
    mudeta3[i] = inv_logit(gd3 + phid3 * logit(deta3[i-1] / (deta1[i-1] + deta2[i-1] + deta3[i-1] + deta4[i-1])));
    mudeta4[i] = inv_logit(gd4 + phid4 * logit(deta4[i-1] / (deta1[i-1] + deta2[i-1] + deta3[i-1] + deta4[i-1])));

    deta1[i]  = mudeta1[i] * nud;
    deta2[i]  = mudeta2[i] * nud;
    deta3[i]  = mudeta3[i] * nud;
    deta4[i]  = mudeta4[i] * nud;

  }
  
}

model {

  // ---- PRIORS ----
  a1 ~  cauchy(0, 2);
  b1 ~  cauchy(0, 2);
  a11 ~  cauchy(0, 2);
  b11 ~  cauchy(0, 2);
  a111 ~  cauchy(0, 2);
  b111 ~  cauchy(0, 2);
  a1111 ~  cauchy(0, 2);
  b1111 ~  cauchy(0, 2);
  
  gamma1 ~ normal(0, 5);
  phi1   ~ normal(0, 5);
  nu1    ~ cauchy(0, 2);
  
  gamma11 ~ normal(0, 5);
  phi11   ~ normal(0, 5);
  nu11   ~ cauchy(0, 2);

  gamma111 ~ normal(0, 5);
  phi111   ~ normal(0, 5);
  nu111   ~ cauchy(0, 2);

  gamma1111 ~ normal(0, 5);
  phi1111   ~ normal(0, 5);
  nu1111  ~ cauchy(0, 2);
  
  
  gc1 ~ normal(0, 5);
  gc2 ~ normal(0, 5);
  gc3 ~ normal(0, 5);
  gc4 ~ normal(0, 5);

  phic1   ~ normal(0, 5);
  phic2   ~ normal(0, 5);
  phic3   ~ normal(0, 5);
  phic4   ~ normal(0, 5);

  nuc  ~ cauchy(0, 2);


  gd1 ~ normal(0, 5);
  gd2 ~ normal(0, 5);
  gd3 ~ normal(0, 5);
  gd4 ~ normal(0, 5);

  phid1   ~ normal(0, 5);
  phid2   ~ normal(0, 5);
  phid3   ~ normal(0, 5);
  phid4   ~ normal(0, 5);

  nud  ~ cauchy(0, 2);
  
  array[4] int iter_n;
  vector[4] iter_c;
  array[4] int iter_nd;
  vector[4] iter_d;
  
  for (t in 1:TT) {
 target += beta_binomial_lpmf(n_1[,t]    | n[t], alpha1[t], beta1[t]);
 target += beta_binomial_lpmf(n_11[,t]    | n_1[,t], alpha11[t], beta11[t]);
 target += beta_binomial_lpmf(n_111[,t]    | n_11[,t], alpha111[t], beta111[t]);
 target += beta_binomial_lpmf(n_1111[,t]    | nn[,t], alpha1111[t], beta1111[t]);
  iter_c[1] = ceta1[t];
    iter_c[2] = ceta2[t];
    iter_c[3] = ceta3[t];
    iter_c[4] = ceta4[t];
    iter_d[1] = deta1[t];
    iter_d[2] = deta2[t];
    iter_d[3] = deta3[t];
    iter_d[4] = deta4[t];
  for (i in 1:N) {
    iter_n[1] = n_med[i,t];
    iter_n[2] = n_ther[i,t];
    iter_n[3] = n_both[i,t];
    iter_n[4] = n_notr[i,t];
   
    iter_nd[1] = nn_med[i,t];
    iter_nd[2] = nn_ther[i,t];
    iter_nd[3] = nn_both[i,t];
    iter_nd[4] = nn_notr[i,t];
    target += lgamma(sum(iter_c))
        - lgamma(sum(iter_c) + sum(iter_n));
    target += lgamma(sum(iter_d))
        - lgamma(sum(iter_d) + sum(iter_nd));
for (k in 1:4) {
  target += lgamma(iter_n[k] + iter_c[k])
          - lgamma(iter_c[k])
          - lgamma(iter_n[k] + 1);
  target += lgamma(iter_nd[k] + iter_d[k])
          - lgamma(iter_d[k])
          - lgamma(iter_nd[k] + 1);
}
  }
}
}

The I fit the model for only 400 iterations in total. Which again look like a really small HMC algorithm, that should be quick.

fit <- stan(
  file = "model_tree. stan",
  data = stan_data,
  chains = 1,
  iter = 400,
  warmup = 200,
  control = list(
    adapt_delta = 0.85,
    max_treedepth = 12
  )
)

However, when I fit the model, it takes ages to run more than an hour. What makes the algorithm that I’ve written so slow. I tried to reduce the number of for loops used, tried to vectorise a lot of assignments. But I really cannot see what could make the algorithm so slow, because I think obviously, is not the size of individuals 100 and time points 7. I’d share the data, but they are not mind.

Can you share a summary of the resulting posterior? Have the chains converges? What do the HMC diagnostics look like?

Without knowing anything about the data (apart from the format), it is hard to develop an intuition of what’s going on.

A potential problem could be all the cauchy priors. If any of those parameters are weakly or not identified via the data, MCMC may have a very hard time. Maximum treedepth being hit all the time could be a potential symptom here - but you would have to check the diagnostics. This is just a wild guess. More info about the model and the data would be helpful.

Given the complexity of your model relative to TT=7, it might be worth following Betancourt-style Bayesian workflow: simulate from the prior and prior predictive, then from the posterior predictive, and check whether the model can (a) generate data resembling what you observe and (b) recover known parameters in a synthetic setting. That often reveals miss/over-parameterization or weak identifiability very quickly - those problems can lead to difficult sampling.