Difficulty in convergence of the model

Hello!
I am trying to use stan to build mortality models that can be applied to multiple adjacent areas at the same time.
firstly, I have tried the following mortality model and this works well on my computer.


data {
  int<lower=0> T;                  // number of years
  int<lower=0> A;                  // number of age categories
  int<lower=0> Tf;                 // number of forecast years
  array[A*T] int<lower=0> death;        // deaths
  array[A*T] real<lower=0> expos;        // exposures
}
transformed data {
  array[A*T] real loge = log(expos);        // log exposures
  int<lower = 1> J = T + A - 1;
}
parameters {
  vector[A] a;                    // alpha_x
  simplex[A] b1;                   // beta_x, strictly positive and sums to 1
  simplex[A] b2;
  vector[T-2] ks;                 // vector of k_t differences
  vector[J-3] gs;
  real c1;
  real c2;
  real psi;
  real<lower = 0> sigma_k;          // standard deviation of the random walk
  real<lower = 0> sigma_g; 
}
transformed parameters  {
  vector[T] k;   
  vector[J] g;
  array[A*T] real logm;
  
  k[1] = 0;
  k[2] = -sum(ks[1:(T-2)]);
  k[3:T] = ks;
  g[1] = 0;
  g[J] = 0;
  g[2] = -sum(gs[1:(J-3)]);
  g[3:(J-1)] = gs;
  for (j in 1:T) for (i in 1:A) {
    logm[A*(j-1)+i] = loge[A*(j-1)+i] + a[i] + b1[i] * k[j] + b2[i] * g[j-i+A];
  }
}
model {
  target += poisson_log_lpmf(death| logm);
  target += normal_lpdf(ks[1]| c1, sigma_k);
  target += normal_lpdf(ks[2:(T-2)]| c1 + ks[1:(T-3)], sigma_k);
  target += normal_lpdf(gs[1]|c2 + psi * (- c2),sigma_g);
  target += normal_lpdf(gs[2]|c2 + gs[1] + psi * (- gs[1] - c2),sigma_g);
  target += normal_lpdf(gs[3:(J-3)]|c2 + gs[2:(J-4)] + psi * (gs[2:(J-4)] - gs[1:(J-5)] - c2),sigma_g);
  target += normal_lpdf(a|0,10);
  target += normal_lpdf(c1|0,sqrt(10));
  target += normal_lpdf(c2|0,sqrt(10));
  target += normal_lpdf(psi|0,sqrt(10));
  target += dirichlet_lpdf(b1|rep_vector(1, A));
  target += dirichlet_lpdf(b2|rep_vector(1, A));
  target += student_t_lpdf(sigma_k|4, 0, 1);
  target += student_t_lpdf(sigma_g|4, 0, 1);
}
generated quantities {
  vector[Tf] k_p;
  vector[Tf] alpha_k_p;
  vector[Tf] delta_k_p;
  vector[Tf] g_p;
  vector[T+Tf+A-1] g_f;
  vector[A*Tf] mfore; // predicted death rates
  vector[A*T] log_lik;
  array[A*T] real m;
  int pos1 = 1;
  
  k_p[1] = c1 + k[T] + sigma_k * normal_rng(0,1);
  for (t in 2:Tf){
    k_p[t] = c1 + k_p[t-1] + sigma_k * normal_rng(0,1);
  }
  g_p[1] = c2 + g[J] + psi * (g[J] - g[J-1] - c2) + sigma_g * normal_rng(0,1);
  g_p[2] = c2 + g_p[1] + psi * (g_p[1] - g[J] - c2) + sigma_g * normal_rng(0,1);
  for (t in 3:Tf){g_p[t] = c2 + g_p[t-1] + psi * (g_p[t-1] - g_p[t-2] - c2) + sigma_g * normal_rng(0,1);}
  g_f = append_row(g, g_p);
  for (t in 1:Tf) for (i in 1:A) {
    mfore[pos1] = exp(a[i] + b1[i] * k_p[t] + b2[i] * g_f[T+t-i+A]);
    pos1 += 1;
  }
  for(i in 1:A*T){
    m[i] = exp(logm[i] - loge[i]);
    log_lik[i] = poisson_log_lpmf(death[i]|logm[i]);
  }
}

The following images show the model converges even with a short number of iterations(iter_warmup = iter_sampling = 1000).







I then tried to extend the model, and at the beginning I tried to apply the exact same model as before to P different regions.


data {
  int<lower=0> T;                  // number of years
  int<lower=0> A;                  // number of age categories
  int<lower=0> P;                 // number of regions
  int<lower=0> Tf;                 // number of forecast years
  array[A*T*P] int<lower=0> death;        // deaths
  array[A*T*P] real<lower=0> expos;        // exposures
  
}
transformed data {
  array[A*T*P] real loge = log(expos);        // log exposures
  int AT = A*T;
  int ATP = AT*P;
  int<lower=0> J = T+A-1;
}
parameters {
  array[P] vector[A] ap;          
  array[P] simplex[A] bp1;
  array[P] simplex[A] bp2;            
  array[P] vector[T-2] kps; 
  array[P] vector[J-3] gps;
  array[P] real cp1;
  array[P] real cp2;
  array[P] real psip;
  array[P] real<lower = 0> sigma_kp;
  array[P] real<lower = 0>sigma_gp;
  
}
transformed parameters  {
  array[P] vector[T] kp;
  array[P] vector[J] gp;
  array[ATP] real logm;
  
  for(p in 1:P){
    kp[p][1] = 1;
    kp[p][2] = -sum(kps[p][1:(T-2)]);
    kp[p][3:T] = kps[p];
    gp[p][1] = 0;
    gp[p][2] = -sum(gps[p][1:(J-3)]);;
    gp[p][3:(J-1)] = gps[p];
    gp[p][J] = 0;
  }
  for (p in 1:P) for (j in 1:T) for (i in 1:A) {
    logm[AT*(p-1) + A*(j-1) + i] = loge[AT*(p-1) + A*(j-1) + i] +
                                   ap[p][i] + bp1[p][i] * kp[p][j] + bp2[p][i] * gp[p][j-i+A];
  }
}
model {
  for(p in 1:P){
    target += normal_lpdf(kps[p][1]| cp1[p], sigma_kp[p]);
    target += normal_lpdf(kps[p][2:(T-2)]| cp1[p] + kps[p][1:(T-3)], sigma_kp[p]);
    target += normal_lpdf(gps[p][1]|cp2[p] + psip[p] * (-cp2[p]), sigma_gp[p]);
    target += normal_lpdf(gps[p][2]|cp2[p] + gps[p][1] + psip[p] * (gps[p][1] - cp2[p]), sigma_gp[p]);
    target += normal_lpdf(gps[p][3:(J-3)]|cp2[p] + gps[p][2:(J-4)] + psip[p] * (gps[p][2:(J-4)] - gps[p][1:(J-5)] - cp2[p]), sigma_gp[p]);
    target += normal_lpdf(ap[p]|0,10);
    target += dirichlet_lpdf(bp1[p]|rep_vector(1, A));
    target += dirichlet_lpdf(bp2[p]|rep_vector(1, A));
    
    target += normal_lpdf(cp1[p]|0,sqrt(10));
    target += normal_lpdf(cp2[p]|0,sqrt(10));
    target += normal_lpdf(psip[p]|0,sqrt(10));
    target += student_t_lpdf(sigma_kp[p]|4, 0, 1);
    target += student_t_lpdf(sigma_gp[p]|4, 0, 1);
  }
  
  
}
generated quantities {
  array[P] vector[Tf] kp_p;
  array[P] vector[Tf] gp_p;
  array[P] vector[T+Tf+A-1] gp_f;
  vector[A*Tf*P] mfore; // predicted death rates
  vector[ATP] log_lik;
  array[ATP] real m;
  int pos1 = 1;
  
  for(p in 1:P){
    kp_p[p][1] = cp1[p] + kp[p][T] + sigma_kp[p] * normal_rng(0,1);
    for (t in 2:Tf){kp_p[p][t] = cp1[p] + kp_p[p][t-1] + sigma_kp[p] * normal_rng(0,1);}
    gp_p[p][1] = cp2[p] + gp[p][J] + psip[p] * (gp[p][J] - gp[p][J-1] - cp2[p]) + sigma_gp[p] * normal_rng(0,1);
    gp_p[p][2] = cp2[p] + gp_p[p][1] + psip[p] * (gp_p[p][1] - gp[p][J] - cp2[p]) + sigma_gp[p] * normal_rng(0,1);
    for (t in 3:Tf){gp_p[p][t] = cp2[p] + gp_p[p][t-1] + psip[p] * (gp_p[p][t-1] - gp_p[p][t-2] - cp2[p]) + sigma_gp[p] * normal_rng(0,1);}
    gp_f[p] = append_row(gp[p], gp_p[p]);
  }
  for (p in 1:P) for (j in 1:Tf) for (i in 1:A) {
    mfore[pos1] = exp(ap[p][i] + bp1[p][i] * kp_p[p][j] + bp2[p][i] * gp_f[p][T+j-i+A]);
    pos1 += 1;              
  }
  
  for(i in 1:ATP){
    m[i] = exp(logm[i] - loge[i]);
    log_lik[i] = poisson_log_lpmf(death[i]|logm[i]);
  }
}



However, this model is extremely slow and does not converge when the data is very large (170892 pairs of deaths and expos data). So I tried to experiment with fewer data first.
I believe that this model should produce similar results to the former when P = 1 and the data used are the same. But it still fails to converge.







Is there something wrong with my coding or is there some feature of stan that I am not known?
Any help or comments are greatly needed, thanks.