I am trying to run the lee-carter mortality model, using a Core i7-12700 CPU, because this CPU has 12 cores, I tried to rewrite the original model to contain reduce_sum. As follows, the first one is the original model, and the second one is the model with reduce_sum added.

The original model is：

```
data {
int<lower=0> T; // number of years
int<lower=0> A; // number of age categories
int<lower=0> Tf; // number of forecast years
int<lower=0> death[A*T]; // deaths
vector<lower=0>[A*T] expos; // exposures
}
transformed data {
vector[A*T] loge = log(expos); // log exposures
}
parameters {
vector[A] a; // alpha_x
simplex[A] b; // beta_x, strictly positive and sums to 1
vector[T] ks; // vector of k_t(Uncentralized)
vector[T] alpha_err;
real<lower = 0> sigmasq;
real<lower = 0> tausq;
real<lower=0> sigma_a;
}
transformed parameters {
vector[T] k; // vector of k_t(Centralized)
vector[A*T] logm;
vector[T] alpha;
real<lower = 0> sigma = sqrt(sigmasq);
real<lower = 0> tau = sqrt(tausq);
real km;
km = mean(ks);
k = ks - km;
for (j in 1:T) for (i in 1:A) {
logm[A*(j-1)+i] = a[i] + b[i] * k[j];
}
alpha[1] = alpha_err[1];
for(t in 2:T){
alpha[t] = alpha[t-1] + sigma_a * alpha_err[t];
}
}
model {
target += poisson_log_lpmf(death|logm+loge);
target += normal_lpdf(ks|alpha,sigma);
target += normal_lpdf(alpha_err[2:T]|0,1);
target += normal_lpdf(a|0,10);
target += dirichlet_lpdf(b|rep_vector(1, A));
target += inv_gamma_lpdf(sigmasq| 0.001,0.001);
target += inv_gamma_lpdf(tausq| 0.001,0.001);
target += student_t_lpdf(sigma_a|7, 0, 1);
}
generated quantities {
vector[Tf] k_p;
vector[Tf] alpha_p;
vector[A*Tf] mfore; // predicted death rates
vector[A*T] log_lik;
vector[A*T] m = exp(logm);
int pos1 = 1;
alpha_p[1] = alpha[T] + sigma_a * normal_rng(0,1);
for (t in 2:Tf){alpha_p[t] = alpha_p[t-1] + sigma_a * normal_rng(0,1);}
for (t in 1:Tf){k_p[t] = alpha_p[t] - km + sigma * normal_rng(0,1);}
for (t in 1:Tf) for (i in 1:A) {
mfore[pos1] = exp(a[i] + b[i] * k_p[t]);
pos1 += 1;
}
for(i in 1:A*T){log_lik[i] = poisson_log_lpmf(death[i]|logm[i] + loge[i]);}
}
```

the model with reduce_sum added is:

```
functions {
real partial_sum_lpmf(array[] int death,
int start, int end,
vector logm,
vector loge) {
return poisson_log_lupmf(death | logm[start:end] +loge[start:end]);
}
}
data {
int<lower=0> T; // number of years
int<lower=0> A; // number of age categories
int<lower=0> Tf; // number of forecast years
array[A*T] int<lower=0> death; // deaths
vector<lower=0>[A*T] expos; // exposures
}
transformed data {
vector[A*T] loge = log(expos); // log exposures
}
parameters {
vector[A] a; // alpha_x
simplex[A] b; // beta_x, strictly positive and sums to 1
vector[T] ks;
vector[T] alpha_err;
real<lower = 0> sigmasq;
real<lower = 0> tausq;
real<lower=0> sigma_a;
}
transformed parameters {
vector[T] k;
vector[A*T] logm;
vector[T] alpha;
real<lower = 0> sigma = sqrt(sigmasq);
real<lower = 0> tau = sqrt(tausq);
real km;
km = mean(ks);
k = ks - km;
for (j in 1:T) for (i in 1:A) {
logm[A*(j-1)+i] = a[i] + b[i] * k[j];
}
alpha[1] = alpha_err[1];
for(t in 2:T){
alpha[t] = alpha[t-1] + sigma_a * alpha_err[t];
}
}
model {
int grainsize = 1;
target += reduce_sum(partial_sum_lupmf, death, grainsize, logm, loge);
//target += poisson_log_lpmf(death|logm+loge);
target += normal_lpdf(ks|alpha,sigma);
target += normal_lpdf(alpha_err[2:T]|0,1);
target += normal_lpdf(a|0,10);
target += dirichlet_lpdf(b|rep_vector(1, A));
target += inv_gamma_lpdf(sigmasq| 0.001,0.001);
target += inv_gamma_lpdf(tausq| 0.001,0.001);
target += student_t_lpdf(sigma_a|7, 0, 1);
}
generated quantities {
vector[Tf] k_p;
vector[Tf] alpha_p;
vector[A*Tf] mfore; // predicted death rates
vector[A*T] log_lik;
vector[A*T] m = exp(logm);
int pos1 = 1;
alpha_p[1] = alpha[T] + sigma_a * normal_rng(0,1);
for (t in 2:Tf){alpha_p[t] = alpha_p[t-1] + sigma_a * normal_rng(0,1);}
for (t in 1:Tf){k_p[t] = alpha_p[t] - km + sigma * normal_rng(0,1);}
for (t in 1:Tf) for (i in 1:A) {
mfore[pos1] = exp(a[i] + b[i] * k_p[t]);
pos1 += 1;
}
for(i in 1:A*T){log_lik[i] = poisson_log_lpmf(death[i]|logm[i] + loge[i]);}
}
```

When I ran the original model, the CPU speed was around 4.3GHZ, the utilization was around 50% and it took about 700 seconds to simulate two thousand times.

When using the modified model, I set threads_per_chain = 2 or 3, when the CPU speed was around 3.7GHZ, the utilization was close to 100%, but even after spending more than 1400 seconds, the simulation was still not completed 2000 times.

Maybe I have some misunderstanding about the way reduce_sum is set up. Have I coded this incorrectly?