I’m modelling hundreds of time-series (each with thousands of observations) with a Hidden Markov Model, following a strategy described here.

However, this strategy is not efficient enough to process my whole dataset, so I tried to parallelise the processing of each time-series using Stan’s `map_rect`.

Unfortunately, this has led to a dramatic performance drop which defeats this strategy altogether.

For instance, using an unrealistically short sample of data (2 time-series of 59 bins each) and PyStan 3.7.0, the parallelised version is 40 times slower (2000 seconds vs 50 seconds!). CmdStan 2.32 shows similar performance.

Here are my questions:

- Did I make any mistake in my attempt to parallelize the model (see below for the code before/after parallelization)? Am I using `map_rect` incorrectly?
- Is there anything I could do to significantly improve the performance of the parallelized version?
- If not, is there any alternative? (I had imagined that using 48 cores, parallelization could have yielded a x10 speedup that could have made my problem tractable)

Here are the unparallelised and parallelised versions of the code, for comparison:

**Model 1 (unparallelised)**

```
data {
int<lower=1> n_children;  // number of children (not referenced in this program — TODO confirm it is needed)
int<lower=1> n_recs;  // number of recordings (time-series)
int<lower=1> n_bins;  // number of time bins per recording
int<lower=1> bins[n_recs];  // per-recording bin counts (declared but unused here — TODO confirm)
int a[n_recs,n_bins];  // counts per bin; presumably adult vocalizations (paired with mu_adu/epsilon_adu) — verify
int c[n_recs,n_bins];  // counts per bin; presumably child vocalizations (paired with mu_chi/epsilon_chi) — verify
real<lower=0> age[n_recs];  // per-recording age (not referenced in this program)
}
parameters {
matrix<lower=0,upper=1>[2,2] k[n_recs];  // per-recording coupling matrix between lagged c/a counts and current rates
real<lower=1> T_up;  // 1/T_up is the per-bin probability of a 1->2 ("up") transition
real<lower=1> T_down;  // 1/T_down is the per-bin probability of a 2->1 ("down") transition
real<lower=0> mu_chi;  // baseline rate offset for the c ("chi") series when its state is active
real<lower=0> mu_adu;  // baseline rate offset for the a ("adu") series when its state is active
vector<lower=0>[n_recs] epsilon_chi;  // per-recording additive rate for c; keeps the Poisson rate > 0 in the inactive state
vector<lower=0>[n_recs] epsilon_adu;  // per-recording additive rate for a; same role as epsilon_chi
}
transformed parameters {
matrix<lower=0,upper=1>[4,4] A;  // joint 4x4 transition matrix over combined (child, adult) 2-state pairs
vector[4] logalpha[n_recs,n_bins];  // forward log-probabilities, one 4-vector per recording and time bin
// Build A as the elementwise product of two independent 2-state chains.
// Combined state index: 1 + (s_child - 1) + 2*(s_adult - 1); A[i,j] is the
// probability of moving from combined state i (time t-1) to j (time t).
// NOTE(review): staying in state 1 uses 1-1/T_down while leaving state 1 uses
// 1/T_up, so rows only sum to 1 when T_up == T_down — confirm this is intended.
for (s1c in 1:2) {
  for (s2c in 1:2) {
    for (s1a in 1:2) {
      for (s2a in 1:2) {
        int i = 1+(s1c-1)+2*(s1a-1);
        int j = 1+(s2c-1)+2*(s2a-1);
        if (s1c == 1 && s2c == 2) {
          A[i,j] = 1/T_up;
        }
        else if (s1c == 2 && s2c == 1) {
          A[i,j] = 1/T_down;
        }
        else if (s1c == s2c && s1c==1) {
          A[i,j] = 1-1/T_down;
        }
        else if (s1c == s2c && s1c==2) {
          A[i,j] = 1-1/T_up;
        }
        if (s1a == 1 && s2a == 2) {
          A[i,j] *= 1/T_up;
        }
        else if (s1a == 2 && s2a == 1) {
          A[i,j] *= 1/T_down;
        }
        else if (s1a == s2a && s1a==1) {
          A[i,j] *= 1-1/T_down;
        }
        else if (s1a == s2a && s1a==2) {
          A[i,j] *= 1-1/T_up;
        }
      }
    }
  }
}
// Forward algorithm over all recordings.
{
  // PERF: hoist the log of the transition matrix out of the recursion — the
  // original called log(A[s2,s1]) once per (i,t,s1,s2) tuple.
  matrix[4,4] log_A = log(A);
  real accumulator[4];
  for (i in 1:n_recs) {
    logalpha[i, 1] = rep_vector(0,4);
    for (t in 2:n_bins) {
      for (s1 in 1:4) { // s1 = state at time t
        // PERF: the emission terms depend only on s1, so evaluate them once per
        // state instead of once per (s1,s2) pair. This is exact because
        // log_sum_exp(v + const) == log_sum_exp(v) + const.
        real emit = poisson_lpmf(c[i,t] | ((s1-1)%2)*(k[i,1,1]*c[i,t-1] + k[i,2,1]*a[i,t-1] + mu_chi) + epsilon_chi[i])
                  + poisson_lpmf(a[i,t] | floor((s1-1)/2.0)*(k[i,2,2]*a[i,t-1] + k[i,1,2]*c[i,t-1] + mu_adu) + epsilon_adu[i]);
        for (s2 in 1:4) { // s2 = state at time t-1
          accumulator[s2] = logalpha[i, t-1, s2] + log_A[s2,s1];
        }
        logalpha[i, t, s1] = log_sum_exp(accumulator) + emit;
      }
    }
  }
}
}
model {
// Likelihood: marginalize the forward table over the final hidden state.
for (i in 1:n_recs) {
  target += log_sum_exp(logalpha[i,n_bins]);
}
T_up ~ gamma(2, 0.01);
T_down ~ gamma(2, 0.01);
// BUG FIX: the original looped over i in 1:2, j in 1:2, but k[i,j] is row j of
// the 2x2 matrix of *recording* i — so only recordings 1 and 2 received the
// statement. Apply the uniform prior to every recording's coupling matrix.
// (k is bounded to [0,1], so uniform(0,1) adds a constant 0 to the target
// either way; the bounds already imply this prior.)
for (i in 1:n_recs) {
  to_vector(k[i]) ~ uniform(0, 1);
}
mu_chi ~ normal(0,1);
mu_adu ~ normal(0,1);
epsilon_chi ~ normal(0,1);
epsilon_adu ~ normal(0,1);
}
```

**Model 2 (with map_rect)**

```
functions {
// Forward (alpha) recursion for one time-series shard, for use with map_rect.
//   thetas: [1] mu_chi, [2] mu_adu, [3:18] the 4x4 transition matrix A,
//           flattened so element (s1,s2) sits at 2+(s1-1)*4+s2.
//   phis:   [1] epsilon_chi, [2] epsilon_adu, [3:6] k[1,1], k[1,2], k[2,1], k[2,2].
//   x:      per-shard real data (age) — currently unused by this function.
//   y:      interleaved counts: y[(t-1)*2+1] = c[t], y[(t-1)*2+2] = a[t].
// Returns the flattened log-alpha table, 4 entries per time bin.
vector la(vector thetas, vector phis, array[] real x, array[] int y) {
  int n_bins = size(y)%/%2;
  // PERF: take the log of the 16 transition probabilities once, instead of
  // calling log() inside the (t, s1, s2) triple loop as the original did.
  // log_A[(s1-1)*4+s2] == log(thetas[2+(s1-1)*4+s2]).
  vector[16] log_A = log(thetas[3:18]);
  array [4] real accumulator;
  vector[n_bins*4] lgalpha;
  lgalpha[1:4] = rep_vector(0, 4);
  for (t in 2:n_bins) {
    for (s1 in 1:4) { // s1 = state at time t
      // PERF: the two Poisson emission terms depend only on s1, so compute them
      // once per state rather than once per (s1,s2) pair. Exact rewrite:
      // log_sum_exp(v + const) == log_sum_exp(v) + const.
      real emit = poisson_lpmf(y[(t-1)*2+1] | ((s1-1)%2)*(phis[3]*y[(t-2)*2+1] + phis[5]*y[(t-2)*2+2] + thetas[1]) + phis[1])
                + poisson_lpmf(y[(t-1)*2+2] | floor((s1-1)/2.0)*(phis[6]*y[(t-2)*2+2] + phis[4]*y[(t-2)*2+1] + thetas[2]) + phis[2]);
      for (s2 in 1:4) { // s2 = state at time t-1
        accumulator[s2] = lgalpha[(t-2)*4+s2] + log_A[(s1-1)*4+s2];
      }
      lgalpha[(t-1)*4+s1] = log_sum_exp(accumulator) + emit;
    }
  }
  return lgalpha;
}
}
data {
int<lower=1> n_children;  // number of children (not referenced in this program — TODO confirm it is needed)
int<lower=1> n_recs;  // number of recordings (time-series); one map_rect shard each
int<lower=1> n_bins;  // number of time bins per recording
array [n_recs] int <lower=1> bins;  // per-recording bin counts (declared but unused here — TODO confirm)
array [n_recs, n_bins] int a;  // counts per bin; presumably adult vocalizations (paired with mu_adu/epsilon_adu) — verify
array [n_recs, n_bins] int c;  // counts per bin; presumably child vocalizations (paired with mu_chi/epsilon_chi) — verify
array [n_recs] real<lower=0> age;  // per-recording age; forwarded to shards but unused by la()
}
transformed data {
// Pack each recording's two count series into one interleaved integer array
// (c at odd offsets, a at even offsets) so they can be shipped to map_rect
// shards, and wrap each age in the real-data argument map_rect expects.
array [n_recs,n_bins*2] int vocs;
array [n_recs,1] real x;
for (r in 1:n_recs) {
  x[r,1] = age[r];
  for (b in 1:n_bins) {
    int base = (b-1)*2;
    vocs[r,base+1] = c[r,b];
    vocs[r,base+2] = a[r,b];
  }
}
}
parameters {
array [n_recs] matrix<lower=0,upper=1>[2,2] k;  // per-recording coupling matrix between lagged c/a counts and current rates
real<lower=1> T_up;  // 1/T_up is the per-bin probability of a 1->2 ("up") transition
real<lower=1> T_down;  // 1/T_down is the per-bin probability of a 2->1 ("down") transition
real<lower=0> mu_chi;  // baseline rate offset for the c ("chi") series when its state is active
real<lower=0> mu_adu;  // baseline rate offset for the a ("adu") series when its state is active
vector<lower=0>[n_recs] epsilon_chi;  // per-recording additive rate for c; keeps the Poisson rate > 0 in the inactive state
vector<lower=0>[n_recs] epsilon_adu;  // per-recording additive rate for a; same role as epsilon_chi
}
transformed parameters {
// Joint 4x4 transition matrix over combined (child, adult) 2-state pairs.
matrix<lower=0,upper=1>[4,4] A;
// NOTE(review): declaring logalpha as a transformed parameter stores and
// writes n_recs*n_bins*4 values on every draw, which is a substantial output
// cost; if only the final-bin entries are needed (as in the model block),
// consider computing this inside the model block instead — confirm nothing
// downstream needs the full table.
vector[n_recs*n_bins*4] logalpha;
// Build A as the elementwise product of two independent 2-state chains;
// combined state index: 1 + (s_child - 1) + 2*(s_adult - 1).
// NOTE(review): staying in state 1 uses 1-1/T_down while leaving state 1 uses
// 1/T_up, so rows only sum to 1 when T_up == T_down — confirm this is intended.
for (s1c in 1:2) {
for (s2c in 1:2) {
for (s1a in 1:2) {
for (s2a in 1:2) {
int i = 1+(s1c-1)+2*(s1a-1);
int j = 1+(s2c-1)+2*(s2a-1);
if (s1c == 1 && s2c == 2) {
A[i,j] = 1/T_up;
}
else if (s1c == 2 && s2c == 1) {
A[i,j] = 1/T_down;
}
else if (s1c == s2c && s1c==1) {
A[i,j] = 1-1/T_down;
}
else if (s1c == s2c && s1c==2) {
A[i,j] = 1-1/T_up;
}
if (s1a == 1 && s2a == 2) {
A[i,j] *= 1/T_up;
}
else if (s1a == 2 && s2a == 1) {
A[i,j] *= 1/T_down;
}
else if (s1a == s2a && s1a==1) {
A[i,j] *= 1-1/T_down;
}
else if (s1a == s2a && s1a==2) {
A[i,j] *= 1-1/T_up;
}
}
}
}
}
// Shared parameters shipped to every shard:
// thetas[1] = mu_chi, thetas[2] = mu_adu, thetas[3:18] = A flattened so that
// A[s1,s2] sits at index 2+(s1-1)*4+s2 (matching the unpacking in la()).
vector[18] thetas;
thetas[1] = mu_chi;
thetas[2] = mu_adu;
for (s1 in 1:4) {
for (s2 in 1:4) {
thetas[2+(s1-1)*4+s2] = A[s1,s2];
}
}
// Per-shard (per-recording) parameters: [eps_chi, eps_adu, k11, k12, k21, k22].
array[n_recs] vector[6] phis;
for (i in 1:n_recs) {
phis[i] = [epsilon_chi[i],epsilon_adu[i],k[i,1,1],k[i,1,2],k[i,2,1],k[i,2,2]]';
}
// One shard per recording; shard outputs (n_bins*4 values each) are
// concatenated in recording order.
logalpha = map_rect(la, thetas, phis, x, vocs);
}
model {
// Likelihood: marginalize the forward table over the final hidden state of
// each recording. logalpha holds 4 entries per bin with recordings back to
// back, so the last bin of recording i starts at (i*n_bins - 1)*4 + 1.
for (i in 1:n_recs) {
  int j = (i*n_bins-1)*4;
  target += log_sum_exp(logalpha[j+1:j+4]);
}
T_up ~ gamma(2, 0.01);
T_down ~ gamma(2, 0.01);
// BUG FIX: the original looped over i in 1:2, j in 1:2, but k[i,j] is row j of
// the 2x2 matrix of *recording* i — so only recordings 1 and 2 received the
// statement. Apply the uniform prior to every recording's coupling matrix.
// (k is bounded to [0,1], so uniform(0,1) adds a constant 0 to the target
// either way; the bounds already imply this prior.)
for (i in 1:n_recs) {
  to_vector(k[i]) ~ uniform(0, 1);
}
mu_chi ~ normal(0,1);
mu_adu ~ normal(0,1);
epsilon_chi ~ normal(0,1);
epsilon_adu ~ normal(0,1);
}
```

Thank you very much for your help!

Lucas