How many (physical) cores do you have? Given that you’re running 4 chains, with 2 threads, you would need an 8-core cpu to see a real benefit. Alternatively, you could try just running 2 chains at a time and seeing how the performance compares.
Also, I experimented a little with your model, and by moving more of the calculation/construction to the partial_sum
function, I’ve gotten the runtime down to ~10secs:
functions{
real partial_sum(
int[] slice_y,
int start, int end,
vector x, vector thresh,
real alpha, real beta,
real sigma,int K
)
{
real lp = 0;
int N = (end - start + 1);
matrix[N,K] theta_slice;
vector[N] mu = alpha + beta * x[start:end];
theta_slice[, 1] = (Phi( (thresh[1] - mu)/sigma));
theta_slice[, 2] = (Phi( (thresh[2] - mu)/sigma) - Phi( (thresh[1] - mu)/sigma));
theta_slice[, 3] = (1 - Phi( (thresh[2] - mu)/sigma));
for(i in 1:N)
lp += categorical_lpmf(slice_y[i] | theta_slice[i]');
return lp;
}
}
data{
int<lower=1> N;
vector[N] x;
int y[N];
int K;
vector[K-1] thresh;
int use_reduce_sum;
}
parameters{
real alpha;
real beta;
real<lower=0> sigma;
}
model{
alpha ~ normal(0, 5);
beta ~ std_normal();
sigma ~ std_normal();
if(use_reduce_sum == 1){
target += reduce_sum(partial_sum, y, 1, x, thresh, alpha, beta, sigma, K);
}
else{
vector[N] mu;
mu = alpha + beta * x;
// likelihood (non reduce_sum)
for(n in 1:N){
vector[K] theta;
theta[1] = Phi( (thresh[1] - mu[n])/sigma);
theta[2] = Phi( (thresh[2] - mu[n])/sigma) - Phi( (thresh[1] - mu[n])/sigma);
theta[3] = 1 - Phi( (thresh[2] - mu[n])/sigma);
target += categorical_lpmf(y[n] | theta);
}
}
}