Help with multi-threading a simple ordinal probit model using reduce_sum

How many (physical) cores do you have? Given that you’re running 4 chains, with 2 threads, you would need an 8-core cpu to see a real benefit. Alternatively, you could try just running 2 chains at a time and seeing how the performance compares.

Also, I experimented a little with your model, and by moving more of the calculation/construction to the partial_sum function, I’ve gotten the runtime down to ~10secs:

functions{

  real partial_sum(
    int[] slice_y,
    int start, int end,
    vector x, vector thresh,
    real alpha, real beta,
    real sigma,int K
  )
  {
    real lp = 0;
    int N = (end - start + 1);
    matrix[N,K] theta_slice;
    vector[N] mu = alpha + beta * x[start:end];
    theta_slice[, 1] = (Phi( (thresh[1] - mu)/sigma));
    theta_slice[, 2] = (Phi( (thresh[2] - mu)/sigma) - Phi( (thresh[1] - mu)/sigma));
    theta_slice[, 3] = (1 - Phi( (thresh[2] - mu)/sigma));
    for(i in 1:N)
      lp += categorical_lpmf(slice_y[i] | theta_slice[i]');
    return lp;
  }
}

data{
  int<lower=1> N;
  vector[N] x;
  int y[N];
  int K;
  vector[K-1] thresh;
  int use_reduce_sum;
}

parameters{
  real alpha;
  real beta;
  real<lower=0> sigma;
}

model{
  alpha ~ normal(0, 5);
  beta ~ std_normal();
  sigma ~ std_normal();


  if(use_reduce_sum == 1){

    target += reduce_sum(partial_sum, y, 1, x, thresh, alpha, beta, sigma, K);
  }
  else{
    vector[N] mu;
    mu = alpha + beta * x;
    // likelihood (non reduce_sum)
    for(n in 1:N){
      vector[K] theta;
      theta[1] = Phi( (thresh[1] - mu[n])/sigma);
      theta[2] = Phi( (thresh[2] - mu[n])/sigma) - Phi( (thresh[1] - mu[n])/sigma);
      theta[3] = 1 - Phi( (thresh[2] - mu[n])/sigma);

      target += categorical_lpmf(y[n] | theta);
      }
  }
}

1 Like