How to vectorize this code?

Hi, I was wondering how to vectorize the code below to speed things up.

for (s in 1:NS) {
  for (i in 1:n[s]) {
    real lps[2];
    lps[1] = bernoulli_lpmf(0 | p[s]) // non-diseased
      + bernoulli_logit_lpmf(res1[s,i] | -1.7 * (a11 + t[i,s] * b01)) // 1 - Sp
      + ordered_logistic_lpmf(res2[s,i] | 1.7 * (a2[s,1] + t[i,s] * b02), c10[1,1:2]);
    lps[2] = bernoulli_lpmf(1 | p[s]) // diseased
      + bernoulli_logit_lpmf(res1[s,i] | -1.7 * (a12 + t[i,s] * b11)) // Se
      + ordered_logistic_lpmf(res2[s,i] | 1.7 * (a2[s,2] + t[i,s] * b12), c11[1,1:2]);
    // marginalize over disease status
    target += log_sum_exp(lps);
  }
}

I tried this, but it completely messes up the results:

for (s in 1:NS) {
  real lps[2];
  lps[1] = bernoulli_lpmf(0 | p[s]) // non-diseased
    + bernoulli_logit_lpmf(res1[s,1:n[s]] | -1.7 * a11 + to_vector(t[1:n[s],s]) * b01) // 1 - Sp
    + ordered_logistic_lpmf(res2[s,1:n[s]] | 1.7 * a2[s,1] + to_vector(t[1:n[s],s]) * b02, c10[1,1:2]);
  lps[2] = bernoulli_lpmf(1 | p[s]) // diseased
    + bernoulli_logit_lpmf(res1[s,1:n[s]] | -1.7 * a12 + to_vector(t[1:n[s],s]) * b11) // Se
    + ordered_logistic_lpmf(res2[s,1:n[s]] | 1.7 * a2[s,2] + to_vector(t[1:n[s],s]) * b12, c11[1,1:2]);
  // marginalize
  target += log_sum_exp(lps);
}

edit: added full model code below

data {
  int<lower=2> nt; // number of tests
  int<lower=0> NS; // total number of studies
  int<lower=1> n[NS]; // number of individuals in each study
  int r[NS,6]; // counts for the six response patterns in each study
}

transformed data {
  int res1[NS, max(n)]; // index as res1[s,i]
  int res2[NS, max(n)];
  for (s in 1:NS) {
    res1[s] = append_array(
      append_array(
        append_array(rep_array(1, r[s,1]), rep_array(1, r[s,2])),
        append_array(rep_array(1, r[s,3]), rep_array(0, r[s,4]))),
      append_array(rep_array(0, r[s,5]), rep_array(0, r[s,6])));
    res2[s] = append_array(
      append_array(
        append_array(rep_array(1, r[s,1]), rep_array(2, r[s,2])),
        append_array(rep_array(3, r[s,3]), rep_array(1, r[s,4]))),
      append_array(rep_array(2, r[s,5]), rep_array(3, r[s,6])));
  }
}

parameters {
  real<lower=0,upper=1> p[NS];
  vector[NS] z1;
  vector[NS] z2;
  vector[NS] t[max(n)]; // index as t[i,s]
  ordered[2] a2_m_raw;
  real b01_raw;
  real b02_raw;
  real b11_raw;
  real b12_raw;
  vector<lower=0>[2] a2_sd_raw;
  real<lower=0> c11_raw;
  real<lower=0> c10_raw;
  vector<upper=0>[1] alpha_raw1;
  vector<upper=0>[1] alpha_raw2;
  ordered[2] a1_raw;
}

transformed parameters {
  real a11;
  real a12;
  ordered[2] a2_m;
  vector<lower=0>[2] a2_sd;
  vector[2] a2[NS];
  real b01;
  real b02;
  real b11;
  real b12;
  ordered[2] c11[1];
  ordered[2] c10[1];

  c11[1,1] = 0;
  c10[1,1] = 0;
  c11[1,2] = c11_raw;
  c10[1,2] = c10_raw;

  b01 = b01_raw * 0.4;
  b02 = b02_raw * 0.4; // b02_raw * 0.5;
  b11 = b11_raw * 0.4;
  b12 = b12_raw * 0.4; // b12_raw * 0.5;

  a11 = a1_raw[1] - 2;
  a12 = a1_raw[2] + 2;
  // a12 = a11 + alpha_raw1[1] + 2;
  a2_m[1] = a2_m_raw[1] - 2;
  a2_m[2] = a2_m_raw[2] + 2;
  // a2_m[2] = a2_m[1] + alpha_raw2[1] + 2;

  a2_sd[1] = a2_sd_raw[1] * 0.6; // 0.6 sd on the probit scale is roughly equivalent to N(0,1)T[0,] on the logit scale
  a2_sd[2] = a2_sd_raw[2] * 0.6;

  // between-study model
  for (s in 1:NS) {
    a2[s,1] = a2_m[1] + a2_sd[1] * z1[s]; // a2[s,1] ~ normal(a2_m[1], a2_sd[1])
    a2[s,2] = a2_m[2] + a2_sd[2] * z2[s]; // a2[s,2] ~ normal(a2_m[2], a2_sd[2])
  }
}

model {
  c11_raw ~ std_normal();
  c10_raw ~ std_normal();
  a1_raw ~ std_normal();
  a2_m_raw ~ std_normal();
  a2_sd_raw ~ std_normal();
  z1 ~ std_normal();
  z2 ~ std_normal();
  b01_raw ~ std_normal();
  b02_raw ~ std_normal();
  b11_raw ~ std_normal();
  b12_raw ~ std_normal();
  alpha_raw1 ~ std_normal();
  alpha_raw2 ~ std_normal();

  // c11[1,1:2] ~ induced_dirichlet(rep_vector(1, 3), 0);
  // c10[1,1:2] ~ induced_dirichlet(rep_vector(1, 3), 0);

  for (s in 1:NS) {
    t[1:max(n), s] ~ std_normal();
  }

  // likelihood (within-study model)
  for (s in 1:NS) {
    for (i in 1:n[s]) {
      real lps[2];
      lps[1] = bernoulli_lpmf(0 | p[s]) // non-diseased
        + bernoulli_logit_lpmf(res1[s,i] | -1.7 * (a11 + t[i,s] * b01)) // 1 - Sp
        + ordered_logistic_lpmf(res2[s,i] | 1.7 * (a2[s,1] + t[i,s] * b02), c10[1,1:2]);
      lps[2] = bernoulli_lpmf(1 | p[s]) // diseased
        + bernoulli_logit_lpmf(res1[s,i] | -1.7 * (a12 + t[i,s] * b11)) // Se
        + ordered_logistic_lpmf(res2[s,i] | 1.7 * (a2[s,2] + t[i,s] * b12), c11[1,1:2]);
      // marginalize over disease status
      target += log_sum_exp(lps);
    }
  }
}
generated quantities {
  vector<lower=0,upper=1>[nt] Se;
  vector<lower=0,upper=1>[nt] Sp;
  real int_d;
  real int_nd;
  // overall test accuracy estimates, for each test
  Se[1] = Phi_approx(a12 / sqrt(1 + b11^2));
  Sp[1] = 1 - Phi_approx(a11 / sqrt(1 + b01^2));
  Se[2] = Phi_approx((a2_m[2] - c11[1,2]) / sqrt(1 + b12^2));
  // prob. of the middle ordinal category, diseased
  int_d = Phi_approx((a2_m[2] - c11[1,1]) / sqrt(1 + b12^2))
          - Phi_approx((a2_m[2] - c11[1,2]) / sqrt(1 + b12^2));
  // prob. of the middle ordinal category, non-diseased
  int_nd = Phi_approx((a2_m[1] - c10[1,1]) / sqrt(1 + b02^2))
           - Phi_approx((a2_m[1] - c10[1,2]) / sqrt(1 + b02^2));
  Sp[2] = 1 - Phi_approx((a2_m[1] - c10[1,1]) / sqrt(1 + b02^2));
}

Everything I have tried changes the posterior; maybe it's not possible to vectorise this.

There are a few things you can do here to simplify the code and avoid recomputation. Some are simple, like replacing the b01, b02, b11, b12 variables with a 2 x 2 matrix. So that’d be:

parameters {
  matrix[2, 2] b_raw;

transformed parameters {
  matrix[2, 2] b = 0.4 * b_raw;

If you’re using a recent enough Stan, this can be coded without transformed parameters as

parameters {
  matrix<multiplier = 0.4>[2, 2] b;

That implicitly defines a b_raw over which sampling will be done and then transforms it to b = 0.4 * b_raw.
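With that declaration the prior is then stated directly on b in the model block. A minimal sketch of the equivalence (using the matrix b declared above):

model {
  // equivalent to b_raw ~ std_normal() under the affine transform b = 0.4 * b_raw
  to_vector(b) ~ normal(0, 0.4);
}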

For c11 and c10, I'd replace this

ordered[2] c11[1];
ordered[2] c10[1];
c11[1,1] = 0; 
c10[1,1] = 0;
c11[1,2] = c11_raw;
c10[1,2] = c10_raw; 

with

ordered[2] c[2] = { [0, c11_raw]', [0, c10_raw]' };

and then use c[1] wherever you previously had c11[1] and c[2] for what used to be c10[1] (other way around would be more natural—I just went with the order in the declaration). Or just keep c1 and c0 as the two ordered[2] components:

ordered[2] c1 = [0, c11_raw]';
ordered[2] c0 = [0, c10_raw]';

Similarly,

ordered[2] a2_m;
...
a2_m[1] = a2_m_raw[1] -2; 
a2_m[2] = a2_m_raw[2] +2;

can be replaced with

ordered[2] a2_m = a2_m_raw + [-2, 2]';

That’s probably a wash on efficiency (creates a container, but then shortens the expression graph; this is no big deal either way).

The other transformed parameters can be handled similarly. The point is to reduce the amount of fiddly indexing.
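For example, a11 and a12 can be collapsed the same way (a sketch reusing the shifts from the original code; a1 is a new name):

vector[2] a1 = a1_raw + [-2, 2]';  // a1[1] replaces a11, a1[2] replaces a12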

Now consider the main loop, which was originally coded in the form

for (s in 1:NS) {
   for (i in 1:n[s]) { 
     real lps[2];
     lps[1] = ...;
     lps[2] = ...;
     target += log_sum_exp(lps);
  }
}

You really do need this to be two loops; we don't have vectorized mixtures yet. That's also why your vectorized attempt changes the posterior: the vectorized lpmf calls sum the log densities over all individuals in a study before the log_sum_exp, so disease status gets marginalized once per study rather than once per individual.

First, I’d recommend removing the intermediate array and renaming loop variables following our conventions so that the code looks like:

for (s in 1:S) {
  for (n in 1:N[s]) {
    real lp1 = ...;
    real lp2 = ...;
    target += log_sum_exp(lp1, lp2);
  }
}

This is the reverse of the earlier recommendations, but is motivated by the same principle of reducing lines of code. Also, it’s a bit more efficient to not create a container only to take it apart right away.

Then I’d recommend computing matrix operations then accessing elements. I’ll just go through lp1 because lp2 will be just the same.

// assumes t is declared as matrix[max(N), S] rather than an array of vectors
matrix[max(N), S] tb01 = -1.7 * b01 * t;  // left associative: (scalar * scalar) * matrix is efficient
matrix[max(N), S] tb02 = 1.7 * b02 * t;

matrix[max(N), S] v1 = -1.7 * a11 + tb01;
...
for (s in 1:S) {
  vector[N[s]] u1 = 1.7 * a2[s, 1] + tb02[1:N[s], s];
  ...
  real lp_bern1 = bernoulli_lpmf(0 | p[s]);
  real lp_bern2 = bernoulli_lpmf(1 | p[s]);
  for (n in 1:N[s]) {
    real lp1 = lp_bern1
             + bernoulli_logit_lpmf(res1[s, n] | v1[n, s])
             + ordered_logistic_lpmf(res2[s, n] | u1[n], c10[1]);
   ...

I probably messed up some indexes, but I hope the point is clear: favor matrix operations and move shared computations out of loops. That’ll give you the biggest boost in efficiency.
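Putting it all together, here's a minimal end-to-end sketch of the likelihood in the original variable names, again assuming t is stored as a matrix[max(n), NS] (v2, u1_base, and u2_base are new names):

// precompute shared matrix operations once, outside the loops
matrix[max(n), NS] v1 = -1.7 * (a11 + b01 * t); // non-diseased binary-test predictor
matrix[max(n), NS] v2 = -1.7 * (a12 + b11 * t); // diseased binary-test predictor
matrix[max(n), NS] u1_base = 1.7 * b02 * t;
matrix[max(n), NS] u2_base = 1.7 * b12 * t;
for (s in 1:NS) {
  real lp_bern1 = bernoulli_lpmf(0 | p[s]); // log1m(p[s]), computed once per study
  real lp_bern2 = bernoulli_lpmf(1 | p[s]); // log(p[s]), computed once per study
  vector[n[s]] u1 = 1.7 * a2[s, 1] + u1_base[1:n[s], s];
  vector[n[s]] u2 = 1.7 * a2[s, 2] + u2_base[1:n[s], s];
  for (i in 1:n[s]) {
    real lp1 = lp_bern1
             + bernoulli_logit_lpmf(res1[s, i] | v1[i, s])
             + ordered_logistic_lpmf(res2[s, i] | u1[i], c10[1]);
    real lp2 = lp_bern2
             + bernoulli_logit_lpmf(res1[s, i] | v2[i, s])
             + ordered_logistic_lpmf(res2[s, i] | u2[i], c11[1]);
    target += log_sum_exp(lp1, lp2);
  }
}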

Also, we have a log_mix function, so that

real lp1 = bernoulli_lpmf(0 | p[s]) + lp1_rest;
real lp2 = bernoulli_lpmf(1 | p[s]) + lp2_rest;
target += log_sum_exp(lp1, lp2);

should be coded as:

target += log_mix(p[s], lp2_rest, lp1_rest);  // p[s] weights the diseased component

This isn’t the most efficient way to do this because it’s continually computing log(p[s]) and log1m(p[s]). If you really need the last ounce of efficiency, calculate it the long way with lp_bern1 and lp_bern2 stored in the s loop as above.


Thanks Bob!

Moving those computations outside of the inner likelihood loop makes it run ~2x faster.
