Speeding Up function execution during sampling

Hello,
I’m having a bit of trouble speeding up my code.
For my model I need the peak output of a 5th order leaky integrator to a series of square pulses.
To get this I’m computing some step responses in the function ‘LPF_SinglePulse’. The expression looks long, since its the analytic solution. I initially used FFT in StanCMD, but computation was slow.
In a separate function named ‘Leaky_Output’ I obtain the maximum absolute value, which is then used to model each response (‘S_par’ in model section).
This parameters is important in my model since it determines an integration time constant for different stimuli.

The problem is that, even though I avoided convolution and switched to multiplication, running this takes a lot of time for a single subject.
Sampling from the priors is fast, but as soon as I include the calculation of ‘S_par’, the whole code runs very slow.
It is worth mentioning that each pulse (‘y1’,‘y2’, … in ‘LPF_SinglePulse’) has 2000 samples (this is the parameter ‘nsamples’).
The number of responses (‘y’) per subject is 440 (this is ‘Ntotal’=440)
I’m wondering if there’s a way to speed up these type of calculations.

 functions{
  
  vector LPF_SinglePulse(vector Step1,vector Step2,vector Step3,vector Step4,vector Step5,vector Step6,vector Step7,vector Step8, real t1, real t2, real t3, real t4, real t5, real t6, real t7, real t8, real tau, vector t, int nsamples) {
      // This Function compues the response of a 5th order Leaky integrator to a pulse
      
      vector [nsamples] y1;
      vector [nsamples] y2;
      vector [nsamples] y3;
      vector [nsamples] y4; 
      vector [nsamples] y5;
      vector [nsamples] y6;
      vector [nsamples] y7;
      vector [nsamples] y8;      
      
// Initial implementation using a for loop in RStan, switched to StanCMD
      //for(k in 1:nsamples){
        //y1[k] =   Step1[k]*( (1.0) - (exp(-(t[k]-t1)/tau)*(1/(tgamma(1))*(pow((t[k]-t1)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t1)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t1)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t1)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t1)/tau,4)))));
        //y2[k] =   Step2[k]*( (1.0) - (exp(-(t[k]-t2)/tau)*(1/(tgamma(1))*(pow((t[k]-t2)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t2)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t2)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t2)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t2)/tau,4)))));
      //  }
      
      y1 = Step1.*( (1.0) - (exp(-(t-t1)/tau).*(1/(tgamma(1)).*(pow((t-t1)/tau,0)) + 1/(tgamma(2)).*(pow((t-t1)/tau,1)) + 1/(tgamma(3)).*(pow((t-t1)/tau,2)) + 1/(tgamma(4)).*(pow((t-t1)/tau,3)) + 1/(tgamma(5)).*(pow((t-t1)/tau,4)))));
      y2 = Step2.*( (1.0) - (exp(-(t-t2)/tau).*(1/(tgamma(1)).*(pow((t-t2)/tau,0)) + 1/(tgamma(2)).*(pow((t-t2)/tau,1)) + 1/(tgamma(3)).*(pow((t-t2)/tau,2)) + 1/(tgamma(4)).*(pow((t-t2)/tau,3)) + 1/(tgamma(5)).*(pow((t-t2)/tau,4)))));
      y3 = Step3.*( (1.0) - (exp(-(t-t3)/tau).*(1/(tgamma(1)).*(pow((t-t3)/tau,0)) + 1/(tgamma(2)).*(pow((t-t3)/tau,1)) + 1/(tgamma(3)).*(pow((t-t3)/tau,2)) + 1/(tgamma(4)).*(pow((t-t3)/tau,3)) + 1/(tgamma(5)).*(pow((t-t3)/tau,4)))));
      y4 = Step4.*( (1.0) - (exp(-(t-t4)/tau).*(1/(tgamma(1)).*(pow((t-t4)/tau,0)) + 1/(tgamma(2)).*(pow((t-t4)/tau,1)) + 1/(tgamma(3)).*(pow((t-t4)/tau,2)) + 1/(tgamma(4)).*(pow((t-t4)/tau,3)) + 1/(tgamma(5)).*(pow((t-t4)/tau,4)))));
      y5 = Step5.*( (1.0) - (exp(-(t-t5)/tau).*(1/(tgamma(1)).*(pow((t-t5)/tau,0)) + 1/(tgamma(2)).*(pow((t-t5)/tau,1)) + 1/(tgamma(3)).*(pow((t-t5)/tau,2)) + 1/(tgamma(4)).*(pow((t-t5)/tau,3)) + 1/(tgamma(5)).*(pow((t-t5)/tau,4)))));
      y6 = Step6.*( (1.0) - (exp(-(t-t6)/tau).*(1/(tgamma(1)).*(pow((t-t6)/tau,0)) + 1/(tgamma(2)).*(pow((t-t6)/tau,1)) + 1/(tgamma(3)).*(pow((t-t6)/tau,2)) + 1/(tgamma(4)).*(pow((t-t6)/tau,3)) + 1/(tgamma(5)).*(pow((t-t6)/tau,4)))));
      y7 = Step7.*( (1.0) - (exp(-(t-t7)/tau).*(1/(tgamma(1)).*(pow((t-t7)/tau,0)) + 1/(tgamma(2)).*(pow((t-t7)/tau,1)) + 1/(tgamma(3)).*(pow((t-t7)/tau,2)) + 1/(tgamma(4)).*(pow((t-t7)/tau,3)) + 1/(tgamma(5)).*(pow((t-t7)/tau,4)))));
      y8 = Step8.*( (1.0) - (exp(-(t-t8)/tau).*(1/(tgamma(1)).*(pow((t-t8)/tau,0)) + 1/(tgamma(2)).*(pow((t-t8)/tau,1)) + 1/(tgamma(3)).*(pow((t-t8)/tau,2)) + 1/(tgamma(4)).*(pow((t-t8)/tau,3)) + 1/(tgamma(5)).*(pow((t-t8)/tau,4)))));
      
      return( (y1-y2) + (y3-y4) + (y5-y6) + (y7-y8));
    
      }
  
  
  real Leaky_Output(vector Step1,vector Step2, vector Step3,vector Step4, vector Step5,vector Step6, vector Step7,vector Step8, real t1, real t2, real t3, real t4, real t5, real t6, real t7, real t8, real tauA, real tauC, real polarity, vector t, int nsamples) {
      // This Function computes the peak output of a 5th order Leaky integrator to a pulse
      
      real S;
      real tau;
      vector[nsamples] P;
      
      // Determine time constant to be used (-1 or 1 is the input to polarity)
      tau = tauA * ((1+polarity)/2) + tauC * ((polarity-1)/(-2));
      
      // Compute Step response per pulse
      P = LPF_SinglePulse( Step1, Step2, Step3 , Step4 ,Step5, Step6, Step7 , Step8 ,t1, t2, t3, t4, t5, t6, t7, t8, tau, t, nsamples);
      
      // Get the maximum value
      S = max(abs(P));
    
      return(S);      
    }
    
  } // e.o. functions 
  
  
  data {
    int<lower=1> Nsubj ; 	          // subjects
    int<lower=1> Ntotal ; 		  // Number of measurements
    int<lower=1> nsamples ; 	  // Number of samples in time vector
    real<lower=0> y[Ntotal] ;    // Response vector
    int<lower=1> subj[Ntotal] ;  // subject vector
    real polarity[Ntotal] ; 		      // polarity vector (either 1 or -1)
    real Amp[Ntotal] ; 		    // Stimulus amplitude vector
    int<lower=1> nipi[Ntotal];   // sample inter-pulse interval, per stimulus
    vector[nsamples] t ;              // time vector
    matrix[Ntotal,nsamples] p1;       // Matrix with stimulus data for pulse 1
    matrix[Ntotal,nsamples] p2;       // Matrix with stimulus data for pulse 2
    matrix[Ntotal,nsamples] p3;       // Matrix with stimulus data for pulse 3
    matrix[Ntotal,nsamples] p4;       // Matrix with stimulus data for pulse 4
    matrix[Ntotal,nsamples] p5;       // Matrix with stimulus data for pulse 5
    matrix[Ntotal,nsamples] p6;       // Matrix with stimulus data for pulse 6
    matrix[Ntotal,nsamples] p7;       // Matrix with stimulus data for pulse 7
    matrix[Ntotal,nsamples] p8;       // Matrix with stimulus data for pulse 8
    matrix[Ntotal,8] t0;              // Matrix with time onsets for each pulse
    
  } // e.o. data
  
  
  parameters {
  
    // Subject dependent parameters
    real<lower=1>  tauA[Nsubj]; 	    // Time constant for polarity A
    real<lower=1>  tauC[Nsubj]; 	    // Time constant for polarity C
    real<lower=0.001>  A[Nsubj]; 		  // Parameter A (ceiling for y)
    real<lower=0.001>  theta[Nsubj]; 	// Parameter theta (half-width promptness)
    real<lower=0.001>  omega[Nsubj];	// Parameter omega (width)
    real<lower=0.001>  sigma[Nsubj]; 	// Parameter sigma (std of promptness)
    

  } // e.o. parameters
  
  model{  
    
    // loop for all stimuli presented (and responses)
    for ( i in 1:Ntotal ) {
      
    // Time constants follow uniform distributions here just for testing
     tauA[subj[i]] ~ uniform(1, 100); 
     tauC[subj[i]] ~ uniform(1, 100);
     
     // Priors for omega, theta, A defined as Gamma functions for testing
     omega[subj[i]] ~ gamma(1, 0.001);
     theta[subj[i]] ~ gamma(1, 0.001);
     A[subj[i]] ~ gamma(1, 0.001);
     sigma[subj[i]] ~ gamma(1, 0.001);
  
      // We compute the peak output of the stimulus passing through a filter, this affects our response
      // S_par takes too much time to compute!!!!
      real S_par = Leaky_Output(p1[i]', p2[i]', p3[i]', p4[i]', p5[i]', p6[i]', p7[i]', p8[i]', t0[i,1], t0[i,2], t0[i,3], t0[i,4], t0[i,5], t0[i,6], t0[i,7], t0[i,8], tauA[subj[i]], tauC[subj[i]], polarity[i], t, nsamples);


      // Response: y ~ normal(mu, sigma);
      //y[i] ~ normal( A[subj[i]] / ( 1 + exp( ( (-2*log(9))/omega[subj[i]] ) * (S_par - theta[subj[i]]) ) ) + 0.25 , sigma[subj[i]] );
    
    }  
    
  } // e.o. model
     

Thanks for the help!
Hope I was clear enough and sorry for the sloppiness in my code, I’m taking my first steps in STAN.

Kind regards,
Ignacio

Is this just 1? Similarly for pow(…, 1) is that just the input?

1 Like

And if each subject has the same prior can you move this out of the for loop and use the vectorized versions I.e.

sigma ~ gamma(...)
1 Like

In general I also think Stan (and any gradient based method) will have a hard time sampling from this as max and abs have undefined derivatives

2 Likes

Dear Steve,
Indeed, you’re right, those two are unnecessary operations.
But replacing them doesn’t speed up the process.

I’ve changed the output of the function to be 0 .

      // Get the maximum value
      //S = max(abs(P));
    
      return(0); 

This still takes a lot of time for each iteration.
I also tried commenting out the likelihood and still it takes a lot of time, so it seems that it is the function LPF_SinglePulse taking a lot of time.
If I reduce nsamples to 100, then this runs faster, but unfortunately, it’s not enough samples to compute the response.

There’s some speedup still avaliable in LPF_SinglePulse, but it’s probably not huge. But before going there, I think it’s important to really consider the implications of @stevebronder 's point above about S = max(abs(P)). In general, unless P is constructed in a special way, this thing won’t have continuous derivatives. And if you have discontinuous derivatives anywhere within the “important” part of the posterior, that will wreak havoc on sampling. So it’s highly possible that you are working hard to solve a speed problem in a situation where you are very likely to see sampling problems even if you can speed up your function.

With that said, you can probably get modest speedup by pre-computing and reusing redundant parts of the computation. For i in 1:5, you recompute 1/tgamma(i) eight times, and then recompute it at every likelihood evaluation. Move this computation to transformed data. Then, for j in 1:8, you recompute (t - ti)/tau several times. Just do it once at the beginning of the function, store the result in a temporary variable, and grab the relevant term wherever needed. Finally, because you need all of the powers of (t - ti)/tau from 2 to 4, it is possibly more efficient to compute these recursively by repeated multiplications rather than separate power functions (I’m unsure of whether there’s any speedup at all to be had with this last one, and pretty sure that any speedup must be minor since the highest power you need isn’t very large).

3 Likes

@jsocolar could log sum exp be used here to help a bit? Abs would still be bad. Maybe exp instead? Though that would change the scales

@jsocolar and @stevebronder. Thanks for all the help so far! And also for the explanation on the issues with max and abs.
I’ve tried using LogSumExp as the alternative for the max function. I also removed the Abs function to test with LogSumExp.
Still the time seems to be dominated by the computations in LPF_SinglePulse and depends mainly on the ‘nsamples’ parameter.
Computing each y(i) is multiplying a nsamples (>2000) sample vector many times.
I’ll also check alternatives to the max and abs functions, to avoid the undefined derivatives.

Cache all your 1 / tgamma things so you are only calling those once. Don’t use pow for powers of 0 or 1. That seems to be all the easy stuff. For anything else you’d have to go through the math and see if you can do some clever simplifications.

Yes for instance instead of doing

for t1-t6 in each pow you can just make a vector like vector[N} t_t1 = (t - t1)/tau and reuse the computation.

I didn’t have time to write a full working example, but I have a modded your above a bit to show what I mean. You should also change your input data so it comes in such that it can be accessed in a column-wise fashion since Stan is column major.

Also another thing that might help is revisiting your priors, the gamma(1, 0.001) is very wide. Is there a reason something like a half normal would not work? (i.e. your already constraining it to be positive so a prior of normal(0, 1) would be only the positive side of the normal)

functions{

 vector LPF_SinglePulse(matrix Steps, vector t0_i, real tau, vector t, int nsamples, vector inv_tgamma) {
     // This Function compues the response of a 5th order Leaky integrator to a pulse

     matrix [nsamples, 8] y;
     /*
     vector [nsamples] y2;
     vector [nsamples] y3;
     vector [nsamples] y4;
     vector [nsamples] y5;
     vector [nsamples] y6;
     vector [nsamples] y7;
     vector [nsamples] y8;
     */
// Initial implementation using a for loop in RStan, switched to StanCMD
     //for(k in 1:nsamples){
       //y1[k] =   Step1[k]*( (1.0) - (exp(-(t[k]-t1)/tau)*(1/(tgamma(1))*(pow((t[k]-t1)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t1)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t1)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t1)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t1)/tau,4)))));
       //y2[k] =   Step2[k]*( (1.0) - (exp(-(t[k]-t2)/tau)*(1/(tgamma(1))*(pow((t[k]-t2)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t2)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t2)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t2)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t2)/tau,4)))));
     //  }
     /**
      * TODO: Compute 1 / tgamma(j) in the transformed data block and pass it to function
      */
     for (i in 1:8) {
       vector[nsamples] t_t0 = (t - t0_i[i]) ./ tau;
       y[, i] =  ( (1.0) - (exp(-t_t0) .* inv_tgamma[1] .*
        // 1 replaces (pow(t_t0, 0)) and pow(t_t0, 1) is just t_t0
        1 + inv_tgamma[2] .* t_t0);
        t_t0 *= t_t0;
        y[, i] += inv_tgamma[3] .* t_t0;
        t_t0 *= t_t0;
        y[, i] += inv_tgamma[4] .* t_t0;
        t_t0 *= t_t0;
        y[, i] += inv_tgamma[5] .* t_t0;
        y[, i] *= Step[, i];
     }
     /*
     y1 = Step1.*( (1.0) - (exp(-(t-t1)/tau).*(1/(tgamma(1)).*(pow((t-t1)/tau,0)) + 1/(tgamma(2)).*(pow((t-t1)/tau,1)) + 1/(tgamma(3)).*(pow((t-t1)/tau,2)) + 1/(tgamma(4)).*(pow((t-t1)/tau,3)) + 1/(tgamma(5)).*(pow((t-t1)/tau,4)))));
     y2 = Step2.*( (1.0) - (exp(-(t-t2)/tau).*(1/(tgamma(1)).*(pow((t-t2)/tau,0)) + 1/(tgamma(2)).*(pow((t-t2)/tau,1)) + 1/(tgamma(3)).*(pow((t-t2)/tau,2)) + 1/(tgamma(4)).*(pow((t-t2)/tau,3)) + 1/(tgamma(5)).*(pow((t-t2)/tau,4)))));
     y3 = Step3.*( (1.0) - (exp(-(t-t3)/tau).*(1/(tgamma(1)).*(pow((t-t3)/tau,0)) + 1/(tgamma(2)).*(pow((t-t3)/tau,1)) + 1/(tgamma(3)).*(pow((t-t3)/tau,2)) + 1/(tgamma(4)).*(pow((t-t3)/tau,3)) + 1/(tgamma(5)).*(pow((t-t3)/tau,4)))));
     y4 = Step4.*( (1.0) - (exp(-(t-t4)/tau).*(1/(tgamma(1)).*(pow((t-t4)/tau,0)) + 1/(tgamma(2)).*(pow((t-t4)/tau,1)) + 1/(tgamma(3)).*(pow((t-t4)/tau,2)) + 1/(tgamma(4)).*(pow((t-t4)/tau,3)) + 1/(tgamma(5)).*(pow((t-t4)/tau,4)))));
     y5 = Step5.*( (1.0) - (exp(-(t-t5)/tau).*(1/(tgamma(1)).*(pow((t-t5)/tau,0)) + 1/(tgamma(2)).*(pow((t-t5)/tau,1)) + 1/(tgamma(3)).*(pow((t-t5)/tau,2)) + 1/(tgamma(4)).*(pow((t-t5)/tau,3)) + 1/(tgamma(5)).*(pow((t-t5)/tau,4)))));
     y6 = Step6.*( (1.0) - (exp(-(t-t6)/tau).*(1/(tgamma(1)).*(pow((t-t6)/tau,0)) + 1/(tgamma(2)).*(pow((t-t6)/tau,1)) + 1/(tgamma(3)).*(pow((t-t6)/tau,2)) + 1/(tgamma(4)).*(pow((t-t6)/tau,3)) + 1/(tgamma(5)).*(pow((t-t6)/tau,4)))));
     y7 = Step7.*( (1.0) - (exp(-(t-t7)/tau).*(1/(tgamma(1)).*(pow((t-t7)/tau,0)) + 1/(tgamma(2)).*(pow((t-t7)/tau,1)) + 1/(tgamma(3)).*(pow((t-t7)/tau,2)) + 1/(tgamma(4)).*(pow((t-t7)/tau,3)) + 1/(tgamma(5)).*(pow((t-t7)/tau,4)))));
     y8 = Step8.*( (1.0) - (exp(-(t-t8)/tau).*(1/(tgamma(1)).*(pow((t-t8)/tau,0)) + 1/(tgamma(2)).*(pow((t-t8)/tau,1)) + 1/(tgamma(3)).*(pow((t-t8)/tau,2)) + 1/(tgamma(4)).*(pow((t-t8)/tau,3)) + 1/(tgamma(5)).*(pow((t-t8)/tau,4)))));
     */
     return( (y[, 1]- y[, 2]) + (y[, 3]- y[, 4]) + (y[, 5] - y[, 6]) + (y[, 7] - y[, 8]));

     }


 real Leaky_Output(matrix Steps, real t1, real t2, real t3, real t4, real t5, real t6, real t7, real t8, real tauA, real tauC, real polarity, vector t, int nsamples) {
     // This Function computes the peak output of a 5th order Leaky integrator to a pulse

     // Determine time constant to be used (-1 or 1 is the input to polarity)
     real tau = tauA * ((1+polarity)/2) + tauC * ((polarity-1)/(-2));

     // Compute Step response per pulse
     vector[nsamples] P = LPF_SinglePulse(Steps, t1, t2, t3, t4, t5, t6, t7, t8, tau, t, nsamples);

     // Get the maximum value
     real S = log_sum_exp(P);

     return(S);
   }

 } // e.o. functions


 data {
   int<lower=1> Nsubj ; 	          // subjects
   int<lower=1> Ntotal ; 		  // Number of measurements
   int<lower=1> nsamples ; 	  // Number of samples in time vector
   vector<lower=0>[Ntotal] y ;    // Response vector
   int<lower=1> subj[Ntotal] ;  // subject vector
   vector[Ntotal] polarity; 		      // polarity vector (either 1 or -1)
   vector[Ntotal] Amp; 		    // Stimulus amplitude vector
   int<lower=1> nipi[Ntotal];   // sample inter-pulse interval, per stimulus
   vector[nsamples] t ;              // time vector
   /**
    * Array of pulse samples for each Ntotal
    */
   array[Ntotal] matrix[nsamples, 8] stimulus_data;
   /*
   matrix[nsamples, Ntotal] p1;       // Matrix with stimulus data for pulse 1
   matrix[nsamples, Ntotal] p2;       // Matrix with stimulus data for pulse 2
   matrix[nsamples, Ntotal] p3;       // Matrix with stimulus data for pulse 3
   matrix[nsamples, Ntotal] p4;       // Matrix with stimulus data for pulse 4
   matrix[nsamples, Ntotal] p5;       // Matrix with stimulus data for pulse 5
   matrix[nsamples, Ntotal] p6;       // Matrix with stimulus data for pulse 6
   matrix[nsamples, Ntotal] p7;       // Matrix with stimulus data for pulse 7
   matrix[nsamples, Ntotal] p8;       // Matrix with stimulus data for pulse 8
   */
   matrix[8, Ntotal] t0;              // Matrix with time onsets for each pulse

 } // e.o. data


 parameters {

   // Subject dependent parameters
   vector<lower=1>[Nsubj] tauA; 	    // Time constant for polarity A
   vector<lower=1>[Nsubj]  tauC; 	    // Time constant for polarity C
   vector<lower=1e-15>[Nsubj] A; 		  // Parameter A (ceiling for y)
   vector<lower=1e-15>[Nsubj] theta; 	// Parameter theta (half-width promptness)
   vector<lower=1e-15>[Nsubj] omega;	// Parameter omega (width)
   vector<lower=1e-15>[Nsubj] sigma; 	// Parameter sigma (std of promptness)


 } // e.o. parameters

 model{

   // Time constants follow uniform distributions here just for testing
    tauA ~ uniform(1, 100);
    tauC] ~ uniform(1, 100);

    // Priors for omega, theta, A defined as Gamma functions for testing
    omega ~ gamma(1, 0.001);
    theta ~ gamma(1, 0.001);
    A ~ gamma(1, 0.001);
    // Kind of a huge prior?
    sigma ~ gamma(1, 0.001);
   // loop for all stimuli presented (and responses)

   for ( i in 1:Ntotal ) {

     // We compute the peak output of the stimulus passing through a filter, this affects our response
     // S_par takes too much time to compute!!!!
     real S_par = Leaky_Output(stim_data[i], t0[, i],
        tauA[subj[i]], tauC[subj[i]], polarity[i], t, nsamples);

     // Response: y ~ normal(mu, sigma);
     y[i] ~ normal( A[subj[i]] / ( 1 + exp( ( (-2*log(9))/omega[subj[i]] ) * (S_par - theta[subj[i]]) ) ) + 0.25 , sigma[subj[i]] );

   }

 } // e.o. model
1 Like