# Speeding Up function execution during sampling

Hello,
I’m having a bit of trouble speeding up my code.
For my model I need the peak output of a 5th order leaky integrator to a series of square pulses.
To get this I’m computing some step responses in the function ‘LPF_SinglePulse’. The expression looks long, since its the analytic solution. I initially used FFT in StanCMD, but computation was slow.
In a separate function named ‘Leaky_Output’ I obtain the maximum absolute value, which is then used to model each response (‘S_par’ in model section).
This parameters is important in my model since it determines an integration time constant for different stimuli.

The problem is that, even though I avoided convolution and switched to multiplication, running this takes a lot of time for a single subject.
Sampling from the priors is fast, but as soon as I include the calculation of ‘S_par’, the whole code runs very slow.
It is worth mentioning that each pulse (‘y1’,‘y2’, … in ‘LPF_SinglePulse’) has 2000 samples (this is the parameter ‘nsamples’).
The number of responses (‘y’) per subject is 440 (this is ‘Ntotal’=440)
I’m wondering if there’s a way to speed up these type of calculations.

`````` functions{

vector LPF_SinglePulse(vector Step1,vector Step2,vector Step3,vector Step4,vector Step5,vector Step6,vector Step7,vector Step8, real t1, real t2, real t3, real t4, real t5, real t6, real t7, real t8, real tau, vector t, int nsamples) {
// This Function compues the response of a 5th order Leaky integrator to a pulse

vector [nsamples] y1;
vector [nsamples] y2;
vector [nsamples] y3;
vector [nsamples] y4;
vector [nsamples] y5;
vector [nsamples] y6;
vector [nsamples] y7;
vector [nsamples] y8;

// Initial implementation using a for loop in RStan, switched to StanCMD
//for(k in 1:nsamples){
//y1[k] =   Step1[k]*( (1.0) - (exp(-(t[k]-t1)/tau)*(1/(tgamma(1))*(pow((t[k]-t1)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t1)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t1)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t1)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t1)/tau,4)))));
//y2[k] =   Step2[k]*( (1.0) - (exp(-(t[k]-t2)/tau)*(1/(tgamma(1))*(pow((t[k]-t2)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t2)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t2)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t2)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t2)/tau,4)))));
//  }

y1 = Step1.*( (1.0) - (exp(-(t-t1)/tau).*(1/(tgamma(1)).*(pow((t-t1)/tau,0)) + 1/(tgamma(2)).*(pow((t-t1)/tau,1)) + 1/(tgamma(3)).*(pow((t-t1)/tau,2)) + 1/(tgamma(4)).*(pow((t-t1)/tau,3)) + 1/(tgamma(5)).*(pow((t-t1)/tau,4)))));
y2 = Step2.*( (1.0) - (exp(-(t-t2)/tau).*(1/(tgamma(1)).*(pow((t-t2)/tau,0)) + 1/(tgamma(2)).*(pow((t-t2)/tau,1)) + 1/(tgamma(3)).*(pow((t-t2)/tau,2)) + 1/(tgamma(4)).*(pow((t-t2)/tau,3)) + 1/(tgamma(5)).*(pow((t-t2)/tau,4)))));
y3 = Step3.*( (1.0) - (exp(-(t-t3)/tau).*(1/(tgamma(1)).*(pow((t-t3)/tau,0)) + 1/(tgamma(2)).*(pow((t-t3)/tau,1)) + 1/(tgamma(3)).*(pow((t-t3)/tau,2)) + 1/(tgamma(4)).*(pow((t-t3)/tau,3)) + 1/(tgamma(5)).*(pow((t-t3)/tau,4)))));
y4 = Step4.*( (1.0) - (exp(-(t-t4)/tau).*(1/(tgamma(1)).*(pow((t-t4)/tau,0)) + 1/(tgamma(2)).*(pow((t-t4)/tau,1)) + 1/(tgamma(3)).*(pow((t-t4)/tau,2)) + 1/(tgamma(4)).*(pow((t-t4)/tau,3)) + 1/(tgamma(5)).*(pow((t-t4)/tau,4)))));
y5 = Step5.*( (1.0) - (exp(-(t-t5)/tau).*(1/(tgamma(1)).*(pow((t-t5)/tau,0)) + 1/(tgamma(2)).*(pow((t-t5)/tau,1)) + 1/(tgamma(3)).*(pow((t-t5)/tau,2)) + 1/(tgamma(4)).*(pow((t-t5)/tau,3)) + 1/(tgamma(5)).*(pow((t-t5)/tau,4)))));
y6 = Step6.*( (1.0) - (exp(-(t-t6)/tau).*(1/(tgamma(1)).*(pow((t-t6)/tau,0)) + 1/(tgamma(2)).*(pow((t-t6)/tau,1)) + 1/(tgamma(3)).*(pow((t-t6)/tau,2)) + 1/(tgamma(4)).*(pow((t-t6)/tau,3)) + 1/(tgamma(5)).*(pow((t-t6)/tau,4)))));
y7 = Step7.*( (1.0) - (exp(-(t-t7)/tau).*(1/(tgamma(1)).*(pow((t-t7)/tau,0)) + 1/(tgamma(2)).*(pow((t-t7)/tau,1)) + 1/(tgamma(3)).*(pow((t-t7)/tau,2)) + 1/(tgamma(4)).*(pow((t-t7)/tau,3)) + 1/(tgamma(5)).*(pow((t-t7)/tau,4)))));
y8 = Step8.*( (1.0) - (exp(-(t-t8)/tau).*(1/(tgamma(1)).*(pow((t-t8)/tau,0)) + 1/(tgamma(2)).*(pow((t-t8)/tau,1)) + 1/(tgamma(3)).*(pow((t-t8)/tau,2)) + 1/(tgamma(4)).*(pow((t-t8)/tau,3)) + 1/(tgamma(5)).*(pow((t-t8)/tau,4)))));

return( (y1-y2) + (y3-y4) + (y5-y6) + (y7-y8));

}

real Leaky_Output(vector Step1,vector Step2, vector Step3,vector Step4, vector Step5,vector Step6, vector Step7,vector Step8, real t1, real t2, real t3, real t4, real t5, real t6, real t7, real t8, real tauA, real tauC, real polarity, vector t, int nsamples) {
// This Function computes the peak output of a 5th order Leaky integrator to a pulse

real S;
real tau;
vector[nsamples] P;

// Determine time constant to be used (-1 or 1 is the input to polarity)
tau = tauA * ((1+polarity)/2) + tauC * ((polarity-1)/(-2));

// Compute Step response per pulse
P = LPF_SinglePulse( Step1, Step2, Step3 , Step4 ,Step5, Step6, Step7 , Step8 ,t1, t2, t3, t4, t5, t6, t7, t8, tau, t, nsamples);

// Get the maximum value
S = max(abs(P));

return(S);
}

} // e.o. functions

data {
int<lower=1> Nsubj ; 	          // subjects
int<lower=1> Ntotal ; 		  // Number of measurements
int<lower=1> nsamples ; 	  // Number of samples in time vector
real<lower=0> y[Ntotal] ;    // Response vector
int<lower=1> subj[Ntotal] ;  // subject vector
real polarity[Ntotal] ; 		      // polarity vector (either 1 or -1)
real Amp[Ntotal] ; 		    // Stimulus amplitude vector
int<lower=1> nipi[Ntotal];   // sample inter-pulse interval, per stimulus
vector[nsamples] t ;              // time vector
matrix[Ntotal,nsamples] p1;       // Matrix with stimulus data for pulse 1
matrix[Ntotal,nsamples] p2;       // Matrix with stimulus data for pulse 2
matrix[Ntotal,nsamples] p3;       // Matrix with stimulus data for pulse 3
matrix[Ntotal,nsamples] p4;       // Matrix with stimulus data for pulse 4
matrix[Ntotal,nsamples] p5;       // Matrix with stimulus data for pulse 5
matrix[Ntotal,nsamples] p6;       // Matrix with stimulus data for pulse 6
matrix[Ntotal,nsamples] p7;       // Matrix with stimulus data for pulse 7
matrix[Ntotal,nsamples] p8;       // Matrix with stimulus data for pulse 8
matrix[Ntotal,8] t0;              // Matrix with time onsets for each pulse

} // e.o. data

parameters {

// Subject dependent parameters
real<lower=1>  tauA[Nsubj]; 	    // Time constant for polarity A
real<lower=1>  tauC[Nsubj]; 	    // Time constant for polarity C
real<lower=0.001>  A[Nsubj]; 		  // Parameter A (ceiling for y)
real<lower=0.001>  theta[Nsubj]; 	// Parameter theta (half-width promptness)
real<lower=0.001>  omega[Nsubj];	// Parameter omega (width)
real<lower=0.001>  sigma[Nsubj]; 	// Parameter sigma (std of promptness)

} // e.o. parameters

model{

// loop for all stimuli presented (and responses)
for ( i in 1:Ntotal ) {

// Time constants follow uniform distributions here just for testing
tauA[subj[i]] ~ uniform(1, 100);
tauC[subj[i]] ~ uniform(1, 100);

// Priors for omega, theta, A defined as Gamma functions for testing
omega[subj[i]] ~ gamma(1, 0.001);
theta[subj[i]] ~ gamma(1, 0.001);
A[subj[i]] ~ gamma(1, 0.001);
sigma[subj[i]] ~ gamma(1, 0.001);

// We compute the peak output of the stimulus passing through a filter, this affects our response
// S_par takes too much time to compute!!!!
real S_par = Leaky_Output(p1[i]', p2[i]', p3[i]', p4[i]', p5[i]', p6[i]', p7[i]', p8[i]', t0[i,1], t0[i,2], t0[i,3], t0[i,4], t0[i,5], t0[i,6], t0[i,7], t0[i,8], tauA[subj[i]], tauC[subj[i]], polarity[i], t, nsamples);

// Response: y ~ normal(mu, sigma);
//y[i] ~ normal( A[subj[i]] / ( 1 + exp( ( (-2*log(9))/omega[subj[i]] ) * (S_par - theta[subj[i]]) ) ) + 0.25 , sigma[subj[i]] );

}

} // e.o. model

``````

Thanks for the help!
Hope I was clear enough and sorry for the sloppiness in my code, I’m taking my first steps in STAN.

Kind regards,
Ignacio

Is this just 1? Similarly for pow(…, 1) is that just the input?

1 Like

And if each subject has the same prior can you move this out of the for loop and use the vectorized versions I.e.

``````sigma ~ gamma(...)
``````
1 Like

In general I also think Stan (and any gradient based method) will have a hard time sampling from this as max and abs have undefined derivatives

2 Likes

Dear Steve,
Indeed, you’re right, those two are unnecessary operations.
But replacing them doesn’t speed up the process.

I’ve changed the output of the function to be 0 .

``````      // Get the maximum value
//S = max(abs(P));

return(0);
``````

This still takes a lot of time for each iteration.
I also tried commenting out the likelihood and still it takes a lot of time, so it seems that it is the function LPF_SinglePulse taking a lot of time.
If I reduce nsamples to 100, then this runs faster, but unfortunately, it’s not enough samples to compute the response.

There’s some speedup still avaliable in `LPF_SinglePulse`, but it’s probably not huge. But before going there, I think it’s important to really consider the implications of @stevebronder 's point above about `S = max(abs(P))`. In general, unless `P` is constructed in a special way, this thing won’t have continuous derivatives. And if you have discontinuous derivatives anywhere within the “important” part of the posterior, that will wreak havoc on sampling. So it’s highly possible that you are working hard to solve a speed problem in a situation where you are very likely to see sampling problems even if you can speed up your function.

With that said, you can probably get modest speedup by pre-computing and reusing redundant parts of the computation. For i in 1:5, you recompute `1/tgamma(i)` eight times, and then recompute it at every likelihood evaluation. Move this computation to transformed data. Then, for j in 1:8, you recompute `(t - ti)/tau` several times. Just do it once at the beginning of the function, store the result in a temporary variable, and grab the relevant term wherever needed. Finally, because you need all of the powers of `(t - ti)/tau` from 2 to 4, it is possibly more efficient to compute these recursively by repeated multiplications rather than separate power functions (I’m unsure of whether there’s any speedup at all to be had with this last one, and pretty sure that any speedup must be minor since the highest power you need isn’t very large).

3 Likes

@jsocolar could log sum exp be used here to help a bit? Abs would still be bad. Maybe exp instead? Though that would change the scales

@jsocolar and @stevebronder. Thanks for all the help so far! And also for the explanation on the issues with max and abs.
I’ve tried using LogSumExp as the alternative for the max function. I also removed the Abs function to test with LogSumExp.
Still the time seems to be dominated by the computations in LPF_SinglePulse and depends mainly on the ‘nsamples’ parameter.
Computing each y(i) is multiplying a nsamples (>2000) sample vector many times.
I’ll also check alternatives to the max and abs functions, to avoid the undefined derivatives.

Cache all your `1 / tgamma` things so you are only calling those once. Don’t use `pow` for powers of 0 or 1. That seems to be all the easy stuff. For anything else you’d have to go through the math and see if you can do some clever simplifications.

Yes for instance instead of doing

for `t1-t6` in each pow you can just make a vector like `vector[N} t_t1 = (t - t1)/tau` and reuse the computation.

I didn’t have time to write a full working example, but I have a modded your above a bit to show what I mean. You should also change your input data so it comes in such that it can be accessed in a column-wise fashion since Stan is column major.

Also another thing that might help is revisiting your priors, the `gamma(1, 0.001)` is very wide. Is there a reason something like a half normal would not work? (i.e. your already constraining it to be positive so a prior of `normal(0, 1)` would be only the positive side of the normal)

``````functions{

vector LPF_SinglePulse(matrix Steps, vector t0_i, real tau, vector t, int nsamples, vector inv_tgamma) {
// This Function compues the response of a 5th order Leaky integrator to a pulse

matrix [nsamples, 8] y;
/*
vector [nsamples] y2;
vector [nsamples] y3;
vector [nsamples] y4;
vector [nsamples] y5;
vector [nsamples] y6;
vector [nsamples] y7;
vector [nsamples] y8;
*/
// Initial implementation using a for loop in RStan, switched to StanCMD
//for(k in 1:nsamples){
//y1[k] =   Step1[k]*( (1.0) - (exp(-(t[k]-t1)/tau)*(1/(tgamma(1))*(pow((t[k]-t1)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t1)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t1)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t1)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t1)/tau,4)))));
//y2[k] =   Step2[k]*( (1.0) - (exp(-(t[k]-t2)/tau)*(1/(tgamma(1))*(pow((t[k]-t2)/tau,0)) + 1/(tgamma(2))*(pow((t[k]-t2)/tau,1)) + 1/(tgamma(3))*(pow((t[k]-t2)/tau,2)) + 1/(tgamma(4))*(pow((t[k]-t2)/tau,3)) + 1/(tgamma(5))*(pow((t[k]-t2)/tau,4)))));
//  }
/**
* TODO: Compute 1 / tgamma(j) in the transformed data block and pass it to function
*/
for (i in 1:8) {
vector[nsamples] t_t0 = (t - t0_i[i]) ./ tau;
y[, i] =  ( (1.0) - (exp(-t_t0) .* inv_tgamma[1] .*
// 1 replaces (pow(t_t0, 0)) and pow(t_t0, 1) is just t_t0
1 + inv_tgamma[2] .* t_t0);
t_t0 *= t_t0;
y[, i] += inv_tgamma[3] .* t_t0;
t_t0 *= t_t0;
y[, i] += inv_tgamma[4] .* t_t0;
t_t0 *= t_t0;
y[, i] += inv_tgamma[5] .* t_t0;
y[, i] *= Step[, i];
}
/*
y1 = Step1.*( (1.0) - (exp(-(t-t1)/tau).*(1/(tgamma(1)).*(pow((t-t1)/tau,0)) + 1/(tgamma(2)).*(pow((t-t1)/tau,1)) + 1/(tgamma(3)).*(pow((t-t1)/tau,2)) + 1/(tgamma(4)).*(pow((t-t1)/tau,3)) + 1/(tgamma(5)).*(pow((t-t1)/tau,4)))));
y2 = Step2.*( (1.0) - (exp(-(t-t2)/tau).*(1/(tgamma(1)).*(pow((t-t2)/tau,0)) + 1/(tgamma(2)).*(pow((t-t2)/tau,1)) + 1/(tgamma(3)).*(pow((t-t2)/tau,2)) + 1/(tgamma(4)).*(pow((t-t2)/tau,3)) + 1/(tgamma(5)).*(pow((t-t2)/tau,4)))));
y3 = Step3.*( (1.0) - (exp(-(t-t3)/tau).*(1/(tgamma(1)).*(pow((t-t3)/tau,0)) + 1/(tgamma(2)).*(pow((t-t3)/tau,1)) + 1/(tgamma(3)).*(pow((t-t3)/tau,2)) + 1/(tgamma(4)).*(pow((t-t3)/tau,3)) + 1/(tgamma(5)).*(pow((t-t3)/tau,4)))));
y4 = Step4.*( (1.0) - (exp(-(t-t4)/tau).*(1/(tgamma(1)).*(pow((t-t4)/tau,0)) + 1/(tgamma(2)).*(pow((t-t4)/tau,1)) + 1/(tgamma(3)).*(pow((t-t4)/tau,2)) + 1/(tgamma(4)).*(pow((t-t4)/tau,3)) + 1/(tgamma(5)).*(pow((t-t4)/tau,4)))));
y5 = Step5.*( (1.0) - (exp(-(t-t5)/tau).*(1/(tgamma(1)).*(pow((t-t5)/tau,0)) + 1/(tgamma(2)).*(pow((t-t5)/tau,1)) + 1/(tgamma(3)).*(pow((t-t5)/tau,2)) + 1/(tgamma(4)).*(pow((t-t5)/tau,3)) + 1/(tgamma(5)).*(pow((t-t5)/tau,4)))));
y6 = Step6.*( (1.0) - (exp(-(t-t6)/tau).*(1/(tgamma(1)).*(pow((t-t6)/tau,0)) + 1/(tgamma(2)).*(pow((t-t6)/tau,1)) + 1/(tgamma(3)).*(pow((t-t6)/tau,2)) + 1/(tgamma(4)).*(pow((t-t6)/tau,3)) + 1/(tgamma(5)).*(pow((t-t6)/tau,4)))));
y7 = Step7.*( (1.0) - (exp(-(t-t7)/tau).*(1/(tgamma(1)).*(pow((t-t7)/tau,0)) + 1/(tgamma(2)).*(pow((t-t7)/tau,1)) + 1/(tgamma(3)).*(pow((t-t7)/tau,2)) + 1/(tgamma(4)).*(pow((t-t7)/tau,3)) + 1/(tgamma(5)).*(pow((t-t7)/tau,4)))));
y8 = Step8.*( (1.0) - (exp(-(t-t8)/tau).*(1/(tgamma(1)).*(pow((t-t8)/tau,0)) + 1/(tgamma(2)).*(pow((t-t8)/tau,1)) + 1/(tgamma(3)).*(pow((t-t8)/tau,2)) + 1/(tgamma(4)).*(pow((t-t8)/tau,3)) + 1/(tgamma(5)).*(pow((t-t8)/tau,4)))));
*/
return( (y[, 1]- y[, 2]) + (y[, 3]- y[, 4]) + (y[, 5] - y[, 6]) + (y[, 7] - y[, 8]));

}

real Leaky_Output(matrix Steps, real t1, real t2, real t3, real t4, real t5, real t6, real t7, real t8, real tauA, real tauC, real polarity, vector t, int nsamples) {
// This Function computes the peak output of a 5th order Leaky integrator to a pulse

// Determine time constant to be used (-1 or 1 is the input to polarity)
real tau = tauA * ((1+polarity)/2) + tauC * ((polarity-1)/(-2));

// Compute Step response per pulse
vector[nsamples] P = LPF_SinglePulse(Steps, t1, t2, t3, t4, t5, t6, t7, t8, tau, t, nsamples);

// Get the maximum value
real S = log_sum_exp(P);

return(S);
}

} // e.o. functions

data {
int<lower=1> Nsubj ; 	          // subjects
int<lower=1> Ntotal ; 		  // Number of measurements
int<lower=1> nsamples ; 	  // Number of samples in time vector
vector<lower=0>[Ntotal] y ;    // Response vector
int<lower=1> subj[Ntotal] ;  // subject vector
vector[Ntotal] polarity; 		      // polarity vector (either 1 or -1)
vector[Ntotal] Amp; 		    // Stimulus amplitude vector
int<lower=1> nipi[Ntotal];   // sample inter-pulse interval, per stimulus
vector[nsamples] t ;              // time vector
/**
* Array of pulse samples for each Ntotal
*/
array[Ntotal] matrix[nsamples, 8] stimulus_data;
/*
matrix[nsamples, Ntotal] p1;       // Matrix with stimulus data for pulse 1
matrix[nsamples, Ntotal] p2;       // Matrix with stimulus data for pulse 2
matrix[nsamples, Ntotal] p3;       // Matrix with stimulus data for pulse 3
matrix[nsamples, Ntotal] p4;       // Matrix with stimulus data for pulse 4
matrix[nsamples, Ntotal] p5;       // Matrix with stimulus data for pulse 5
matrix[nsamples, Ntotal] p6;       // Matrix with stimulus data for pulse 6
matrix[nsamples, Ntotal] p7;       // Matrix with stimulus data for pulse 7
matrix[nsamples, Ntotal] p8;       // Matrix with stimulus data for pulse 8
*/
matrix[8, Ntotal] t0;              // Matrix with time onsets for each pulse

} // e.o. data

parameters {

// Subject dependent parameters
vector<lower=1>[Nsubj] tauA; 	    // Time constant for polarity A
vector<lower=1>[Nsubj]  tauC; 	    // Time constant for polarity C
vector<lower=1e-15>[Nsubj] A; 		  // Parameter A (ceiling for y)
vector<lower=1e-15>[Nsubj] theta; 	// Parameter theta (half-width promptness)
vector<lower=1e-15>[Nsubj] omega;	// Parameter omega (width)
vector<lower=1e-15>[Nsubj] sigma; 	// Parameter sigma (std of promptness)

} // e.o. parameters

model{

// Time constants follow uniform distributions here just for testing
tauA ~ uniform(1, 100);
tauC] ~ uniform(1, 100);

// Priors for omega, theta, A defined as Gamma functions for testing
omega ~ gamma(1, 0.001);
theta ~ gamma(1, 0.001);
A ~ gamma(1, 0.001);
// Kind of a huge prior?
sigma ~ gamma(1, 0.001);
// loop for all stimuli presented (and responses)

for ( i in 1:Ntotal ) {

// We compute the peak output of the stimulus passing through a filter, this affects our response
// S_par takes too much time to compute!!!!
real S_par = Leaky_Output(stim_data[i], t0[, i],
tauA[subj[i]], tauC[subj[i]], polarity[i], t, nsamples);

// Response: y ~ normal(mu, sigma);
y[i] ~ normal( A[subj[i]] / ( 1 + exp( ( (-2*log(9))/omega[subj[i]] ) * (S_par - theta[subj[i]]) ) ) + 0.25 , sigma[subj[i]] );

}

} // e.o. model
``````
1 Like