SDE formulation of Gaussian Processes with Kalman filtering in Stan

I wrote a GP model for time series using the SDE formulation proposed by Särkkä & Solin. The covariance function is a sum of short-term, longer-term, and (weekly) periodic covariance functions. It turned out to be more complicated than I expected, but I would appreciate comments on, e.g., the efficiency of the code. As it is, 4 chains of 400 samples with warmup = 150 take about 2 minutes for n = 300, so for that small n the speed-up compared with the batch approach isn’t that great.
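For context, the state-space representation used below (following Särkkä & Solin) writes the GP prior as the stationary solution of a linear SDE and discretizes it exactly between consecutive time points:

$$
\dot{\mathbf{x}}(t) = \mathbf{F}\,\mathbf{x}(t) + \mathbf{L}\,w(t), \qquad
y_k = \mathbf{H}\,\mathbf{x}(t_k) + \varepsilon_k,
$$

$$
\mathbf{A}_k = \exp(\mathbf{F}\,\Delta t_k), \qquad
\mathbf{Q}_k = \mathbf{P}_\infty - \mathbf{A}_k \mathbf{P}_\infty \mathbf{A}_k^\top,
$$

where \(\mathbf{P}_\infty\) is the stationary state covariance, so the Kalman filter evaluates the GP likelihood in O(n) instead of the O(n³) batch cost.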

functions {
  matrix Fmatrix(real ell, int D) {
    // creates matrix F corresponding to a Matern kernel with nu = D - 1/2 
    matrix[D,D] F;
    real lambda = sqrt(2*(D-0.5))/ell;
    for (j in 1:D) {
      F[D,j] = -choose(D,j-1)*lambda^(D-j+1);
      for (i in 1:(D-1)) {
        if(j == i+1)
          F[i,j] = 1;
        else
          F[i,j] = 0;
      }
    }
    return F;
  }
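  // e.g., for D = 2 (Matern nu = 3/2) this gives
  //   F = [         0,         1 ]
  //       [ -lambda^2, -2*lambda ]   with lambda = sqrt(3)/ell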
  matrix Pmatrix(real s2, matrix F){
    // Creates steady state covariance matrix corresponding to F
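    // by solving the Lyapunov equation F*P + P*F' + L*q*L' = 0 in
    // vectorized form, (F (x) I + I (x) F) vec(P) = -vec(L*q*L'),
    // where q is the spectral density of the white-noise input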
    int D = rows(F);
    real lambda = -F[D,D]/D;
    matrix[D,D] P;
    vector[D*D] Pvec;
    matrix[D*D,D*D] XX;
    matrix[D,D] ID = diag_matrix(rep_vector(1,D));
    vector[D*D] Q = rep_vector(0,D*D);
    Q[D*D] = -s2 * tgamma(D)^2 / tgamma(2*D-1) * (2*lambda)^(2*D-1);
    for (i in 1:D) {
      for (k in 1:D) {
        XX[(i-1)*D+1:i*D,(k-1)*D+1:k*D] = F[i,k]*ID;
      }
      XX[(i-1)*D+1:i*D,(i-1)*D+1:i*D] = XX[(i-1)*D+1:i*D,(i-1)*D+1:i*D] + F;
    }
    Pvec = XX\Q;
    for (j in 1:D) {
      P[1:D,j] = Pvec[(j-1)*D+1:j*D];
    }
    return P;
  }

  matrix blockdiag(matrix m1, matrix m2){
    // creates a block diagonal matrix from 2 square matrices
    int d1 = rows(m1);
    int d2 = rows(m2);
    matrix[d1+d2,d1+d2] mm;
    mm[1:d1,1:d1] = m1;
    mm[(d1+1):(d1+d2),(d1+1):(d1+d2)] = m2;
    mm[d1+1:d1+d2,1:d1] = rep_matrix(0,d2,d1);
    mm[1:d1,d1+1:d1+d2] = rep_matrix(0,d1,d2);
    return mm;
  }
  matrix Fseasonal(real omega,int J){
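    // feedback matrix of the state-space form of the periodic kernel:
    // harmonic j is a 2x2 rotation block with angular velocity j*omega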
    matrix[2*J,2*J] F = rep_matrix(0,2*J,2*J);
    for(j in 1:J){
      F[2*j,2*j-1] = omega*j;
      F[2*j-1,2*j] = -F[2*j,2*j-1];
    }
    return F;
  }
  matrix Pseas(int J, real ell){
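    // stationary variances of the harmonics (cf. Solin & Särkkä):
    // q_j^2 = 2*I_j(ell^-2)/exp(ell^-2), I_j = modified Bessel function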
    matrix[2*J,2*J] P= rep_matrix(0,2*J,2*J);
    for(i in 1:J){
      P[2*i,2*i] = 2*modified_bessel_first_kind(i,square(1/ell))/exp(square(1/ell));
      P[2*i-1,2*i-1] = P[2*i,2*i];
    }
    return P;
  }
}
data{
  int N; // n of time points
  int Nobs; // n of observations
  int np; // n of prediction steps
  real y[N]; // observations including dummies
  real  tt[N]; // vector of time points
  int obs[N]; // binary indicator for observations
  int obsind[Nobs]; //time indices of observations
  real pred_step; // prediction time step
}
parameters{
  real mu;
  real<lower=0> ellshort;
  real<lower=0> elllong;
  real<lower=0> ellseas;
  real<lower=0> s2short;
  real<lower=0> s2long;
  real<lower=0> epsilon;
}
transformed parameters{
  matrix[2,2] Fshort = Fmatrix(ellshort,2);
  matrix[3,3] Flong = Fmatrix(elllong,3);
  matrix[4,4] Fseas = Fseasonal(2*pi()/7, 2); // weekly period, J = 2 harmonics
  matrix[9,9] F = blockdiag(blockdiag(Fshort,Flong),Fseas);
  matrix[9,9] Pinf = blockdiag(blockdiag(Pmatrix(s2short,Fshort), Pmatrix(s2long,Flong)),Pseas(2,ellseas));
  matrix[9,9] A;
  matrix[9,9] QQ;
  vector[N] muvec;
  vector[N] Pvec;
  vector[9] mp;
  matrix[9,9] Pp;
  vector[9] K;
  vector[9] m = rep_vector(0,9); 
  matrix[9,9] P = Pinf;
  row_vector[9] H = [1,0,1,0,0,1,0,1,0]; // picks the output state of the short-term, long-term, and two periodic components
  real dt = 0;
  
  // Kalman filter
  for (k in 1:N) {
    if(k==1){
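      // first time point: start from the stationary distribution (m = 0, P = Pinf)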
      mp = m;
      Pp = P;
    } else {
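      // recompute the discretization (A, QQ) only when the time step changes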
      if (dt != (tt[k] - tt[k-1])) {
        dt = tt[k] - tt[k-1];
        A = matrix_exp(F * dt);
        QQ = Pinf - A * Pinf * A';
      }
      mp = A * m;
      Pp = A * P * A' + QQ;
    }
    muvec[k] = H*mp;
    Pvec[k] = H*Pp*H' + epsilon;
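    // measurement update only where an observation is available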
    if (obs[k] == 1) {
      K = Pp * H' / Pvec[k];
      m = mp + K * ((y[k] - mu) - muvec[k]);
      P = Pp - K * H * Pp;
    } else {
      m = mp;
      P = Pp;
    }
  }
}
model{
  mu ~ normal(0,20);
  ellshort ~ inv_gamma(3,2);
  elllong ~ normal(50,10);
  ellseas ~ inv_gamma(3,2);
  epsilon ~ inv_gamma(3,2);
  s2short ~ inv_gamma(3,2);
  s2long ~ inv_gamma(3,2);
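  // one-step-ahead prediction error decomposition of the likelihood;
  // only time points with obs[k] == 1 contribute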
  y[obsind] ~ normal(mu + muvec[obsind], sqrt(Pvec[obsind]));
}
generated quantities {
  vector[np] preds;
  vector[np] simul;
  vector[9] mb = m;
  matrix[9,9] Pb = P;
  matrix[9,9] Ap = matrix_exp(F*pred_step);
  matrix[9,9] Qp = Pinf-Ap*Pinf*Ap';
  vector[9] mpb;
  matrix[9,9] Ppb;
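  // forecast by iterating the discretized transition with a fixed step pred_step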
  for(j in 1:np) {
    mpb = Ap*mb;
    Ppb = Ap*Pb*Ap' + Qp;
    preds[j] = mu + H*mpb;
    simul[j] = normal_rng(preds[j], sqrt(H*Ppb*H' + epsilon) );
    mb = mpb;
    Pb = Ppb;
  }
}

Create an index vector obsidx which has the indices of the observations and then write

    y[obsidx] ~ normal(mu + muvec[obsidx], sqrt(Pvec[obsidx]));

Thanks. I edited the model in my original post as you suggested. I also added a variable pred_step to the data block for defining the prediction step in the generated quantities block. In the previous version the prediction step would always be the last value of dt.

Also, a correction to my first post: inference using this model is much faster than the naive approach. The performance stats above are from RStan on Windows.


Did it change the speed?

It’s known that autodiffing through the Kalman filter loop is not the most efficient approach; with special code it would be possible to compute the gradients more efficiently with one forward and one backward pass. Currently this can be implemented only in C++, but maybe someday we’ll have user-defined gradients also for functions written in the Stan language. The same thing holds for the ODE and other solvers, which have C++ gradient implementations to make them faster.