Hi all,
I’ve been looking at ways to scale parameter inference in dynamical systems, both by brute force (courtesy of MPI & threading) and through approximations. One promising approach is gradient matching with a Gaussian process (GP) fit to the data.
Recently Wenk et al. (2018) gave a thorough treatment of this approach in enough detail that I felt comfortable trying to implement it in Stan. Their paper can be found here: https://arxiv.org/pdf/1804.04378.pdf and their source code in Python here: https://github.com/wenkph/FGPGM.
The gist of the idea is that instead of comparing the data (y) to the true states (x) of a dynamical system generated by the model parameters (\theta), we first fit a GP to the data (f(x, \phi)) and compare the gradients of the optimised GP (\dot{x}|\phi) against the gradients generated by the dynamical system for a given state (\dot{x}|\theta). This sidesteps the need to integrate the entire system, which becomes prohibitive with an increasing number of states (S) or samples (N), as might occur in hierarchical modelling where different instances of a system share hyperparameters. A derivation of the full likelihood is given on pg. 9. [note: any bastardisation of notation and syntax is my own.]
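To make that concrete (my paraphrase of the factorisation on pg. 9, so the same caveat applies): fitting the GP to each normalised state gives

x | \phi ~ N(0, C_\phi),    y | x, \sigma ~ N(x, \sigma^2 I),

and, conditional on x, the GP's own gradients are Gaussian,

\dot{x} | x, \phi ~ N(D x, A),    D = C'^T C_\phi^{-1},    A = C'' - C'^T C_\phi^{-1} C' + \gamma I,

where C_\phi, C' and C'' are the covariances of the states, of the states with the gradients, and of the gradients themselves (the matrices built in covKern below), and \gamma allows for some GP-ODE mismatch. The gradient-matching term then scores the ODE gradients \dot{x}|\theta from dx_dt under this Gaussian.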
Implementing this in Stan was fun (in a twisted sort of way) but resulted in a moderately complicated model that doesn’t work. Below is an example of the lynx-hare system from @Bob_Carpenter 's case study. For S = 2 and N = 100, 2000 iterations take about 20 mins for four chains but give n_{eff} = 2, \hat{R} > 50,000, and unhelpfully wide posterior intervals. The authors, on the other hand, report correct inference in about 50 mins with a single Gibbs chain (which I was dubious about, hence Stan).
If anyone is game to look over the model, I’d really appreciate having any obvious mistakes pointed out. Alternatively, if someone is better placed to comment on the validity of the approach itself, it’d be great to know whether I’m barking up the wrong tree.
Model code is below and attached, along with an R script and data. Optimising N * S GPs can take a while, so a sensible set of toy parameters is already included.
Thanks for the help,
Andrew
gp_grad_match_LV.R (2.2 KB)
gp_grad_match_proto.stan (5.5 KB)
hudson-bay-lynx-hare.csv (470 Bytes)
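Before the model is run, each state's GP hyperparameters (rbf_var, rbf_len and sigma in the data block) are optimised on the normalised data. As a minimal sketch of that step for a single state (just the idea, not the attached script), something like the following could be optimised per state with rstan::optimizing():

data {
  int<lower=1> T;
  vector[T] t;             // observation times
  vector[T] y;             // one normalised state
}
parameters {
  real<lower=0> rbf_var;   // marginal variance of the RBF kernel
  real<lower=0> rbf_len;   // length-scale
  real<lower=0> sigma;     // observation noise
}
model {
  matrix[T, T] K = cov_exp_quad(to_array_1d(t), sqrt(rbf_var), rbf_len)
                   + diag_matrix(rep_vector(square(sigma), T));
  y ~ multi_normal(rep_vector(0, T), K);
}

The gradient-matching model itself follows.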
functions{
  // Function to calculate derivatives
  real[] dx_dt(vector x, real[] theta,
               real[] x_r, int[] x_i) {
    real u = x[1];
    real v = x[2];
    real alpha = theta[1];
    real beta = theta[2];
    real gamma = theta[3];
    real delta = theta[4];
    real du_dt = (alpha - beta * v) * u;
    real dv_dt = (-gamma + delta * u) * v;
    return { du_dt, dv_dt };
  }
  // Function for log determinant of a positive-definite matrix
  // (sum of log Cholesky diagonals is more stable than log of their product)
  real log_cholesky_determinant(matrix x){
    return 2 * sum(log(diagonal(cholesky_decompose(x))));
  }
  // Function to calculate GP-ODE match
  real logProbGrad(vector dx_f, vector x,
                   matrix D, matrix A){
    int T = rows(x);
    vector[T] dx_gp = D * x;
    vector[T] diff = dx_f - dx_gp;
    real prob = -0.5 * dot_product(diff, A \ diff);
    real det = -0.5 * log_cholesky_determinant(A);
    return det + prob;
  }
  // Function to calculate GP-obs match
  real logProbGP(vector y, vector x,
                 matrix C_Phi, real sigma){
    int T = rows(y);
    // Prob of x given GP prior
    real det_phi = -0.5 * log_cholesky_determinant(C_Phi); // determinants are constant
    real p_phi = -0.5 * dot_product(x, C_Phi \ x);
    // Prob of x given data
    real det_y = -T * log(sigma); // = -0.5 * log(det(sigma^2 * I)); could precalc as data
    real p_y = -0.5 * inv_square(sigma) * squared_distance(x, y);
    return det_phi + p_phi + det_y + p_y;
  }
  // Function to calculate log-likelihood
  real logLike(vector[] y, vector[] x, vector y_mean,
               vector y_sd, matrix[,] kernArr, vector sigma,
               real[] pars, real[] x_r, int[] x_i){
    int S = x_i[1];
    int T = x_i[2];
    vector[T] x_raw[S];
    vector[T] dx_f[S];
    real scale = 2 * S * T;
    real ll = 0;
    // Convert to ODE scale
    for(i in 1:S)
      x_raw[i] = y_mean[i] + x[i] * y_sd[i];
    // Generate gradients for each timestep
    for(i in 1:T)
      dx_f[, i] = dx_dt(to_vector(x_raw[, i]), pars, x_r, x_i);
    // Evaluate for each state
    for(i in 1:S){
      real p_ode;
      real p_gp;
      // Normalise gradients to match GP
      dx_f[i] = dx_f[i] / y_sd[i];
      // Compare against GP
      p_ode = logProbGrad(dx_f[i], x[i], kernArr[i, 2], kernArr[i, 3]);
      // Add GP prior
      p_gp = logProbGP(y[i], x[i], kernArr[i, 1], sigma[i]);
      // Increment log-likelihood
      ll = ll + p_ode + p_gp;
    }
    // NB: returns the negative log-likelihood, scaled by 2 * S * T
    return -ll / scale;
  }
  // Function to pre-calculate GP covariance over gradients
  matrix[] covKern(real rbf_var, real rbf_len, vector t,
                   real nugget, real gamma){
    int T = rows(t);
    matrix[T, T] C_Phi;        // Covariance over x
    matrix[T, T] C_Dash;       // Covariance of gradients over x
    matrix[T, T] C_DoubleDash; // Covariance over gradients
    matrix[T, T] D;            // Linear system for gradients
    matrix[T, T] A;            // Combined GP + gradients kernel
    matrix[T, T] kernArr[3];
    for(i in 1:T){
      for(j in 1:T){
        real t_diff = t[i] - t[j];
        real k = rbf_var * exp(-0.5 * t_diff^2 * inv_square(rbf_len));
        C_Phi[i, j] = k;
        C_Dash[i, j] = inv_square(rbf_len) * t_diff * k;
        C_DoubleDash[i, j] = (inv_square(rbf_len) - t_diff^2 / rbf_len^4) * k;
      }
      C_Phi[i, i] = C_Phi[i, i] + nugget; // ensure PSD
    }
    D = C_Dash' / C_Phi;
    A = C_DoubleDash - C_Dash' / C_Phi * C_Dash;
    for(i in 1:T)
      A[i, i] = A[i, i] + gamma; // Allows for GP-ODE mismatch
    // Package matrices
    kernArr[1] = C_Phi;
    kernArr[2] = D;
    kernArr[3] = A;
    return(kernArr);
  }
}
data {
  int<lower=1> N;        // # samples
  int<lower=1> S;        // # states
  int<lower=1> T;        // # observation times
  vector[T] y[N, S];     // normalised data
  vector[S] y_mean[N];   // data mean
  vector[S] y_sd[N];     // data scale
  vector<lower=0>[T] t;  // observation times
  // GP hyperparameters, optimised on normalised data
  vector<lower=0>[S] rbf_var[N];
  vector<lower=0>[S] rbf_len[N];
  vector<lower=0>[S] sigma[N];
  real nugget;
  real gamma;
}
transformed data{
  // Lower bounds for non-negative states
  vector[S] L[N];
  // Real and integer data
  real x_r[0];
  int x_i[2] = {S, T};
  // GP covariance function
  matrix[T, T] kernArr[N, S, 3];
  for(i in 1:N){
    for(j in 1:S){
      kernArr[i, j] = covKern(rbf_var[i, j], rbf_len[i, j], t, nugget, gamma);
      L[i, j] = -y_mean[i, j] / y_sd[i, j];
    }
  }
}
parameters{
  real<lower=0> theta[4];        // parameters of system
  vector<lower=0>[T] x_ub[N, S]; // states as non-negative offsets above lower bound L
}
transformed parameters{
  vector[T] x[N, S]; // True states (normalised)
  // Constrain un-normalised states above zero (x >= L)
  for(i in 1:N)
    for(j in 1:S)
      x[i, j] = L[i, j] + x_ub[i, j];
}
model{
  // Priors
  theta[{1, 3}] ~ normal(1, 0.5);
  theta[{2, 4}] ~ normal(0.05, 0.05);
  for(i in 1:N){
    // Estimate true state
    for(j in 1:S)
      x_ub[i, j] ~ normal(0, 1);
    // Match against observations
    target += logLike(y[i], x[i], y_mean[i], y_sd[i], kernArr[i], sigma[i], theta, x_r, x_i);
  }
}
generated quantities{
  // x are already predicted states,
  // could simulate with ODE solver for comparison
}