Gaussian Process out-of-sample predictive distribution

@anon79882417,

More precisely, my data are shaped as follows:

| observation_number | group_index | time_index | unit_level_covariate_1 | unit_level_covariate_2 | group_level_covariate_1 | response |
|---|---|---|---|---|---|---|
| 1 | 1 | 1 | -0.1 | 0.2 | 0.8 | y_1 |
| 2 | 1 | 1 | 0.4 | -0.6 | 0.8 | y_2 |
| 3 | 2 | 1 | 0.34 | -0.45 | 0.5 | y_3 |
| 4 | 2 | 1 | -0.25 | 0.3 | 0.5 | y_4 |
| 5 | 2 | 1 | 0.24 | 5 | 0.5 | y_5 |
| 6 | 3 | 1 | -0.64 | 5 | 0.2 | y_6 |
| 7 | 1 | 2 | 0.2 | 0.2 | 0.8 | y_7 |
| 8 | 1 | 2 | 0.6 | 0.35 | 0.8 | y_8 |
| 9 | 1 | 2 | 0.8 | 0.54 | 0.8 | y_9 |
| 10 | 2 | 2 | 0.9 | 0.6 | 0.5 | y_{10} |
| 11 | 2 | 2 | 0.2 | 0.5 | 0.5 | y_{11} |
| 12 | 4 | 2 | 0.1 | 0.5 | 0.4 | y_{12} |
| 13 | 4 | 2 | 0.6 | -0.3 | 0.4 | y_{13} |
| 14 | 5 | 2 | 0.7 | 0.9 | 0.1 | y_{14} |

What the unit-level and group-level covariates represent, and their particular values, is immaterial; it suffices to know that they have been standardized to unit scale, as has the response variable y_i.

It is very easy for me to write down a hierarchical random-effects model for these data with a centered parameterization. Let N_u be the number of unit-level covariates and N_g the number of group-level covariates. Then the data-generating model can be written as follows:

\begin{align}
y_i &\sim t(\nu, \mu_i, \phi) \\
\nu &\sim \Gamma(2, 0.1) \\
\phi &= \frac{1}{\theta}, \quad \theta \sim \text{half-}t(4, 0, 1) \\
\mu_i &= \mu_{g[i]} + \sum_{j=1}^{N_u} \beta_j \cdot \text{unit_level_covariate}_{i,j}, \quad \beta_j \sim t(4, 0, 1) \\
\mu_{g} &\sim t\left(4, \ \sum_{j=1}^{N_g} \gamma_j \cdot \text{group_level_covariate}_{g,j}, \ \sigma_g \right), \quad \gamma_j \sim t(4, 0, 1) \\
\sigma_g &\sim \text{half-}t(4, 0, 1)
\end{align}
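
In the Stan code below I actually sample \mu_g in the equivalent non-centered form (the Student-t is a location-scale family), which is what the group_mu_raw parameter implements:

\begin{align}
\mu_g &= \sum_{j=1}^{N_g} \gamma_j \cdot \text{group_level_covariate}_{g,j} + \sigma_g \, \eta_g, \qquad \eta_g \sim t(4, 0, 1)
\end{align}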

The accompanying Stan code looks like this:

// hierarchical model with non-centered parameterization of mu
data {
  int<lower=1> N; //Number of observations
  int<lower=1> J; //Number of groups
  int<lower=1> K; //Number of group predictors
  int<lower=1> O; //Number of observation predictors
  int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
  real response[N]; // The outcome of the data generation process
  matrix[N,O] observation_data; //covariate data of the observations
  matrix[J,K] group_data; //covariate data of the groups
}
parameters {
  real<lower=2> nu;
  real<lower=0> inv_phi;
  vector[O] beta;
  vector[K] gamma;
  vector[J] group_mu_raw;
  real<lower=0> sigma_group_mu;

}
transformed parameters {
  vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
  vector[N] mu =
    group_mu[group_idx] +
    observation_data * beta;

  real phi = 1 / inv_phi;
}
model {
  nu ~ gamma(2, 0.1);
  inv_phi ~ student_t(4, 0, 1); // half-t in effect, because of the <lower=0> constraint

  beta ~ student_t(4, 0, 1);
  gamma ~ student_t(4, 0, 1);

  group_mu_raw ~ student_t(4, 0, 1);
  sigma_group_mu  ~ student_t(4, 0, 1);

  // likelihood function
  response ~ student_t(nu, mu, phi);

}
generated quantities {
  real y_rep[N];
  for (n in 1:N) {
    y_rep[n] = student_t_rng(nu, mu[n], phi);
  }
}

Now, following along with the StanCon tutorial, I saw that we can incorporate time-varying effects by adding a term to the mean of the likelihood. Note that there are multiple observations, and consequently multiple groups, for each time index t. If we place a Gaussian process prior on the time-varying effect, the data-generating model becomes:

\begin{align}
y_i &\sim t(\nu, \mu_i, \phi) \\
\nu &\sim \Gamma(2, 0.1) \\
\phi &= \frac{1}{\theta}, \quad \theta \sim \text{half-}t(4, 0, 1) \\
\mu_i &= w_{t[i]} + \mu_{g[i]} + \sum_{j=1}^{N_u} \beta_j \cdot \text{unit_level_covariate}_{i,j}, \quad \beta_j \sim t(4, 0, 1) \\
w_t &= f_t + \epsilon_t, \quad f \sim GP(0, K) \\
\epsilon_t &\sim N(0, \sigma_t), \quad \sigma_t \sim \text{half-}N(0, 1) \\
\mu_{g} &\sim t\left(4, \ \sum_{j=1}^{N_g} \gamma_j \cdot \text{group_level_covariate}_{g,j}, \ \sigma_g \right), \quad \gamma_j \sim t(4, 0, 1) \\
\sigma_g &\sim \text{half-}t(4, 0, 1)
\end{align}
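
The GP term is also written in non-centered form in the code: the latent function values at the T time indexes are drawn through the Cholesky factor of the (jittered) kernel matrix,

\begin{align}
f &= L z, \qquad L L^\top = K + \text{jitter} \cdot I, \qquad z \sim N(0, I),
\end{align}

which is what the gp_z parameter and the cholesky_decompose call implement.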

As we’ve been discussing, we can specify the covariance kernel to be any stationary kernel we wish and put appropriate priors on its length-scale and amplitude parameters.
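
For reference, the squared-exponential and Matérn 3/2 kernels used below have the standard forms (with amplitude \sigma and length-scale \ell), which as far as I can tell is exactly what Stan's gp_exp_quad_cov and gp_matern32_cov compute:

\begin{align}
k_{\text{SE}}(t, t') &= \sigma^2 \exp\left(-\frac{(t - t')^2}{2 \ell^2}\right) \\
k_{\text{Matern } 3/2}(t, t') &= \sigma^2 \left(1 + \frac{\sqrt{3}\, |t - t'|}{\ell}\right) \exp\left(-\frac{\sqrt{3}\, |t - t'|}{\ell}\right)
\end{align}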

The Stan code for this model looks like the following. Since a sum of covariance kernels is itself a covariance kernel, I use two squared-exponential kernels (one for short-term trends, one for long-term trends) and a Matérn 3/2 kernel for an autoregressive component:

// hierarchical model with non-centered parameterization of mu and a GP time trend
data {
  int<lower=1> N; //Number of observations
  int<lower=1> J; //Number of groups
  int<lower=1> K; //Number of group predictors
  int<lower=1> O; //Number of observation predictors
  int<lower=1> T; //Number of time indexes.
  int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
  int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
  real response[N]; // The outcome of the data generation process
  matrix[N,O] observation_data; //covariate data of the observations
  matrix[J,K] group_data; //covariate data of the groups
}
transformed data {
  real t_gp_vec[T];
  real jitter = 1e-12;
  for (t in 1:T)
    t_gp_vec[t] = t;
}
parameters {
  real<lower=2> nu;
  real<lower=0> inv_phi;
  vector[O] beta;
  vector[K] gamma;
  vector[J] group_mu_raw;
  real<lower=0> sigma_group_mu;

  // GP prior parameters
  // Noise component
  vector[T] t_noise_raw;
  vector[T] gp_z;
  real<lower=0> sigma_noise;

  // Autoregressive GP Component
  real<lower=0> gp_len_ar;
  real<lower=0> sigma_gp_ar;

  // Long-term Trend GP Component
  real<lower=0> gp_len_trend_long;
  real<lower=0> sigma_gp_trend_long;

  // Short-term Trend GP Component
  real<lower=0> gp_len_trend_short;
  real<lower=0> sigma_gp_trend_short;
}
transformed parameters {
  vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
  vector[N] mu =
    group_mu[group_idx] +
    observation_data * beta;

  real phi = 1 / inv_phi;

  vector[T] f;
  vector[T] gp;
  vector[T] t_noise = sigma_noise * t_noise_raw;

  {
    matrix[T, T] C;
    matrix[T, T] L_C;

    C = gp_matern32_cov(t_gp_vec, sigma_gp_ar, gp_len_ar);
    C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short, gp_len_trend_short);
    C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long, gp_len_trend_long);
    for (t in 1:T)
      C[t,t] += jitter;
    L_C = cholesky_decompose(C);
    f = L_C * gp_z;
  }

  // gp is the sum of the per-time-index noise and the latent GP f (trend + autoregressive components)
  gp = t_noise + f;
}
model {
  nu ~ gamma(2, 0.1);
  inv_phi ~ student_t(4, 0, 1);

  beta ~ student_t(4, 0, 1);
  gamma ~ student_t(4, 0, 1);

  group_mu_raw ~ student_t(4, 0, 1);
  sigma_group_mu  ~ student_t(4, 0, 1);

  // GP priors
  // Noise component
  gp_z ~ normal(0, 1);
  sigma_noise ~ normal(0, 1);
  t_noise_raw ~ normal(0, 1);

  // Autoregressive Component
  gp_len_ar ~ inv_gamma(7, 20);
  sigma_gp_ar ~ normal(0, 1);

  // Short-term Trend Component
  gp_len_trend_short ~ inv_gamma(10, 20);
  sigma_gp_trend_short ~ normal(0, 1);

  // Long-term Trend Component
  gp_len_trend_long ~ inv_gamma(7, 20);
  sigma_gp_trend_long ~ normal(0, 1);

  // likelihood function
  response ~ student_t(nu, mu + gp[t_idx], phi);

}
generated quantities {
  real y_rep[N];
  for (n in 1:N) {
    y_rep[n] = student_t_rng(nu, mu[n] + gp[t_idx[n]], phi);
  }
}

The reason I was asking about “what the Gaussian process acts on” is that I need to feed observed values for the process f into the posterior predictive distribution of the Gaussian process, and in turn into the posterior predictive distribution of the response values y_i. That is, I need to know what the observed values \textbf{y} are in Eq. 2.22 here: http://www.gaussianprocess.org/gpml/chapters/RW2.pdf
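
For reference, the predictive equations I mean (Eqs. 2.22-2.24 in that chapter, where X are the training inputs, X_* the test inputs, and \sigma_n^2 the observation noise variance) are

\begin{align}
\mathbf{f}_* \mid X, \mathbf{y}, X_* &\sim N\!\left(\bar{\mathbf{f}}_*, \operatorname{cov}(\mathbf{f}_*)\right) \\
\bar{\mathbf{f}}_* &= K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1} \mathbf{y} \\
\operatorname{cov}(\mathbf{f}_*) &= K(X_*, X_*) - K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1} K(X, X_*),
\end{align}

so the question is what \mathbf{y} (and which inputs X) the GP component of this model is conditioned on.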

From my limited understanding, for the data above, I believe \textbf{y} = \left<\frac{y_1 + y_2 + y_3 + y_4 + y_5 + y_6 }{6}, \frac{y_7 + y_8 + y_9 + y_{10} + y_{11} + y_{12} + y_{13} + y_{14}}{8}\right>, i.e. the average response for each time index t.

The reason I think this is the observed-value vector is that the Gaussian process regresses on the time index t, and so it would fit to the average response at each time index t.

If this is incorrect, or if my use of the analytical form of the posterior predictive distribution for a Gaussian process with Gaussian observations (the time trend) is inappropriate, I apologize.

Clearing up any misconceptions on my part would be greatly appreciated.

If all of the above assumptions are correct, then I believe the code below can be used to generate draws from the posterior predictive distribution.

// standalone posterior predictive program: posterior draws are passed in as data
data {
  int<lower=1> N; //Number of observations
  int<lower=0> J; //Number of groups
  int<lower=0> K; //Number of group predictors
  int<lower=0> O; //Number of observation predictors
  int<lower=1> T; //Number of time indexes.
  int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
  int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
  matrix[N,O] observation_data; //covariate data of the observations
  matrix[J,K] group_data; //covariate data of the groups
  int<lower=0> max_group_idx; //Highest group index fit in model
  int<lower=0> max_t_idx; //Highest time index fit in model
  real response[N]; // The outcome of the data generation process
  real response_by_t[max_t_idx]; //observed average response per time index, fed to the Gaussian process.

  int N_samples; // number of samples drawn from posterior distributions
  real nu[N_samples]; // nu parameter from student t likelihood
  real phi[N_samples]; // standard deviation parameter from student t likelihood
  matrix[N_samples, O] beta; // covariate matrix draws for observations
  matrix[N_samples, K] gamma; // covariate matrix draws for groups
  matrix[N_samples, max_group_idx] group_mu; //posterior draws of the group mu parameter
  real sigma_group_mu[N_samples]; // posterior draws of the hierarchical variance parameter.

  // GP posterior samples
  // Noise component
  matrix[N_samples, max_t_idx] t_noise;
  matrix[N_samples, max_t_idx] gp_z;
  real sigma_noise[N_samples];

  // Auto-regressive component
  real sigma_gp_ar[N_samples];
  real gp_len_ar[N_samples];

  real sigma_gp_trend_short[N_samples];
  real gp_len_trend_short[N_samples];

  real sigma_gp_trend_long[N_samples];
  real gp_len_trend_long[N_samples];
}
transformed data {
  real t1_vec[max_t_idx];
  real t2_vec[T - max_t_idx];
  real jitter = 1e-12;
  for (t in 1:max_t_idx)
    t1_vec[t] = t;
  for (t in 1:T - max_t_idx)
    t2_vec[t] = t + max_t_idx;
}
parameters {
}
model {
}
generated quantities {
  matrix[N_samples, N] y_pred;
  matrix[N_samples, T] gp;

  // generate the gp vector from the posterior samples.
  for(s in 1:N_samples) {
    vector[max_t_idx] f1;
    vector[T - max_t_idx] f2;

    {
       matrix[max_t_idx, max_t_idx] L_C_t1;

       matrix[max_t_idx, max_t_idx] C_t1;
       matrix[max_t_idx, T - max_t_idx] C_t1_t2;
       matrix[T - max_t_idx, max_t_idx] C_t2_t1;
       matrix[T - max_t_idx, T - max_t_idx] C_t2;

       matrix[T - max_t_idx, max_t_idx] A;

       C_t1 = gp_matern32_cov(t1_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t1 += gp_exp_quad_cov(t1_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t1 += gp_exp_quad_cov(t1_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       C_t1_t2 = gp_matern32_cov(t1_vec, t2_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t1_t2 += gp_exp_quad_cov(t1_vec, t2_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t1_t2+= gp_exp_quad_cov(t1_vec, t2_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       C_t2_t1 = gp_matern32_cov(t2_vec, t1_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t2_t1 += gp_exp_quad_cov(t2_vec, t1_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t2_t1 += gp_exp_quad_cov(t2_vec, t1_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       C_t2 = gp_matern32_cov(t2_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t2 += gp_exp_quad_cov(t2_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t2 += gp_exp_quad_cov(t2_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       if (size(t2_vec) > 0) {
         matrix[T - max_t_idx, T - max_t_idx] pred_cov;
         // GP posterior predictive for the out-of-sample time indexes,
         // conditioning on the per-time-index average response with noise variance sigma_noise^2.
         A = C_t2_t1 / (C_t1 + diag_matrix(rep_vector(square(sigma_noise[s]), max_t_idx)));
         pred_cov = C_t2 - A * C_t1_t2;
         for (t in 1:(T - max_t_idx))
           pred_cov[t, t] += jitter; // keep the predictive covariance numerically positive definite
         f2 = multi_normal_rng(A * to_vector(response_by_t), pred_cov);
       }

       for (t in 1:max_t_idx) {
         C_t1[t, t] += jitter;
       }

       // Reconstruct the in-sample gp for t <= max_t_idx by reusing the posterior draws of gp_z and t_noise.
       L_C_t1 = cholesky_decompose(C_t1);
       f1 = L_C_t1 * to_vector(gp_z[s]) + to_vector(t_noise[s]);

       gp[s] = to_row_vector(append_row(f1, f2));
    }
  }

  for(n in 1:N) {
    row_vector[N_samples] group_mu_rep;
    row_vector[N_samples] mu;

    if (group_idx[n] > max_group_idx) {
      for (s in 1:N_samples) {
         group_mu_rep[s] = gamma[s] * to_vector(group_data[group_idx[n]]) + sigma_group_mu[s] * student_t_rng(4, 0, 1);
      }
    }
    else {
      group_mu_rep = to_row_vector(group_mu[,group_idx[n]]);
    }

    mu = group_mu_rep + to_row_vector(beta * to_vector(observation_data[n])) + to_row_vector(gp[,t_idx[n]]);

    y_pred[,n] = to_vector(student_t_rng(nu, mu, phi));

  }
}
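
If it helps, here is a minimal sketch of how I would run the program above from Python with CmdStanPy. The file names hier_gp_predict.stan and pred_data.json are placeholders for the program above and for a JSON file holding its data block (the new covariate data plus the posterior draws extracted from the original fit):

# Minimal sketch (hypothetical file names): run the standalone prediction program
# above with CmdStanPy, passing the posterior draws in as data. Because the
# parameters and model blocks are empty, it must be run with the fixed-parameter sampler.
import json

from cmdstanpy import CmdStanModel

# pred_data.json is assumed to contain everything in the program's data block:
# the covariate data for the points to predict plus the posterior draws
# (nu, phi, beta, gamma, group_mu, GP hyperparameters, gp_z, t_noise, ...).
with open("pred_data.json") as f:
    pred_data = json.load(f)

pred_model = CmdStanModel(stan_file="hier_gp_predict.stan")  # the program above

# A single fixed_param iteration is enough: y_pred already contains one row
# per posterior draw (N_samples of them).
fit = pred_model.sample(data=pred_data, fixed_param=True, chains=1, iter_sampling=1)

y_pred = fit.stan_variable("y_pred")[0]  # shape (N_samples, N)
gp = fit.stan_variable("gp")[0]          # shape (N_samples, T)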