Gaussian Process out-of-sample predictive distribution

@anon79882417,

More precisely, my data are shaped as follows:

| observation_number | group_index | time_index | unit_level_covariate_1 | unit_level_covariate_2 | group_level_covariate_1 | response |
|---|---|---|---|---|---|---|
| 1 | 1 | 1 | -0.1 | 0.2 | 0.8 | y_1 |
| 2 | 1 | 1 | 0.4 | -0.6 | 0.8 | y_2 |
| 3 | 2 | 1 | 0.34 | -0.45 | 0.5 | y_3 |
| 4 | 2 | 1 | -0.25 | 0.3 | 0.5 | y_4 |
| 5 | 2 | 1 | 0.24 | 5 | 0.5 | y_5 |
| 6 | 3 | 1 | -0.64 | 5 | 0.2 | y_6 |
| 7 | 1 | 2 | 0.2 | 0.2 | 0.8 | y_7 |
| 8 | 1 | 2 | 0.6 | 0.35 | 0.8 | y_8 |
| 9 | 1 | 2 | 0.8 | 0.54 | 0.8 | y_9 |
| 10 | 2 | 2 | 0.9 | 0.6 | 0.5 | y_{10} |
| 11 | 2 | 2 | 0.2 | 0.5 | 0.5 | y_{11} |
| 12 | 4 | 2 | 0.1 | 0.5 | 0.4 | y_{12} |
| 13 | 4 | 2 | 0.6 | -0.3 | 0.4 | y_{13} |
| 14 | 5 | 2 | 0.7 | 0.9 | 0.1 | y_{14} |

What the unit-level and group-level covariates represent, and their particular values, is immaterial; it suffices to know that they have been standardized to unit scale, as has the response variable y_i.

It is very easy for me to write down a hierarchical random-effects model for these data with a centered parameterization. Let N_u be the number of unit-level covariates and N_g the number of group-level covariates. Then the data-generating model can be written as follows:

\begin{align}
y_i &\sim t(\nu, \mu_i, \phi) \\
\nu &\sim \Gamma(2, 0.1) \\
\phi &= \frac{1}{\theta}, \quad \theta \sim \text{half-}t(4, 0, 1) \\
\mu_i &= \mu_{g[i]} + \sum_{j=1}^{N_u} \beta_j \cdot \text{unit_level_covariate}_{i,j}, \quad \beta_j \sim t(4, 0, 1) \\
\mu_{g} &\sim t\left(4, \ \sum_{j=1}^{N_g} \gamma_j \cdot \text{group_level_covariate}_{g,j}, \ \sigma_g \right), \quad \gamma_j \sim t(4, 0, 1) \\
\sigma_g &\sim \text{half-}t(4, 0, 1)
\end{align}
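
In the Stan code below I actually sample \mu_g in the equivalent non-centered form (the Student-t is a location-scale family), which is what the group_mu_raw parameter implements:

\begin{align}
\mu_g &= \sum_{j=1}^{N_g} \gamma_j \cdot \text{group_level_covariate}_{g,j} + \sigma_g \, \eta_g, \qquad \eta_g \sim t(4, 0, 1)
\end{align}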

The accompanying Stan code looks like this:

// hierarchical model with non-centered parameterization of mu
data {
  int<lower=1> N; //Number of observations
  int<lower=1> J; //Number of groups
  int<lower=1> K; //Number of group predictors
  int<lower=1> O; //Number of observation predictors
  int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
  real response[N]; // The outcome of the data generation process
  matrix[N,O] observation_data; //covariate data of the observations
  matrix[J,K] group_data; //covariate data of the groups
}
parameters {
  real<lower=2> nu;
  real<lower=0> inv_phi;
  vector[O] beta;
  vector[K] gamma;
  vector[J] group_mu_raw;
  real<lower=0> sigma_group_mu;

}
transformed parameters {
  vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
  vector[N] mu =
    group_mu[group_idx] +
    observation_data * beta;

  real phi = 1 / inv_phi;
}
model {
  nu ~ gamma(2, 0.1);
  inv_phi ~ student_t(4, 0, 1); // half-t in effect, because of the <lower=0> constraint

  beta ~ student_t(4, 0, 1);
  gamma ~ student_t(4, 0, 1);

  group_mu_raw ~ student_t(4, 0, 1);
  sigma_group_mu  ~ student_t(4, 0, 1);

  // likelihood function
  response ~ student_t(nu, mu, phi);

}
generated quantities {
  real y_rep[N];
  for (n in 1:N) {
    y_rep[n] = student_t_rng(nu, mu[n], phi);
  }
}

Now, following along with the StanCon tutorial, I saw that we can incorporate time-varying effects by adding a term to the mean of the likelihood. Note that there are multiple observations, and consequently multiple groups, for each time index t. If we place a Gaussian process prior on the time-varying effect, the data-generating model becomes:

\begin{align}
y_i &\sim t(\nu, \mu_i, \phi) \\
\nu &\sim \Gamma(2, 0.1) \\
\phi &= \frac{1}{\theta}, \quad \theta \sim \text{half-}t(4, 0, 1) \\
\mu_i &= w_{t[i]} + \mu_{g[i]} + \sum_{j=1}^{N_u} \beta_j \cdot \text{unit_level_covariate}_{i,j}, \quad \beta_j \sim t(4, 0, 1) \\
w_t &= f_t + \epsilon_t, \quad f \sim GP(0, K) \\
\epsilon_t &\sim N(0, \sigma_t), \quad \sigma_t \sim \text{half-}N(0, 1) \\
\mu_{g} &\sim t\left(4, \ \sum_{j=1}^{N_g} \gamma_j \cdot \text{group_level_covariate}_{g,j}, \ \sigma_g \right), \quad \gamma_j \sim t(4, 0, 1) \\
\sigma_g &\sim \text{half-}t(4, 0, 1)
\end{align}
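
The GP term is also written in non-centered form in the code: the latent function values at the T time indexes are drawn through the Cholesky factor of the (jittered) kernel matrix,

\begin{align}
f &= L z, \qquad L L^\top = K + \text{jitter} \cdot I, \qquad z \sim N(0, I),
\end{align}

which is what the gp_z parameter and the cholesky_decompose call implement.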

As we’ve been discussing, we can specify the covariance kernel to be any stationary kernel we wish and put appropriate priors on its length-scale and amplitude parameters.
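
For reference, the squared-exponential and Matérn 3/2 kernels used below have the standard forms (with amplitude \sigma and length-scale \ell), which as far as I can tell is exactly what Stan's gp_exp_quad_cov and gp_matern32_cov compute:

\begin{align}
k_{\text{SE}}(t, t') &= \sigma^2 \exp\left(-\frac{(t - t')^2}{2 \ell^2}\right) \\
k_{\text{Matern } 3/2}(t, t') &= \sigma^2 \left(1 + \frac{\sqrt{3}\, |t - t'|}{\ell}\right) \exp\left(-\frac{\sqrt{3}\, |t - t'|}{\ell}\right)
\end{align}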

The Stan code for this model looks like the following. Since a sum of covariance kernels is itself a covariance kernel, I use two squared-exponential kernels (one for short-term trends, one for long-term trends) and a Matérn 3/2 kernel for an autoregressive component:

// hierarchical model with non-centered parameterization of mu and a GP time trend
data {
  int<lower=1> N; //Number of observations
  int<lower=1> J; //Number of groups
  int<lower=1> K; //Number of group predictors
  int<lower=1> O; //Number of observation predictors
  int<lower=1> T; //Number of time indexes.
  int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
  int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
  real response[N]; // The outcome of the data generation process
  matrix[N,O] observation_data; //covariate data of the observations
  matrix[J,K] group_data; //covariate data of the groups
}
transformed data {
  real t_gp_vec[T];
  real jitter = 1e-12;
  for (t in 1:T)
    t_gp_vec[t] = t;
}
parameters {
  real<lower=2> nu;
  real<lower=0> inv_phi;
  vector[O] beta;
  vector[K] gamma;
  vector[J] group_mu_raw;
  real<lower=0> sigma_group_mu;

  // GP prior parameters
  // Noise component
  vector[T] t_noise_raw;
  vector[T] gp_z;
  real<lower=0> sigma_noise;

  // Autoregressive GP Component
  real<lower=0> gp_len_ar;
  real<lower=0> sigma_gp_ar;

  // Long-term Trend GP Component
  real<lower=0> gp_len_trend_long;
  real<lower=0> sigma_gp_trend_long;

  // Short-term Trend GP Component
  real<lower=0> gp_len_trend_short;
  real<lower=0> sigma_gp_trend_short;
}
transformed parameters {
  vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
  vector[N] mu =
    group_mu[group_idx] +
    observation_data * beta;

  real phi = 1 / inv_phi;

  vector[T] f;
  vector[T] gp;
  vector[T] t_noise = sigma_noise * t_noise_raw;

  {
    matrix[T, T] C;
    matrix[T, T] L_C;

    C = gp_matern32_cov(t_gp_vec, sigma_gp_ar, gp_len_ar);
    C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short, gp_len_trend_short);
    C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long, gp_len_trend_long);
    for (t in 1:T)
      C[t,t] += jitter;
    L_C = cholesky_decompose(C);
    f = L_C * gp_z;
  }

  // gp is the sum of the per-time-index noise and the latent GP f (trend + autoregressive components)
  gp = t_noise + f;
}
model {
  nu ~ gamma(2, 0.1);
  inv_phi ~ student_t(4, 0, 1);

  beta ~ student_t(4, 0, 1);
  gamma ~ student_t(4, 0, 1);

  group_mu_raw ~ student_t(4, 0, 1);
  sigma_group_mu  ~ student_t(4, 0, 1);

  // GP priors
  // Noise component
  gp_z ~ normal(0, 1);
  sigma_noise ~ normal(0, 1);
  t_noise_raw ~ normal(0, 1);

  // Autoregressive Component
  gp_len_ar ~ inv_gamma(7, 20);
  sigma_gp_ar ~ normal(0, 1);

  // Short-term Trend Component
  gp_len_trend_short ~ inv_gamma(10, 20);
  sigma_gp_trend_short ~ normal(0, 1);

  // Long-term Trend Component
  gp_len_trend_long ~ inv_gamma(7, 20);
  sigma_gp_trend_long ~ normal(0, 1);

  // likelihood function
  response ~ student_t(nu, mu + gp[t_idx], phi);

}
generated quantities {
  real y_rep[N];
  for (n in 1:N) {
    y_rep[n] = student_t_rng(nu, mu[n] + gp[t_idx[n]], phi);
  }
}

The reason I was asking about “what the Gaussian process acts on” is that I need to feed observed values for the process f into the posterior predictive distribution of the Gaussian process, and in turn into the posterior predictive distribution of the response values y_i. That is, I need to know what the observed values \textbf{y} are in Eq. 2.22 here: http://www.gaussianprocess.org/gpml/chapters/RW2.pdf
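
For reference, the predictive equations I mean (Eqs. 2.22-2.24 in that chapter, where X are the training inputs, X_* the test inputs, and \sigma_n^2 the observation noise variance) are

\begin{align}
\mathbf{f}_* \mid X, \mathbf{y}, X_* &\sim N\!\left(\bar{\mathbf{f}}_*, \operatorname{cov}(\mathbf{f}_*)\right) \\
\bar{\mathbf{f}}_* &= K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1} \mathbf{y} \\
\operatorname{cov}(\mathbf{f}_*) &= K(X_*, X_*) - K(X_*, X)\left[K(X, X) + \sigma_n^2 I\right]^{-1} K(X, X_*),
\end{align}

so the question is what \mathbf{y} (and which inputs X) the GP component of this model is conditioned on.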

From my limited understanding, for the data above, I believe \textbf{y} = \left<\frac{y_1 + y_2 + y_3 + y_4 + y_5 + y_6 }{6}, \frac{y_7 + y_8 + y_9 + y_{10} + y_{11} + y_{12} + y_{13} + y_{14}}{8}\right>, i.e. the average response for each time index t.

The reason I think this is the observed-value vector is that the Gaussian process regresses on the time index t, and so it would fit to the average response at each time index t.

If this is incorrect, or if my use of the analytical form of the posterior predictive distribution for a Gaussian process with Gaussian observations (the time trend) is inappropriate, I apologize.

Clearing up any misconceptions on my part would be greatly appreciated.

If all of the above assumptions are correct, then I believe the code below can be used to generate draws from the posterior predictive distribution.

// standalone posterior predictive program: posterior draws are passed in as data
data {
  int<lower=1> N; //Number of observations
  int<lower=0> J; //Number of groups
  int<lower=0> K; //Number of group predictors
  int<lower=0> O; //Number of observation predictors
  int<lower=1> T; //Number of time indexes.
  int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
  int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
  matrix[N,O] observation_data; //covariate data of the observations
  matrix[J,K] group_data; //covariate data of the groups
  int<lower=0> max_group_idx; //Highest group index fit in model
  int<lower=0> max_t_idx; //Highest time index fit in model
  real response[N]; // The outcome of the data generation process
  real response_by_t[max_t_idx]; //observed average response per time index, fed to the Gaussian process.

  int N_samples; // number of samples drawn from posterior distributions
  real nu[N_samples]; // nu parameter from student t likelihood
  real phi[N_samples]; // standard deviation parameter from student t likelihood
  matrix[N_samples, O] beta; // covariate matrix draws for observations
  matrix[N_samples, K] gamma; // covariate matrix draws for groups
  matrix[N_samples, max_group_idx] group_mu; //posterior draws of the group mu parameter
  real sigma_group_mu[N_samples]; // posterior draws of the hierarchical variance parameter.

  // GP posterior samples
  // Noise component
  matrix[N_samples, max_t_idx] t_noise;
  matrix[N_samples, max_t_idx] gp_z;
  real sigma_noise[N_samples];

  // Auto-regressive component
  real sigma_gp_ar[N_samples];
  real gp_len_ar[N_samples];

  real sigma_gp_trend_short[N_samples];
  real gp_len_trend_short[N_samples];

  real sigma_gp_trend_long[N_samples];
  real gp_len_trend_long[N_samples];
}
transformed data {
  real t1_vec[max_t_idx];
  real t2_vec[T - max_t_idx];
  real jitter = 1e-12;
  for (t in 1:max_t_idx)
    t1_vec[t] = t;
  for (t in 1:T - max_t_idx)
    t2_vec[t] = t + max_t_idx;
}
parameters {
}
model {
}
generated quantities {
  matrix[N_samples, N] y_pred;
  matrix[N_samples, T] gp;

  // generate the gp vector from the posterior samples.
  for(s in 1:N_samples) {
    vector[max_t_idx] f1;
    vector[T - max_t_idx] f2;

    {
       matrix[max_t_idx, max_t_idx] L_C_t1;

       matrix[max_t_idx, max_t_idx] C_t1;
       matrix[max_t_idx, T - max_t_idx] C_t1_t2;
       matrix[T - max_t_idx, max_t_idx] C_t2_t1;
       matrix[T - max_t_idx, T - max_t_idx] C_t2;

       matrix[T - max_t_idx, max_t_idx] A;

       C_t1 = gp_matern32_cov(t1_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t1 += gp_exp_quad_cov(t1_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t1 += gp_exp_quad_cov(t1_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       C_t1_t2 = gp_matern32_cov(t1_vec, t2_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t1_t2 += gp_exp_quad_cov(t1_vec, t2_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t1_t2+= gp_exp_quad_cov(t1_vec, t2_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       C_t2_t1 = gp_matern32_cov(t2_vec, t1_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t2_t1 += gp_exp_quad_cov(t2_vec, t1_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t2_t1 += gp_exp_quad_cov(t2_vec, t1_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       C_t2 = gp_matern32_cov(t2_vec, sigma_gp_ar[s], gp_len_ar[s]);
       C_t2 += gp_exp_quad_cov(t2_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
       C_t2 += gp_exp_quad_cov(t2_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);

       if (size(t2_vec) > 0) {
         matrix[T - max_t_idx, T - max_t_idx] pred_cov;
         // GP posterior predictive for the out-of-sample time indexes,
         // conditioning on the per-time-index average response with noise variance sigma_noise^2.
         A = C_t2_t1 / (C_t1 + diag_matrix(rep_vector(square(sigma_noise[s]), max_t_idx)));
         pred_cov = C_t2 - A * C_t1_t2;
         for (t in 1:(T - max_t_idx))
           pred_cov[t, t] += jitter; // keep the predictive covariance numerically positive definite
         f2 = multi_normal_rng(A * to_vector(response_by_t), pred_cov);
       }

       for (t in 1:max_t_idx) {
         C_t1[t, t] += jitter;
       }

       // Reconstruct the in-sample gp for t <= max_t_idx by reusing the posterior draws of gp_z and t_noise.
       L_C_t1 = cholesky_decompose(C_t1);
       f1 = L_C_t1 * to_vector(gp_z[s]) + to_vector(t_noise[s]);

       gp[s] = to_row_vector(append_row(f1, f2));
    }
  }

  for(n in 1:N) {
    row_vector[N_samples] group_mu_rep;
    row_vector[N_samples] mu;

    if (group_idx[n] > max_group_idx) {
      for (s in 1:N_samples) {
         group_mu_rep[s] = gamma[s] * to_vector(group_data[group_idx[n]]) + sigma_group_mu[s] * student_t_rng(4, 0, 1);
      }
    }
    else {
      group_mu_rep = to_row_vector(group_mu[,group_idx[n]]);
    }

    mu = group_mu_rep + to_row_vector(beta * to_vector(observation_data[n])) + to_row_vector(gp[,t_idx[n]]);

    y_pred[,n] = to_vector(student_t_rng(nu, mu, phi));

  }
}
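
If it helps, here is a minimal sketch of how I would run the program above from Python with CmdStanPy. The file names hier_gp_predict.stan and pred_data.json are placeholders for the program above and for a JSON file holding its data block (the new covariate data plus the posterior draws extracted from the original fit):

# Minimal sketch (hypothetical file names): run the standalone prediction program
# above with CmdStanPy, passing the posterior draws in as data. Because the
# parameters and model blocks are empty, it must be run with the fixed-parameter sampler.
import json

from cmdstanpy import CmdStanModel

# pred_data.json is assumed to contain everything in the program's data block:
# the covariate data for the points to predict plus the posterior draws
# (nu, phi, beta, gamma, group_mu, GP hyperparameters, gp_z, t_noise, ...).
with open("pred_data.json") as f:
    pred_data = json.load(f)

pred_model = CmdStanModel(stan_file="hier_gp_predict.stan")  # the program above

# A single fixed_param iteration is enough: y_pred already contains one row
# per posterior draw (N_samples of them).
fit = pred_model.sample(data=pred_data, fixed_param=True, chains=1, iter_sampling=1)

y_pred = fit.stan_variable("y_pred")[0]  # shape (N_samples, N)
gp = fit.stan_variable("gp")[0]          # shape (N_samples, T)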