More precisely, I have data shaped like the following:
observation_number | group_index | time_index | unit_level_covariate_1 | unit_level_covariate_2 | group_level_covariate_1 | response |
---|---|---|---|---|---|---|
1 | 1 | 1 | -0.1 | 0.2 | 0.8 | y_1 |
2 | 1 | 1 | 0.4 | -0.6 | 0.8 | y_2 |
3 | 2 | 1 | 0.34 | -0.45 | 0.5 | y_3 |
4 | 2 | 1 | -0.25 | 0.3 | 0.5 | y_4 |
5 | 2 | 1 | 0.24 | 5 | 0.5 | y_5 |
6 | 3 | 1 | -0.64 | 5 | 0.2 | y_6 |
7 | 1 | 2 | 0.2 | 0.2 | 0.8 | y_7 |
8 | 1 | 2 | 0.6 | 0.35 | 0.8 | y_8 |
9 | 1 | 2 | 0.8 | 0.54 | 0.8 | y_9 |
10 | 2 | 2 | 0.9 | 0.6 | 0.5 | y_{10} |
11 | 2 | 2 | 0.2 | 0.5 | 0.5 | y_{11} |
12 | 4 | 2 | 0.1 | 0.5 | 0.4 | y_{12} |
13 | 4 | 2 | 0.6 | -0.3 | 0.4 | y_{13} |
14 | 5 | 2 | 0.7 | 0.9 | 0.1 | y_{14} |
What the unit-level and group-level covariates represent, and what their values are, is immaterial; it suffices to know that they have been standardized to unit scale. The same goes for the response variable y_i.
It is very easy for me to describe a hierarchical random effects model for this data with a centered parameterization. Let N_u be the number of unit-level covariates and N_g be the number of group-level covariates. Then the data-generating model can be written as follows:
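In my notation, x_i denotes the unit-level covariates of observation i, u_j the group-level covariates of group j, and j[i] the group of observation i; this is a sketch of what I mean, written to match the Stan code below:

$$
\begin{aligned}
\mu^{\text{group}}_j &\sim \text{Student-}t_4\!\left(u_j^\top \gamma,\ \sigma_{\text{group}}\right), & j &= 1,\dots,J, \\
\mu_i &= \mu^{\text{group}}_{j[i]} + x_i^\top \beta, \\
y_i &\sim \text{Student-}t_\nu\!\left(\mu_i,\ \phi\right), & i &= 1,\dots,N,
\end{aligned}
$$

with \beta \in \mathbb{R}^{N_u}, \gamma \in \mathbb{R}^{N_g}, and \phi = 1/\phi_{\text{inv}}.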
The accompanying stan code would look like this:
// hierarchical model with non-centered parameterization of group_mu
data {
int<lower=1> N; //Number of observations
int<lower=1> J; //Number of groups
int<lower=1> K; //Number of group predictors
int<lower=1> O; //Number of observation predictors
int<lower=1, upper=J> group_idx[N]; // Maps each observation to its group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
}
parameters {
real<lower=2> nu;
real<lower=0> inv_phi;
vector[O] beta;
vector[K] gamma;
vector[J] group_mu_raw;
real<lower=0> sigma_group_mu;
}
transformed parameters {
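// non-centered parameterization: equivalent to group_mu ~ student_t(4, group_data * gamma, sigma_group_mu)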
vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
vector[N] mu =
group_mu[group_idx] +
observation_data * beta;
real phi = 1 / inv_phi;
}
model {
nu ~ gamma(2, 0.1);
inv_phi ~ student_t(4, 0, 1);
beta ~ student_t(4, 0, 1);
gamma ~ student_t(4, 0, 1);
group_mu_raw ~ student_t(4, 0, 1);
sigma_group_mu ~ student_t(4, 0, 1);
// likelihood function
response ~ student_t(nu, mu, phi);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = student_t_rng(nu, mu[n], phi);
}
}
Now, following along with the StanCon tutorial, I saw that we can incorporate time-varying effects by adding an additive term to the mean of the likelihood. Note that there are multiple observations, and consequently multiple groups, for each time index t. If we say that the prior on the time-varying effect is a Gaussian process, the data-generating model becomes:
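In my notation, with f the time-varying effect, \epsilon_t the per-time-index noise, and t[i] the time index of observation i (a sketch of what the Stan code below implements):

$$
\begin{aligned}
f &\sim \mathcal{GP}\!\left(0,\ k(t, t')\right), \qquad \epsilon_t \sim \mathcal{N}\!\left(0,\ \sigma_{\text{noise}}\right), \\
y_i &\sim \text{Student-}t_\nu\!\left(\mu^{\text{group}}_{j[i]} + x_i^\top \beta + f(t[i]) + \epsilon_{t[i]},\ \phi\right).
\end{aligned}
$$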
As we’ve been discussing, we can specify the covariance kernel to be any stationary kernel we wish and put appropriate priors on the length-scale and amplitude parameters of that kernel.
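For reference, the two stationary kernels I use below, each with an amplitude \sigma and a length-scale \ell (the standard forms implemented by Stan's gp_exp_quad_cov and gp_matern32_cov):

$$
k_{\text{SE}}(t, t' \mid \sigma, \ell) = \sigma^2 \exp\!\left(-\frac{(t - t')^2}{2\ell^2}\right),
\qquad
k_{3/2}(t, t' \mid \sigma, \ell) = \sigma^2 \left(1 + \frac{\sqrt{3}\,|t - t'|}{\ell}\right) \exp\!\left(-\frac{\sqrt{3}\,|t - t'|}{\ell}\right).
$$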
The Stan code for this model looks like the following. Since the sum of covariance kernels is itself a covariance kernel, I use two squared exponential kernels (one for short-term trends, one for long-term trends) and a Matern \nu = 3/2 kernel for an autoregressive component:
// hierarchical model with non-centered parameterization of group_mu and a GP over time indices
data {
int<lower=1> N; //Number of observations
int<lower=1> J; //Number of groups
int<lower=1> K; //Number of group predictors
int<lower=1> O; //Number of observation predictors
int<lower=1> T; //Number of time indices
int<lower=1, upper=T> t_idx[N]; // Maps each observation to its time period
int<lower=1, upper=J> group_idx[N]; // Maps each observation to its group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
}
transformed data {
real t_gp_vec[T];
real jitter = 1e-12;
for (t in 1:T)
t_gp_vec[t] = t;
}
parameters {
real<lower=2> nu;
real<lower=0> inv_phi;
vector[O] beta;
vector[K] gamma;
vector[J] group_mu_raw;
real<lower=0> sigma_group_mu;
// GP prior parameters
// Noise component
vector[T] t_noise_raw;
vector[T] gp_z;
real<lower=0> sigma_noise;
// Autoregressive GP Component
real<lower=0> gp_len_ar;
real<lower=0> sigma_gp_ar;
// Long-term Trend GP Component
real<lower=0> gp_len_trend_long;
real<lower=0> sigma_gp_trend_long;
// Short-term Trend GP Component
real<lower=0> gp_len_trend_short;
real<lower=0> sigma_gp_trend_short;
}
transformed parameters {
vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
vector[N] mu =
group_mu[group_idx] +
observation_data * beta;
real phi = 1 / inv_phi;
vector[T] f;
vector[T] gp;
vector[T] t_noise = sigma_noise * t_noise_raw;
{
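// build the composite covariance over the time indices, then draw the latent
// GP values non-centered: f = L_C * gp_z with gp_z ~ normal(0, 1)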
matrix[T, T] C;
matrix[T, T] L_C;
C = gp_matern32_cov(t_gp_vec, sigma_gp_ar, gp_len_ar);
C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short, gp_len_trend_short);
C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long, gp_len_trend_long);
for (t in 1:T)
C[t,t] += jitter;
L_C = cholesky_decompose(C);
f = L_C * gp_z;
}
// gp is the sum of the per-time-index noise, the smoothly varying trend processes, and the autoregressive process
gp = t_noise + f;
}
model {
nu ~ gamma(2, 0.1);
inv_phi ~ student_t(4, 0, 1);
beta ~ student_t(4, 0, 1);
gamma ~ student_t(4, 0, 1);
group_mu_raw ~ student_t(4, 0, 1);
sigma_group_mu ~ student_t(4, 0, 1);
// GP priors
// Noise component
gp_z ~ normal(0, 1);
sigma_noise ~ normal(0, 1);
t_noise_raw ~ normal(0, 1);
// Autoregressive Component
gp_len_ar ~ inv_gamma(7, 20);
sigma_gp_ar ~ normal(0, 1);
// Short-term Trend Component
gp_len_trend_short ~ inv_gamma(10, 20);
sigma_gp_trend_short ~ normal(0, 1);
// Long-term Trend Component
gp_len_trend_long ~ inv_gamma(7, 20);
sigma_gp_trend_long ~ normal(0, 1);
// likelihood function
response ~ student_t(nu, mu + gp[t_idx], phi);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = student_t_rng(nu, mu[n] + gp[t_idx[n]], phi);
}
}
The reason I was asking about “what the Gaussian process acts on” is that I need to feed observed values for the process f into the posterior-predictive distribution for the Gaussian process, and in turn into the posterior-predictive distribution for the observed response values y_i. That is, I need to know what the observed values \textbf{y} are in eq. 2.22 found here: http://www.gaussianprocess.org/gpml/chapters/RW2.pdf
From my limited understanding, for the data above, I believe \textbf{y} = \left<\frac{y_1 + y_2 + y_3 + y_4 + y_5 + y_6 }{6}, \frac{y_7 + y_8 + y_9 + y_{10} + y_{11} + y_{12} + y_{13} + y_{14}}{8}\right>, i.e. the average response at each time index t.
The reason I think this is the observed-value vector is that the Gaussian process regresses on the time index t, so it would be fit to the average response at each time index.
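Concretely, in the notation of the prediction code below (C_t1 for the covariance among observed time indices, C_t2_t1 and C_t1_t2 for the cross-covariances, C_t2 for the covariance among new time indices, and sigma_noise for the per-index noise scale), the Gaussian predictive I am trying to apply is my transcription of that standard result:

$$
f_{t_2} \mid \textbf{y} \sim \mathcal{N}\!\left(A\,\textbf{y},\ C_{t_2} - A\,C_{t_1 t_2}\right),
\qquad
A = C_{t_2 t_1}\left(C_{t_1} + \sigma_{\text{noise}}^2 I\right)^{-1}.
$$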
If this is incorrect, or if my use of the analytical form of the posterior-predictive distribution for a Gaussian process with Gaussian observations (the time trend) is inappropriate, I apologize. Clearing up any misconceptions on my part would be greatly appreciated.
If all of the above assumptions are correct, then I believe the code below can be used to generate draws from the posterior-predictive distribution.
// posterior-predictive sampling for the hierarchical GP model (generated quantities only)
data {
int<lower=1> N; //Number of observations
int<lower=0> J; //Number of groups
int<lower=0> K; //Number of group predictors
int<lower=0> O; //Number of observation predictors
int<lower=1> T; //Number of time indices
int<lower=1, upper=T> t_idx[N]; // Maps each observation to its time period
int<lower=0, upper=J> group_idx[N]; // Maps each observation to its group
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
int<lower=0> max_group_idx; //Highest group index fit in model
int<lower=0> max_t_idx; //Highest time index fit in model
real response[N]; // The outcome of the data generation process
real response_by_t[max_t_idx]; // observed average response at each time index, used as the GP's observed values
int N_samples; // number of samples drawn from posterior distributions
real nu[N_samples]; // posterior draws of the student-t degrees of freedom
real phi[N_samples]; // posterior draws of the student-t scale parameter
matrix[N_samples, O] beta; // covariate matrix draws for observations
matrix[N_samples, K] gamma; // covariate matrix draws for groups
matrix[N_samples, max_group_idx] group_mu; //posterior draws of the group mu parameter
real sigma_group_mu[N_samples]; // posterior draws of the hierarchical scale parameter.
// GP posterior samples
// Noise component
matrix[N_samples, max_t_idx] t_noise;
matrix[N_samples, max_t_idx] gp_z;
real sigma_noise[N_samples];
// Auto-regressive component
real sigma_gp_ar[N_samples];
real gp_len_ar[N_samples];
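// Short-term trend component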
real sigma_gp_trend_short[N_samples];
real gp_len_trend_short[N_samples];
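// Long-term trend component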
real sigma_gp_trend_long[N_samples];
real gp_len_trend_long[N_samples];
}
transformed data {
real t1_vec[max_t_idx];
real t2_vec[T - max_t_idx];
real jitter = 1e-12;
for (t in 1:max_t_idx)
t1_vec[t] = t;
for (t in 1:T - max_t_idx)
t2_vec[t] = t + max_t_idx;
}
parameters {
}
model {
}
generated quantities {
matrix[N_samples, N] y_pred;
matrix[N_samples, T] gp;
// generate the gp vector from the posterior samples.
for(s in 1:N_samples) {
vector[max_t_idx] f1;
vector[T - max_t_idx] f2;
{
matrix[max_t_idx, max_t_idx] L_C_t1;
matrix[max_t_idx, max_t_idx] C_t1;
matrix[max_t_idx, T - max_t_idx] C_t1_t2;
matrix[T - max_t_idx, max_t_idx] C_t2_t1;
matrix[T - max_t_idx, T - max_t_idx] C_t2;
matrix[T - max_t_idx, max_t_idx] A;
C_t1 = gp_matern32_cov(t1_vec, sigma_gp_ar[s], gp_len_ar[s]);
C_t1 += gp_exp_quad_cov(t1_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
C_t1 += gp_exp_quad_cov(t1_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);
C_t1_t2 = gp_matern32_cov(t1_vec, t2_vec, sigma_gp_ar[s], gp_len_ar[s]);
C_t1_t2 += gp_exp_quad_cov(t1_vec, t2_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
C_t1_t2+= gp_exp_quad_cov(t1_vec, t2_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);
C_t2_t1 = gp_matern32_cov(t2_vec, t1_vec, sigma_gp_ar[s], gp_len_ar[s]);
C_t2_t1 += gp_exp_quad_cov(t2_vec, t1_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
C_t2_t1 += gp_exp_quad_cov(t2_vec, t1_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);
C_t2 = gp_matern32_cov(t2_vec, sigma_gp_ar[s], gp_len_ar[s]);
C_t2 += gp_exp_quad_cov(t2_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
C_t2 += gp_exp_quad_cov(t2_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);
if (size(t2_vec) > 0) {
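// GP predictive for the unseen time indices, conditioning on the observed
// per-index averages: mean = A * y_bar, cov = C_t2 - A * C_t1_t2,
// with A = C_t2_t1 * (C_t1 + sigma_noise^2 * I)^{-1}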
A = C_t2_t1 * inverse(C_t1 + diag_matrix(rep_vector(square(sigma_noise[s]), max_t_idx)));
f2 = multi_normal_rng(A * to_vector(response_by_t), C_t2 - A * C_t1_t2);
}
for (t in 1:max_t_idx) {
C_t1[t, t] += jitter;
}
// Reuse the posterior draws of gp_z and t_noise to reconstruct f1 at the observed time indices.
L_C_t1 = cholesky_decompose(C_t1);
f1 = L_C_t1 * to_vector(gp_z[s]) + to_vector(t_noise[s]);
gp[s] = to_row_vector(append_row(f1, f2));
}
}
for(n in 1:N) {
row_vector[N_samples] group_mu_rep;
row_vector[N_samples] mu;
if (group_idx[n] > max_group_idx) {
for (s in 1:N_samples) {
group_mu_rep[s] = gamma[s] * to_vector(group_data[group_idx[n]]) + sigma_group_mu[s] * student_t_rng(4, 0, 1);
}
}
else {
group_mu_rep = to_row_vector(group_mu[,group_idx[n]]);
}
mu = group_mu_rep + to_row_vector(beta * to_vector(observation_data[n])) + to_row_vector(gp[,t_idx[n]]);
y_pred[,n] = to_vector(student_t_rng(nu, mu, phi));
}
}