I’m trying to extend a model I built previously by adding Gaussian processes.
The data I have are t-distributed and can be grouped hierarchically, so I model them with random effects. In addition, the observations can also be grouped together by week.
I’m trying to take my random-effects model, which uses unit-level and group-level covariates, and add a term to the mean function of the likelihood that models the time trend.
I was able to write a Stan program that I believe achieves this through Gaussian processes. The function gp, indexed by week, is the sum of a long-term trend, a short-term trend, an autoregressive component, and Gaussian white noise. I believe this program correctly models what I want to achieve. The function gp is fed into the mean of the likelihood distribution along with the unit- and group-level coefficients.
I’m not entirely sure I’m drawing from the multivariate normal distribution correctly, i.e. I might not be combining the kernels correctly. Is anyone able to verify?
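For context, here is the structure I have in mind (my notation). Each component is an independent zero-mean GP, and, as I understand it, a sum of independent GPs is itself a GP whose kernel is the sum of the component kernels:

$$f_{long} \sim \mathcal{GP}(0, k_{long}), \quad f_{short} \sim \mathcal{GP}(0, k_{short}), \quad f_{ar} \sim \mathcal{GP}(0, k_{ar})$$

$$f = f_{long} + f_{short} + f_{ar} \sim \mathcal{GP}(0,\; k_{long} + k_{short} + k_{ar})$$

On the finite grid of weeks, f can therefore be drawn either component-by-component (each with its own Cholesky factor, as in the program below) or all at once as $f = Lz$ with $z \sim N(0, I)$ and $LL^\top = K_{long} + K_{short} + K_{ar}$. The likelihood is then

$$response_n \sim \text{Student-}t\left(\nu,\; \mu_{group[n]} + x_n^\top \beta + f_{t[n]} + \epsilon_{t[n]},\; \phi\right), \qquad \epsilon_t \sim N(0, \sigma_{noise})$$

where $\epsilon_t$ is the weekly white noise.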
The program is shown below:
// hierarchical model with non-centered parameterization of mu
data {
int<lower=1> N; //Number of observations
int<lower=1> J; //Number of groups
int<lower=1> K; //Number of group predictors
int<lower=1> O; //Number of observation predictors
int<lower=1> T; //Number of time indexes.
int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
}
transformed data {
real t_gp_vec[T];
real jitter = 1e-12;
for (t in 1:T)
t_gp_vec[t] = t;
}
parameters {
real<lower=2> nu;
real<lower=0> inv_phi;
vector[O] beta;
vector[K] gamma;
vector[J] group_mu_raw;
real<lower=0> sigma_group_mu;
// GP prior parameters
// Noise component
vector[T] t_noise_raw;
real<lower=0> sigma_noise;
// Autoregressive GP Component
vector[T] gp_raw_ar;
real<lower=0> gp_len_ar;
real<lower=0> sigma_gp_ar;
// Long-term Trend GP Component
vector[T] gp_raw_trend_long;
real<lower=0> gp_len_trend_long;
real<lower=0> sigma_gp_trend_long;
// Short-term Trend GP Component
vector[T] gp_raw_trend_short;
real<lower=0> gp_len_trend_short;
real<lower=0> sigma_gp_trend_short;
}
transformed parameters {
vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
vector[N] mu =
group_mu[group_idx] +
observation_data * beta;
real phi = 1 / inv_phi;
vector[T] gp_ar;
vector[T] gp_trend_short;
vector[T] gp_trend_long;
vector[T] gp;
vector[T] t_noise = sigma_noise * t_noise_raw;
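// Each component below is drawn non-centered: f = L_C * z with z ~ normal(0, 1),
// where L_C is the Cholesky factor of that component's covariance matrix.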
{
matrix[T, T] C = gp_matern32_cov(t_gp_vec, sigma_gp_ar, gp_len_ar);
matrix[T, T] L_C;
for (t in 1:T)
C[t,t] += jitter;
L_C = cholesky_decompose(C);
gp_ar = L_C * gp_raw_ar;
}
{
matrix[T, T] C = gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short, gp_len_trend_short);
matrix[T, T] L_C;
for (t in 1:T)
C[t,t] += jitter;
L_C = cholesky_decompose(C);
gp_trend_short = L_C * gp_raw_trend_short;
}
{
matrix[T, T] C = gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long, gp_len_trend_long);
matrix[T, T] L_C;
for (t in 1:T)
C[t,t] += jitter;
L_C = cholesky_decompose(C);
gp_trend_long = L_C * gp_raw_trend_long;
}
// gp is the sum of the weekly noise, the smooth trend processes, and the autoregressive process
gp = t_noise + gp_ar + gp_trend_short + gp_trend_long;
}
model {
nu ~ gamma(2, 0.1);
inv_phi ~ student_t(4, 0, 1);
beta ~ student_t(4, 0, 1);
gamma ~ student_t(4, 0, 1);
group_mu_raw ~ student_t(4, 0, 1);
sigma_group_mu ~ student_t(4, 0, 1);
// GP priors
// Noise component
sigma_noise ~ normal(0, 1);
t_noise_raw ~ normal(0, 1);
// Autoregressive Component
gp_raw_ar ~ normal(0, 1);
gp_len_ar ~ inv_gamma(5, 5);
sigma_gp_ar ~ normal(0, 1);
// Short-term Trend Component
gp_raw_trend_short ~ normal(0, 1);
gp_len_trend_short ~ inv_gamma(5, 5);
sigma_gp_trend_short ~ normal(0, 1);
// Long-term Trend Component
gp_raw_trend_long ~ normal(0, 1);
gp_len_trend_long ~ inv_gamma(5, 5);
sigma_gp_trend_long ~ normal(0, 1);
// likelihood function
response ~ student_t(nu, mu + gp[t_idx], phi);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = student_t_rng(nu, mu[n] + gp[t_idx[n]], phi);
}
}
However, I am now trying to generate out-of-sample predictions from the model and am running into some difficulty. I was wondering whether the code below is the right way to generate those predictions. In particular, I’m not sure I’m handling the GP part correctly.
// hierarchical model with non-centered parameterization of mu
data {
int<lower=1> N; //Number of observations
int<lower=0> J; //Number of groups
int<lower=0> K; //Number of group predictors
int<lower=0> O; //Number of observation predictors
int<lower=1> T; //Number of time indexes.
int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
int<lower=0, upper=J> group_idx[N]; // Maps each observation to each group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
int<lower=0> max_group_idx; //Highest group index fit in model
int<lower=0> max_t_idx; //Highest time index fit in model
int N_samples; // number of samples drawn from posterior distributions
real nu[N_samples]; // nu parameter from student t likelihood
real phi[N_samples]; // standard deviation parameter from student t likelihood
matrix[N_samples, O] beta; // covariate matrix draws for observations
matrix[N_samples, K] gamma; // covariate matrix draws for groups
matrix[N_samples, max_group_idx] group_mu; //posterior draws of the group mu parameter
real sigma_group_mu[N_samples]; // posterior draws of the hierarchical variance parameter.
// GP posterior samples
// Noise component
matrix[N_samples, max_t_idx] t_noise;
real sigma_noise[N_samples];
// Auto-regressive component
real sigma_gp_ar[N_samples];
real gp_len_ar[N_samples];
matrix[N_samples, max_t_idx] gp_raw_ar;
real sigma_gp_trend_short[N_samples];
real gp_len_trend_short[N_samples];
matrix[N_samples, max_t_idx] gp_raw_trend_short;
real sigma_gp_trend_long[N_samples];
real gp_len_trend_long[N_samples];
matrix[N_samples, max_t_idx] gp_raw_trend_long;
}
transformed data {
real t_gp_vec[T];
real jitter = 1e-12;
for (t in 1:T)
t_gp_vec[t] = t;
}
parameters {
}
model {
}
generated quantities {
matrix[N_samples, N] y_pred;
matrix[N_samples, T] gp;
// generate the gp vector from the posterior samples.
for(s in 1:N_samples) {
vector[T] gp_ar;
vector[T] gp_trend_short;
vector[T] gp_trend_long;
vector[T] gp_raw_ar_full;
vector[T] gp_raw_trend_short_full;
vector[T] gp_raw_trend_long_full;
vector[T] t_noise_full;
for(t in 1:max_t_idx){
gp_raw_ar_full[t] = gp_raw_ar[s, t];
gp_raw_trend_short_full[t] = gp_raw_trend_short[s, t];
gp_raw_trend_long_full[t] = gp_raw_trend_long[s, t];
t_noise_full[t] = t_noise[s, t];
}
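// Pad future weeks with fresh standard-normal innovations. Because the
// Cholesky factors below are lower triangular, the first max_t_idx elements
// of L_C * z depend only on the first max_t_idx innovations, so the
// in-sample component values are reproduced exactly (assuming the prediction
// time grid extends the fitting grid in the same order).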
for(t in max_t_idx+1:T){
gp_raw_ar_full[t] = normal_rng(0, 1);
gp_raw_trend_short_full[t] = normal_rng(0, 1);
gp_raw_trend_long_full[t] = normal_rng(0, 1);
t_noise_full[t] = normal_rng(0, sigma_noise[s]);
}
{
matrix[T, T] C = gp_matern32_cov(t_gp_vec, sigma_gp_ar[s], gp_len_ar[s]);
matrix[T, T] L_C;
for (t in 1:T)
C[t, t] += jitter;
L_C = cholesky_decompose(C);
gp_ar = L_C * gp_raw_ar_full;
}
{
matrix[T, T] C = gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
matrix[T, T] L_C;
for (t in 1:T)
C[t, t] += jitter;
L_C = cholesky_decompose(C);
gp_trend_short = L_C * gp_raw_trend_short_full;
}
{
matrix[T, T] C = gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);
matrix[T, T] L_C;
for (t in 1:T)
C[t, t] += jitter;
L_C = cholesky_decompose(C);
gp_trend_long = L_C * gp_raw_trend_long_full;
}
gp[s] = to_row_vector(t_noise_full + gp_ar + gp_trend_short + gp_trend_long);
}
for(n in 1:N) {
row_vector[N_samples] group_mu_rep;
row_vector[N_samples] mu;
if (group_idx[n] > max_group_idx) {
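// Group not seen during fitting: draw its intercept from the population
// model, i.e. the group-level regression plus a fresh random effect from
// the same student_t(4, 0, 1) prior placed on group_mu_raw in the fit.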
for (s in 1:N_samples) {
group_mu_rep[s] = gamma[s] * to_vector(group_data[group_idx[n]]) + sigma_group_mu[s] * student_t_rng(4, 0, 1);
}
}
else {
group_mu_rep = to_row_vector(group_mu[,group_idx[n]]);
}
mu = group_mu_rep + to_row_vector(beta * to_vector(observation_data[n])) + to_row_vector(gp[,t_idx[n]]);
y_pred[,n] = to_vector(student_t_rng(nu, mu, phi));
}
}
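For comparison, here is a sketch of the standard analytic alternative, adapted from the GP prediction example in the Stan User’s Guide: instead of padding the innovation vector, condition the summed-kernel GP on the latent values already sampled at the fitted weeks. The function name gp_pred_rng and its arguments are mine, not part of the model above; it would sit in a functions block, with f1 being a posterior draw of the smooth part of gp (i.e. without t_noise) at the fitted weeks t1:
functions {
  // Draw f at new weeks t2 given latent values f1 at fitted weeks t1, using
  // the multivariate-normal conditional under the combined (summed) kernel.
  vector gp_pred_rng(real[] t2, vector f1, real[] t1,
                     real sigma_ar, real len_ar,
                     real sigma_short, real len_short,
                     real sigma_long, real len_long,
                     real jitter) {
    int T1 = size(t1);
    int T2 = size(t2);
    vector[T2] f2;
    {
      // Combined kernel: Matern 3/2 plus the two squared-exponential trends
      matrix[T1, T1] K11 = gp_matern32_cov(t1, sigma_ar, len_ar)
                           + gp_exp_quad_cov(t1, sigma_short, len_short)
                           + gp_exp_quad_cov(t1, sigma_long, len_long);
      matrix[T1, T2] K12 = gp_matern32_cov(t1, t2, sigma_ar, len_ar)
                           + gp_exp_quad_cov(t1, t2, sigma_short, len_short)
                           + gp_exp_quad_cov(t1, t2, sigma_long, len_long);
      matrix[T2, T2] K22 = gp_matern32_cov(t2, sigma_ar, len_ar)
                           + gp_exp_quad_cov(t2, sigma_short, len_short)
                           + gp_exp_quad_cov(t2, sigma_long, len_long);
      matrix[T1, T1] L11;
      vector[T1] K11_inv_f1;
      matrix[T1, T2] v;
      for (t in 1:T1)
        K11[t, t] += jitter;
      L11 = cholesky_decompose(K11);
      // K11^{-1} * f1 via two triangular solves
      K11_inv_f1 = mdivide_right_tri_low(mdivide_left_tri_low(L11, f1)', L11)';
      v = mdivide_left_tri_low(L11, K12);
      // Conditional mean K21 * K11^{-1} * f1, covariance K22 - K21 * K11^{-1} * K12
      f2 = multi_normal_rng(K12' * K11_inv_f1,
                            K22 - v' * v + diag_matrix(rep_vector(jitter, T2)));
    }
    return f2;
  }
}
Since t_noise is white, future weeks would still get independent normal_rng(0, sigma_noise[s]) draws added on top of f2. As far as I can tell, this is mathematically equivalent to padding the innovation vector and multiplying by the full lower-triangular Cholesky factor, as the code above does.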
Is there any literature on combining multiple kernels to generate out-of-sample data, especially when the observation process is not normally distributed?
UPDATE: here is what I now believe to be the correct process. Rather than drawing each component from its own Cholesky factor, I sum the three kernels into a single covariance matrix, decompose it once, and drive it with a single standard-normal vector gp_z:
// hierarchical model with non-centered parameterization of mu
data {
int<lower=1> N; //Number of observations
int<lower=1> J; //Number of groups
int<lower=1> K; //Number of group predictors
int<lower=1> O; //Number of observation predictors
int<lower=1> T; //Number of time indexes.
int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
}
transformed data {
real t_gp_vec[T];
real jitter = 1e-12;
for (t in 1:T)
t_gp_vec[t] = t;
}
parameters {
real<lower=2> nu;
real<lower=0> inv_phi;
vector[O] beta;
vector[K] gamma;
vector[J] group_mu_raw;
real<lower=0> sigma_group_mu;
// GP prior parameters
// Noise component
vector[T] t_noise_raw;
vector[T] gp_z;
real<lower=0> sigma_noise;
// Autoregressive GP Component
real<lower=0> gp_len_ar;
real<lower=0> sigma_gp_ar;
// Long-term Trend GP Component
real<lower=0> gp_len_trend_long;
real<lower=0> sigma_gp_trend_long;
// Short-term Trend GP Component
real<lower=0> gp_len_trend_short;
real<lower=0> sigma_gp_trend_short;
}
transformed parameters {
vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
vector[N] mu =
group_mu[group_idx] +
observation_data * beta;
real phi = 1 / inv_phi;
vector[T] f;
vector[T] gp;
vector[T] t_noise = sigma_noise * t_noise_raw;
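// A sum of independent GPs is a GP whose kernel is the sum of the kernels,
// so the three components can share one covariance matrix, one Cholesky
// decomposition, and one standard-normal vector gp_z.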
{
matrix[T, T] C;
matrix[T, T] L_C;
C = gp_matern32_cov(t_gp_vec, sigma_gp_ar, gp_len_ar);
C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short, gp_len_trend_short);
C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long, gp_len_trend_long);
for (t in 1:T)
C[t,t] += jitter;
L_C = cholesky_decompose(C);
f = L_C * gp_z;
}
// gp is the sum of the weekly noise and the combined smooth trend / autoregressive process f
gp = t_noise + f;
}
model {
nu ~ gamma(2, 0.1);
inv_phi ~ student_t(4, 0, 1);
beta ~ student_t(4, 0, 1);
gamma ~ student_t(4, 0, 1);
group_mu_raw ~ student_t(4, 0, 1);
sigma_group_mu ~ student_t(4, 0, 1);
// GP priors
// Noise component
gp_z ~ normal(0, 1);
sigma_noise ~ normal(0, 1);
t_noise_raw ~ normal(0, 1);
// Autoregressive Component
gp_len_ar ~ inv_gamma(5, 5);
sigma_gp_ar ~ normal(0, 1);
// Short-term Trend Component
gp_len_trend_short ~ inv_gamma(5, 5);
sigma_gp_trend_short ~ normal(0, 1);
// Long-term Trend Component
gp_len_trend_long ~ inv_gamma(5, 5);
sigma_gp_trend_long ~ normal(0, 1);
// likelihood function
response ~ student_t(nu, mu + gp[t_idx], phi);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = student_t_rng(nu, mu[n] + gp[t_idx[n]], phi);
}
}
and the associated out-of-sample draws:
// hierarchical model with non-centered parameterization of mu
data {
int<lower=1> N; //Number of observations
int<lower=0> J; //Number of groups
int<lower=0> K; //Number of group predictors
int<lower=0> O; //Number of observation predictors
int<lower=1> T; //Number of time indexes.
int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
int<lower=0, upper=J> group_idx[N]; // Maps each observation to each group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
int<lower=0> max_group_idx; //Highest group index fit in model
int<lower=0> max_t_idx; //Highest time index fit in model
int N_samples; // number of samples drawn from posterior distributions
real nu[N_samples]; // nu parameter from student t likelihood
real phi[N_samples]; // standard deviation parameter from student t likelihood
matrix[N_samples, O] beta; // covariate matrix draws for observations
matrix[N_samples, K] gamma; // covariate matrix draws for groups
matrix[N_samples, max_group_idx] group_mu; //posterior draws of the group mu parameter
real sigma_group_mu[N_samples]; // posterior draws of the hierarchical variance parameter.
// GP posterior samples
// Noise component
matrix[N_samples, max_t_idx] t_noise;
matrix[N_samples, max_t_idx] gp_z;
real sigma_noise[N_samples];
// Auto-regressive component
real sigma_gp_ar[N_samples];
real gp_len_ar[N_samples];
real sigma_gp_trend_short[N_samples];
real gp_len_trend_short[N_samples];
real sigma_gp_trend_long[N_samples];
real gp_len_trend_long[N_samples];
}
transformed data {
real t_gp_vec[T];
real jitter = 1e-12;
for (t in 1:T)
t_gp_vec[t] = t;
}
parameters {
}
model {
}
generated quantities {
matrix[N_samples, N] y_pred;
matrix[N_samples, T] gp;
// generate the gp vector from the posterior samples.
for(s in 1:N_samples) {
vector[T] f;
vector[T] gp_z_full;
vector[T] t_noise_full;
for(t in 1:max_t_idx){
t_noise_full[t] = t_noise[s, t];
gp_z_full[t] = gp_z[s, t];
}
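// Pad future weeks with fresh standard-normal innovations; the lower-
// triangular Cholesky factor below leaves the in-sample values of f unchanged.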
for(t in max_t_idx+1:T){
t_noise_full[t] = normal_rng(0, sigma_noise[s]);
gp_z_full[t] = normal_rng(0, 1);
}
{
matrix[T, T] C;
matrix[T, T] L_C;
C = gp_matern32_cov(t_gp_vec, sigma_gp_ar[s], gp_len_ar[s]);
C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_short[s], gp_len_trend_short[s]);
C += gp_exp_quad_cov(t_gp_vec, sigma_gp_trend_long[s], gp_len_trend_long[s]);
for (t in 1:T)
C[t, t] += jitter;
L_C = cholesky_decompose(C);
f = L_C * gp_z_full;
}
gp[s] = to_row_vector(t_noise_full + f);
}
for(n in 1:N) {
row_vector[N_samples] group_mu_rep;
row_vector[N_samples] mu;
if (group_idx[n] > max_group_idx) {
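// Group not seen during fitting: draw its intercept from the population
// model (group-level regression plus a fresh student_t(4, 0, 1) random
// effect, matching the prior on group_mu_raw).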
for (s in 1:N_samples) {
group_mu_rep[s] = gamma[s] * to_vector(group_data[group_idx[n]]) + sigma_group_mu[s] * student_t_rng(4, 0, 1);
}
}
else {
group_mu_rep = to_row_vector(group_mu[,group_idx[n]]);
}
mu = group_mu_rep + to_row_vector(beta * to_vector(observation_data[n])) + to_row_vector(gp[,t_idx[n]]);
y_pred[,n] = to_vector(student_t_rng(nu, mu, phi));
}
}