Hello,
I was wondering if anyone had any experience with using the posterior samples from a stan model to generate out-of-sample replicated data for models involving gaussian processes:
I was able to create the following model:
// hierarchical model with non-centered parameterization of mu
data {
int<lower=1> N; //Number of observations
int<lower=1> J; //Number of groups
int<lower=1> K; //Number of group predictors
int<lower=1> O; //Number of observation predictors
int<lower=1> T; //Number of time indexes.
int<lower=1, upper=T> t_idx[N]; // Maps each observation to each time period
int<lower=1, upper=J> group_idx[N]; // Maps each observation to each group
real response[N]; // The outcome of the data generation process
matrix[N,O] observation_data; //covariate data of the observations
matrix[J,K] group_data; //covariate data of the groups
}
transformed data {
real t_gp_vec[T];
for (t in 1:T)
t_gp_vec[t] = t;
}
parameters {
real<lower=2> nu;
real<lower=0> inv_phi;
vector[O] beta;
vector[K] gamma;
vector[J] group_mu_raw;
real<lower=0> sigma_group_mu;
// GP prior parameters
vector[T] gp_raw;
real<lower=0> gp_len;
real<lower=0> sigma_gp;
vector[T] gp_raw_long;
real<lower=0> gp_len_long;
real<lower=0> sigma_gp_long;
real<lower=0> sigma_noise;
vector[T] t_noise_raw;
}
transformed parameters {
vector[J] group_mu = group_data * gamma + sigma_group_mu * group_mu_raw;
vector[N] mu =
group_mu[group_idx] +
observation_data * beta;
real phi = 1 / inv_phi;
vector[T] gp_exp_quad;
vector[T] gp_exp_quad_long;
vector[T] gp;
vector[T] t_noise = sigma_noise * t_noise_raw;
{
matrix[T, T] C = cov_exp_quad(t_gp_vec, sigma_gp, gp_len);
// real var_noise = square(sigma_noise);
matrix[T, T] L_C;
for (t in 1:T)
C[t,t] += 1e-12;
L_C = cholesky_decompose(C);
gp_exp_quad = L_C * gp_raw;
}
{
matrix[T, T] C_long = cov_exp_quad(t_gp_vec, sigma_gp_long, gp_len_long);
// real var_noise = square(sigma_noise);
matrix[T, T] L_C_long;
for (t in 1:T)
C_long[t,t] += 1e-12;
L_C_long = cholesky_decompose(C_long);
gp_exp_quad_long = L_C_long * gp_raw_long;
}
// gp is sum of monthly noise and the smoothly varying process
gp = t_noise + gp_exp_quad + gp_exp_quad_long;
}
model {
nu ~ gamma(2, 0.1);
inv_phi ~ student_t(4, 0, 1);
beta ~ student_t(4, 0, 1);
gamma ~ student_t(4, 0, 1);
group_mu_raw ~ student_t(4, 0, 1); // std_normal()
sigma_group_mu ~ student_t(4, 0, 1);
// GP priors
gp_raw ~ normal(0, 1);
gp_len ~ gamma(10, 2);
sigma_gp ~ normal(0, 1);
// GP priors
gp_raw_long ~ normal(0, 1);
gp_len_long ~ gamma(10, 2);
sigma_gp_long ~ normal(0, 1);
sigma_noise ~ normal(0, 1);
t_noise_raw ~ normal(0, 1);
// likelihood function
response ~ student_t(nu, mu + gp[t_idx], phi);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = student_t_rng(nu, mu[n] + gp[t_idx[n]], phi);
}
}
which will generate the replicated data for the in-sample data.
I was trying to use python to generate the out-of-sample data since using stan seemed prohibitively slow for some reason. See here for more details: Generated Quantities for prediction data
Here is the code I was using with Python to generate replicated data:
def sqe_kernel(X1, X2, l=1.0, sigma_f=1.0):
'''
Isotropic squared exponential kernel. Computes
a covariance matrix from points in X1 and X2.
Args:
X1: Array of m points (m x d).
X2: Array of n points (n x d).
Returns:
Covariance matrix (m x n).
'''
sqdist = np.sum(X1**2, 1).reshape(-1, 1) + np.sum(X2**2, 1) - 2 * np.dot(X1, X2.T)
return sigma_f**2 * np.exp(-0.5 / l**2 * sqdist)
def generate_replicated_data_gp(model_data, posterior_distribution):
"""Given the samples from the posterior_distribution and the model_data, create
the forward sampled likelihood data given the model."""
max_group_idx = model_data['max_group_idx']
group_idx = model_data['group_idx']
t_idx = model_data['t_idx']
max_t_idx = model_data['max_t_idx']
T = model_data['T']
group_mu = posterior_distribution['group_mu']
sigma_group_mu = posterior_distribution['sigma_group_mu']
nu = posterior_distribution['nu']
phi = posterior_distribution['phi']
beta = posterior_distribution['beta']
gamma = posterior_distribution['gamma']
observation_data = model_data['observation_data']
group_data = model_data['group_data']
# Calculate the group mean for the missing data, i.e. draw from prior distribution for random effect, but
# use the covariate coefficients.
missing_groups = group_data[max_group_idx:,]
missing_group_mus = []
for missing_group in missing_groups:
missing_group_mu = np.dot(gamma, missing_group.T) + scipy.stats.t.rvs(df=4, loc=0, scale=sigma_group_mu)
missing_group_mus.append(missing_group_mu)
missing_group_mus = np.array(missing_group_mus).T
# Combine the known group means with the missing group means.
if missing_group_mus.size > 0:
group_mu = np.concatenate([group_mu, missing_group_mus], axis=1)
assert group_mu.shape[1] == group_data.shape[0]
# Gaussian Process replication
N_samples = posterior_distribution['mu'].shape[0]
t = np.arange(T) + 1
t = t.reshape(-1, 1)
t_noise = posterior_distribution['t_noise']
sigma_noise = posterior_distribution['sigma_noise']
gp_raw = posterior_distribution['gp_raw']
gp_raw_long = posterior_distribution['gp_raw_long']
sigma_gp_long = posterior_distribution['sigma_gp_long']
gp_len_long = posterior_distribution['gp_len_long']
sigma_gp = posterior_distribution['sigma_gp']
gp_len = posterior_distribution['gp_len']
# For the new t idx, make draws from prior.
new_t_idx = t_idx.max() - max_t_idx
if new_t_idx:
for i in range(new_t_idx):
new_t_noise = scipy.stats.norm.rvs(loc=0, scale=sigma_noise).reshape(-1, 1)
t_noise = np.concatenate([t_noise, new_t_noise], axis=1)
new_gp_raw_long = scipy.stats.norm.rvs(loc=0, scale=1, size=(N_samples, new_t_idx))
gp_raw_long = np.concatenate([gp_raw_long, new_gp_raw_long], axis=1)
new_gp_raw = scipy.stats.norm.rvs(loc=0, scale=1, size=(N_samples, new_t_idx))
gp_raw = np.concatenate([gp_raw, new_gp_raw], axis=1)
gp_exp_quad_long = []
for i in range(N_samples):
l = gp_len_long[i]
s = sigma_gp_long[i]
norm = gp_raw_long[i]
K = sqe_kernel(t, t, l=l, sigma_f=s)
gp_exp_quad_long.append(np.dot(norm, K))
gp_exp_quad_long = np.array(gp_exp_quad_long)
gp_exp_quad = []
for i in range(N_samples):
l = gp_len[i]
s = sigma_gp[i]
norm = gp_raw[i]
K = sqe_kernel(t, t, l=l, sigma_f=s)
gp_exp_quad.append(np.dot(norm, K))
gp_exp_quad = np.array(gp_exp_quad)
gp = t_noise + gp_exp_quad + gp_exp_quad_long
mu = group_mu[:,group_idx-1] + np.dot(beta, observation_data.T) + gp[:,t_idx-1]
y = []
for i, replication in enumerate(mu):
y.append(scipy.stats.t.rvs(df=nu[i], loc=replication, scale=phi[i]))
y = np.array(y)
return y
However, using this code seems to severely underestimate the variance compared to the stan replication for in-sample data.
Is anyone familiar with gaussian processes enough to detect where I went wrong?