I’m trying to learn how to do kfold crossvalidation with loo
following the example from appendix A.4. Suppose my data looks like this:
library(dplyr)
set.seed(9782)
N < 1000
df < data.frame(x = rnorm(n = N, mean = 10, sd = 2),
e = rnorm(n=N, mean = 0, sd=2)) %>%
mutate(y = 10 + 0.5*x + e) %>% select(e)
train < sample_frac(df, 0.7)
sid < as.numeric(rownames(train))
test < df[sid,]
this is my stan code:
data {
int<lower=1> N_t ; // (training) number of observations
vector[N_t] x_t ; // (training) x
vector[N_t] y_t ; // (training) outcome
int<lower=1> N_h ; // (houldout) number of observations
vector[N_h] x_h ; // (houldout) x
vector[N_h] y_h ; // (houldout) outcome
}
parameters {
real alpha;
real beta;
real<lower=0> sigma;
}
model {
alpha ~ normal(0,10);
beta ~ normal(0,10);
sigma ~ normal(0,10);
y_t ~ normal(alpha + beta*x_t, sigma);
}
generated quantities {
vector[N_h] y_sim_h;
vector[N_h] log_lik_h;
vector[N_t] log_lik_t;
for (n in 1:(N_h)) {
y_sim_h[n] = normal_rng(alpha + beta*x_h[n], sigma);
log_lik_h[n] = normal_lcdf(y_h[n]  alpha + beta*x_h[n], sigma);
}
for (n in 1:(N_t)) {
log_lik_t[n] = normal_lcdf(y_t[n]  alpha + beta*x_t[n], sigma);
}
}
this is how i fit the model
library(rstan)
stan_list < list(N_t = nrow(train),
x_t = train$x,
y_t = train$y,
N_h = nrow(test),
x_h = test$x,
y_h = test$y)
fit < sampling(model, data = stan_list)
print(fit, pars = c("alpha", "beta", "sigma"))
this the density overlay with bayesplot
library(bayesplot)
y_sim_h < as.data.frame(fit, pars="y_sim_h")
ppc_dens_overlay(y = test$y,
yrep = as.matrix(y_sim_h[sample(x = 1:nrow(y_sim_h), size = 50),]))
and this is how I use the loo
package:
library(loo)
# Extract pointwise loglikelihood and compute LOO
log_lik_1 < extract_log_lik(fit, parameter_name = "log_lik_h")
loo_1 < loo(log_lik_1)
print(loo_1)
#
# marginal predictive check using LOO probability integral transform
psis < psislw(log_lik_1)
ppc_loo_pit_overlay(test$y, as.matrix(y_sim_h),
lw = psis$lw_smooth)
I have a couple of questions:

Is what i did for my basic example right or am I missing something? For example, should I be running the model again but fliping what is training and test?

Can you point me to an example in which the data is partitioned in 3 instead of 2?
Thanks!