I am setting up a cross-validation loop in cmdstanr, inspired by the examples in @bnicenboim 's book: 16.6 Cross-validation in Stan | An Introduction to Bayesian Data Analysis for Cognitive Science
and I am having issues extracting the log likelihood for the validation set, in the example below:
log_pd_random_kfold[, d$fold == k] <- extract_log_lik(gq_random_test)
which gives: “Error: Not a stanfit object.” Nor can I figure out where in gq_random_test the log_lik could be.
Data: d_random.csv (shared via Dropbox)
Reproducible code:
# Read the simulated choice data; expected columns include `agent` and `choice`.
d <- read_csv("d_random.csv")

# Multilevel model of a random bias: each agent i has a logit-scale bias
# theta[i] drawn from a population distribution normal(thetaM, thetaSD).
stan_file <- write_stan_file("
// This Stan model infers a random bias from sequences of 1s and 0s (heads and tails). Now multilevel
//
functions{
  // Draw from normal(mu, sigma) truncated below at lb (inverse-CDF method).
  real normal_lb_rng(real mu, real sigma, real lb) {
    real p = normal_cdf(lb | mu, sigma); // cdf for bounds
    real u = uniform_rng(p, 1);
    return (sigma * inv_Phi(u)) + mu; // inverse cdf for value
  }
}
// The input (data) for the model. n of trials and h of heads
data {
  int<lower = 1> trials;
  int<lower = 1> agents;
  array[trials, agents] int h;
}
// The parameters accepted by the model.
parameters {
  real thetaM;               // population mean bias (logit scale)
  real<lower = 0> thetaSD;   // population SD of the bias
  array[agents] real theta;  // per-agent bias (logit scale)
}
// The model to be estimated.
model {
  target += normal_lpdf(thetaM | 0, 1);
  // Half-normal(0, .3) prior on thetaSD; the lccdf term renormalizes
  // for the truncation at zero.
  target += normal_lpdf(thetaSD | 0, .3) -
    normal_lccdf(0 | 0, .3);
  // Hierarchical prior: per-agent logit-scale biases around the population mean.
  target += normal_lpdf(theta | thetaM, thetaSD);
  for (i in 1:agents)
    target += bernoulli_logit_lpmf(h[,i] | theta[i]);
}
generated quantities{
  real thetaM_prior;
  real<lower=0> thetaSD_prior;
  real<lower=0, upper=1> theta_prior;     // already on the probability scale
  real<lower=0, upper=1> theta_posterior; // already on the probability scale
  array[trials,agents] int<lower=0, upper = trials> prior_preds;
  array[trials,agents] int<lower=0, upper = trials> posterior_preds;
  array[trials, agents] real log_lik;     // pointwise log-lik for loo/kfold
  thetaM_prior = normal_rng(0,1);
  thetaSD_prior = normal_lb_rng(0,0.3,0);
  theta_prior = inv_logit(normal_rng(thetaM_prior, thetaSD_prior));
  theta_posterior = inv_logit(normal_rng(thetaM, thetaSD));
  for (i in 1:agents){
    // BUG FIX: theta_prior is already a probability (inv_logit applied above);
    // the original applied inv_logit a second time, squashing every prior
    // predictive rate into roughly (0.5, 0.73).
    // NOTE(review): binomial_rng(trials, p) is a scalar; assigning it to the
    // array slice prior_preds[,i] repeats one count per trial — confirm this
    // is the intended prior predictive quantity.
    prior_preds[,i] = binomial_rng(trials, theta_prior);
    posterior_preds[,i] = binomial_rng(trials, inv_logit(theta[i]));
    for (t in 1:trials){
      log_lik[t,i] = bernoulli_logit_lpmf(h[t,i] | theta[i]);
    }
  }
}
")
# Compile the model (pedantic = TRUE surfaces common Stan mistakes).
# BUG FIX: the original call was missing its closing parenthesis,
# which is a syntax error that swallows the following lines.
mod_random <- cmdstan_model(
  stan_file,
  cpp_options = list(stan_threads = TRUE),
  pedantic = TRUE
)
# Assign each agent to one of 10 folds (all rows of an agent share a fold),
# so whole agents are held out — grouped k-fold CV.
d$fold <- kfold_split_grouped(K = 10, x = d$agent)

# BUG FIX: log_pd_random_kfold was indexed-assigned inside the loop but never
# created. Pre-allocate: one row per posterior draw (chains * iter_sampling),
# one column per observation of d.
# NOTE(review): each fold fills sum(d$fold == k) columns; with `trials - 1`
# rows per agent sent to Stan the log_lik width may not match — confirm the
# 2:120 / trials - 1 subsetting against the layout of d.
log_pd_random_kfold <- matrix(NA_real_, nrow = 2 * 1000, ncol = nrow(d))

for (k in unique(d$fold)) {
  # Training set for fold k
  d_train <- d %>% filter(fold != k)
  ## Create the data: one column per agent, one row per trial
  d1_train <- d_train %>%
    subset(select = c(agent, choice)) %>%
    mutate(row = row_number()) %>%
    pivot_wider(names_from = agent, values_from = choice)
  train_n <- length(unique(d_train$agent))
  data_train <- list(
    # NOTE(review): assumes a global `trials` defined earlier — confirm.
    trials = trials - 1,
    # BUG FIX: the original passed the global `agents`, which counts ALL
    # agents and so disagrees with the number of columns of h below.
    agents = train_n,
    # BUG FIX: 2:(train_n) dropped the last training agent; column 1 is the
    # `row` helper, so the agent columns are 2:(train_n + 1) — matching the
    # test-set subsetting below.
    h = as.matrix(d1_train[2:120, 2:(train_n + 1)]))
  # Held-out set for fold k
  d_test <- d %>%
    filter(fold == k)
  d1_test <- d_test %>%
    subset(select = c(agent, choice)) %>%
    mutate(row = row_number()) %>%
    pivot_wider(names_from = agent, values_from = choice)
  test_n <- length(unique(d_test$agent))
  ls_test <- list(
    trials = trials - 1,
    agents = test_n,
    h = as.matrix(d1_test[2:120, 2:(test_n + 1)]))
  # Train the model on the training folds.
  fit_random_train <- mod_random$sample(
    # BUG FIX: the original passed `data = data`, i.e. the base-R utils
    # function `data`, not the list built above.
    data = data_train,
    seed = 123,
    chains = 2,
    parallel_chains = 2,
    threads_per_chain = 2,
    iter_warmup = 1000,
    iter_sampling = 1000,
    refresh = 1000,
    max_treedepth = 20,
    adapt_delta = 0.99
  )
  # Generated quantities based on the posterior from the training set
  # and the data from the held-out set.
  # NOTE(review): the fitted theta has `train_n` entries while ls_test
  # declares `agents = test_n`; generate_quantities may reject the dimension
  # mismatch — the held-out agents need their own prediction scheme (e.g.
  # population-level theta_posterior) — confirm against the book's example.
  gq_random_test <- mod_random$generate_quantities(
    fit_random_train,
    data = ls_test,
    threads_per_chain = 2)
  # BUG FIX: loo::extract_log_lik() only accepts rstan `stanfit` objects,
  # which is exactly the reported "Error: Not a stanfit object." For a
  # cmdstanr fit, pull the draws directly: format = "matrix" yields an
  # (iterations * chains) x (trials * agents) matrix of log_lik values,
  # i.e. the pointwise predictive density loo expects.
  log_pd_random_kfold[, d$fold == k] <- gq_random_test$draws(
    "log_lik", format = "matrix")
}
# Expected log pointwise predictive density over all 10 folds.
elpd_pupil_kfold <- elpd(log_pd_random_kfold)