hi
I’ve looked at this now using mixis, as suggested.
The vignette Mixture IS leave-one-out cross-validation for high-dimensional Bayesian models • loo is helpful, but I couldn’t run it as data(voice) gave an error. But hopefully the code below is the right idea in this example.
For both mE1 and mE2 I used reloo to obtain the true values of elpd at the exceptional points, and then plotted against the pointwise loo-psis and elpd_mixis for those points. I also compared the pointwise loo-psis and elpd_mixis across the rest of the data, and globally. I compared the results from loo_compare(looE1,looE2) with the values obtained using elpd_mixis for the two models.
Although mixis requires an additional fit of the modified code, the theory (glug) seems to put mixis on a firmer footing, with guaranteed asymptotic finite variance. psis is faster but the theory breaks down when pareto-k > 0.7 and particularly when pareto-k > 1. Despite that, loo-psis and mixis give similar answers in this example.
Lastly (not shown below), running mixis in cmdstanr rather than rstan gave unexpected output - maybe my error but has anyone else had problems?
thanks
Greg
#obtain mE1,looE1,mE2,looE2 as previously
wh<-which(looE1$diag$par>0.7)
#450 634 677
relooE1<-reloo(mE1,looE1,k_threshold=0.7)
save(relooE1,file="relooE1.rds")
#load("relooE1.rds",verbose=TRUE)
library(rstan)
library(matrixStats)
scod<-stancode(mE1)
cat(scod,file="scod.txt")
#now alter and save as "scodm.stan"
#the altered portion is at the end, and is shown immediately below
#it includes the mixis line and also generates log_lik
#####
*vector[N] log_lik;*
* vector[N] mu = rep_vector(0.0, N);*
* mu += Xs * bs + Zs_1_1 * s_1_1;*
* for (n in 1:N) {*
* // add more terms to the linear predictor*
* mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n];*
* }*
* mu = inv_logit(mu);*
* for (n in 1:N) {*
* log_lik[n] = beta_binomial_lpmf(Y[n] | trials[n], mu[n] * phi, (1 - mu[n]) * phi);*
* }*
* target += sum(log_lik);*
* target += log_sum_exp(-log_lik);*
* }*
* // priors including constants*
* target += lprior;*
* target += std_normal_lpdf(zs_1_1);*
* target += std_normal_lpdf(z_1[1]);*
* target += std_normal_lpdf(z_2[1]);*
*}*
*generated quantities {*
* vector[N] log_lik;*
* vector[N] mu = rep_vector(0.0, N);*
* mu += Xs * bs + Zs_1_1 * s_1_1;*
* for (n in 1:N) {*
* mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n];*
* }*
* mu = inv_logit(mu);*
* for (n in 1 : N) {*
* log_lik[n] = beta_binomial_lpmf(Y[n] | trials[n], mu[n] * phi, (1 - mu[n]) * phi);*
* }*
*}*
#####
fitE1ms<-stan(file="scodm.stan",data=sdatE1,cores=getOption("mc.cores",2))
save(fitE1ms,file="fitE1ms.rds")
#load("fitE1ms.rds",verbose=TRUE)
log_lik_mix <- extract(fitE1ms)$log_lik
l_common_mix <- rowLogSumExps(-log_lik_mix)
log_weights <- -log_lik_mix - l_common_mix
elpd_mixis <- logSumExp(-l_common_mix) - rowLogSumExps(t(log_weights))
plot(relooE1$pointwise[wh,1],relooE1$pointwise[wh,1],ylim=c(-77,-65),xlim=c(-77,-65),ylab="elpd")
lines(c(-80,-60),c(-80,-60),col=2)
points(relooE1$pointwise[wh,1],elpd_mixis[wh],pch=16)
points(relooE1$pointwise[wh,1],looE1$pointwise[wh],pch=16,col=2)
points(-76,-66,pch=16,col=2)
points(-76,-65.5,pch=16)
text(-75.5,-66,pos=4,label="looE1")
text(-75.5,-65.5,pos=4,label="elpd_mixis")
#as expected, elpd_mixis is (slightly) closer to the true values at the exceptional points
looE1$est
Estimate SE
elpd_loo -5887.67703 111.57827
p_loo 45.41424 15.40389
looic 11775.35406 223.15654
sum(elpd_mixis)
# -5887.423
1000^0.5*sd(looE1$pointwise[,1])
# 111.5783
1000^0.5*sd(elpd_mixis)
# 111.8122
plot(elpd_mixis[-wh],looE1$pointwise[-wh,1],col=rgb(0,0,1,0.5))
lines(c(-10,-5),c(-10,-5),col=2)
#with looE2, all pareto-k < 0.7
#but use reloo anyway to obtain the true values
relooE2<-reloo(mE2,looE2,k_threshold=0.65,cores=getOption("mc.cores",2),
control=list(adapt_delta=0.95))
save(relooE2,file="relooE2.rds")
#load("relooE2.rds",verbose=TRUE)
scod2<-stancode(mE2)
cat(scod2,file="scod2.txt")
#now alter analogous to scodm.stan (not shown) and save as "scod2m.stan"
sdatE2<-sdatE1
fitE2ms<-stan(file="scod2m.stan",data=sdatE2,cores=getOption("mc.cores",2),
control=list(adapt_delta=0.95))
save(fitE2ms,file="fitE2ms.rds")
#load("fitE2ms.rds",verbose=TRUE)
log_lik_mix2 <- extract(fitE2ms)$log_lik
l_common_mix2 <- rowLogSumExps(-log_lik_mix2)
log_weights2 <- -log_lik_mix2 - l_common_mix2
elpd_mixis2 <- logSumExp(-l_common_mix2) - rowLogSumExps(t(log_weights2))
plot(relooE2$pointwise[wh,1],relooE2$pointwise[wh,1],ylim=c(-45,-35),xlim=c(-45,-35),ylab="elpd")
lines(c(-45,-35),c(-45,-35),col=2)
points(relooE2$pointwise[wh,1],elpd_mixis2[wh],pch=16)
points(relooE2$pointwise[wh,1],looE2$pointwise[wh],pch=16,col=2)
points(-44,-36,pch=16,col=2)
points(-44,-35.5,pch=16)
text(-43.5,-36,pos=4,label="looE2")
text(-43.5,-35.5,pos=4,label="elpd_mixis2")
#both loo-psis and mixis give values very close to the true elpd
#use elpd_mixis and elpd_mixis2 to compare mE1 with mE2
sum(elpd_mixis-elpd_mixis2)
# 173.5925
1000^0.5*sd(elpd_mixis-elpd_mixis2)
# 54.04471
loo_compare(looE1,looE2)
elpd_diff se_diff
mE1 0.0 0.0
mE2 -172.5 53.8
#very similar conclusions from model comparison by loo-psis and mixis
sum(elpd_mixis-looE1$pointwise[,1])
1000^0.5*sd(elpd_mixis-looE1$pointwise[,1])
#no significant difference between pointwise estimates
sum(elpd_mixis2-looE2$pointwise[,1])
1000^0.5*sd(elpd_mixis2-looE2$pointwise[,1])
#no significant difference between pointwise estimates
plot(elpd_mixis2[-wh],looE2$pointwise[-wh,1],col=rgb(0,0,1,0.5))
lines(c(-10,-5),c(-10,-5),col=2)
Edit by Aki: added code block backticks for easier reading of the code