Proposal re comparing models despite some high pareto-k values.
We may be dealing with large datasets and complex models for which there is no simple way to eliminate high pareto-k values. In my work, a typical run-time is at least 24 hours so reloo is not feasible, nor is it practical to greatly extend the sampling in the hopes that all pareto-k < 0.7. loo_moment_match looks promising, but still has problems which the experts are looking at. These issues can arise even with small datasets and relatively simple models, as in
High values of pareto-k can also serve as some kind of diagnostic for outliers. I find the pareto-k outliers are similar to the points detected in mgcv by cooks.distance > T for some threshold.
So currently, we may not want or be able to eliminate all instances of pareto-k > 0.7.
But I wonder if loo_compare can still be used in the presence of such points. The ELPD difference and its SE are both obtained from the pointwise values at loo$pointwise. So, would this strategy, explicit in the code further below, make sense?
If M1 and M2 are models fitted to the same data D with rows indexed by id, put loo1<-loo(M1) and loo2<-loo(M2). Put parx1<-which(loo1$diag$par>0.7) and parx2<-which(loo2$diag$par>0.7). If M1 and M2 are plausible models, both parx1 and parx2 will be small subsets of id. They may overlap or even be the same. Put parx<-union(parx1,parx2). Then all the problems occur in data rows id[parx].
Form LC<-loo_compare(M1,M2) as usual.
Now use the pointwise data to compute LC_xparx from id[-parx].
If both LC and LC_xparx show M1 significantly better than M2, I would like to accept that
M1 is better, despite the presence of some points with pareto-k > 0.7.
#first model is as in
#https://discourse.mc-stan.org/t/loo-moment-match-failure-beta-binomial-example/35510
library(extraDistr)
library(brms)
library(cmdstanr)
options(mc.cores=2, brms.normalize=TRUE,brms.backend="cmdstanr")
library(loo)
set.seed(4)
N=1000
x<-runif(N)
fac1<-sample(LETTERS[1:20],N,replace=TRUE)
fac1<-factor(fac1)
ifac1<-as.integer(fac1)
g<-sample(c("South","West","North","East"),20,replace=TRUE)
g<-factor(g)
fac2<-g[ifac1]
ifac2<-as.integer(fac2)
rn1<-0.5*rnorm(20)
rn2<-rnorm(4)
lp<--0.5+sin(12*x)*exp(-3*x)+rn1[ifac1]+rn2[ifac2]
plot(x,lp,col=ifac1,pch=ifac2)
h<-function(t) (1+exp(-t))^-1
mu<-h(lp)
phi=50
T=1000
alpha<-mu*phi
beta<-(1-mu)*phi
y<-rbbinom(N,T,alpha=alpha,beta=beta)
plot(x,y/T)
#now alter some y values to create outliers
oyt<-order(y/T)
wh<-oyt[998:1000]
x[wh]
y[wh]
y[wh]<-30
fac1[wh]
fac2[wh]
plot(x,y/T)
points(x[wh],y[wh]/T,pch=16,col=2)
bfE1<-bf(y|trials(T)~-1+(1|fac2/fac1)+s(x),family="beta_binomial")
datE1<-data.frame(x,y,T,fac1,fac2)
dim(datE1)
mE1<-brm(bfE1,data=datE1,file="bbtest_E1")
bfE2<-bf(y|trials(T)~-1+(1|fac2/fac1),family="beta_binomial")
datE2<-datE1
mE2<-brm(bfE2,data=datE2,file="bbtest_E2")
looE1<-loo(mE1)
#three high values
plot(looE1)
parx1<-which(looE1$diag$par>0.7)
looE2<-loo(mE2)
#no high values
plot(looE2)
#just below 0.7 threshold
parx2<-which(looE2$diag$par>0.7)
#empty
parx1
parx2
parx<-union(parx1,parx2)
#parx=parx1
L<-length(parx)
LC<-loo_compare(looE1,looE2)
print(LC,digits=4)
sum(looE2$pointwise[,1]-looE1$pointwise[,1])
1000^0.5*sd(looE2$pointwise[,1]-looE1$pointwise[,1])
id<-1:(dim(datE1)[1])
datE1[parx,]
LC_xparx<-LC
LC_xparx[1,3]<-sum(looE1$pointwise[-parx,1])
LC_xparx[1,4]<-(1000-L)^0.5*sd(looE1$pointwise[-parx,1])
LC_xparx[1,5]<-sum(looE1$pointwise[-parx,3])
LC_xparx[1,6]<-(1000-L)^0.5*sd(looE1$pointwise[-parx,3])
LC_xparx[1,7]<-sum(looE1$pointwise[-parx,4])
LC_xparx[1,8]<-(1000-L)^0.5*sd(looE1$pointwise[-parx,4])
LC_xparx[2,1]<-sum(looE2$pointwise[-parx,1]-looE1$pointwise[-parx,1])
LC_xparx[2,2]<-(1000-L)^0.5*sd(looE2$pointwise[-parx,1]-looE1$pointwise[-parx,1])
LC_xparx[2,3]<-sum(looE2$pointwise[-parx,1])
LC_xparx[2,4]<-(1000-L)^0.5*sd(looE2$pointwise[-parx,1])
LC_xparx[2,5]<-sum(looE2$pointwise[-parx,3])
LC_xparx[2,6]<-(1000-L)^0.5*sd(looE2$pointwise[-parx,3])
LC_xparx[2,7]<-sum(looE2$pointwise[-parx,4])
LC_xparx[2,8]<-(1000-L)^0.5*sd(looE2$pointwise[-parx,4])
print(LC)
print(LC_xparx)
print(LC,simplify=FALSE)
print(LC_xparx,simplify=FALSE)
#both models fit better over id[-parx] and the superiority of mE1 is clearer there.
#but mE1 is also better over the full data.
Operating System: linux 5.15.0-91-generic Linux Mint 21.2 Victoria
brms Version: 2.21.0
loo Version: 2.7.0.9000