Short summary of the problem
There is a big literature on detecting outliers in a Bayesian framework, which I’ve not read. But here is a proposal anyway using contour plots. Please point me to something relevant to this approach, or if poss comment on the code directly.
The idea is that exceptional points are those outside a 0.95 (say) contour generated from the fitted model. Two methods are used to check whether a particular point is exceptional.
A) contour plots using the distribution of fitted values and posterior predictions.
B) contour plots using the LOO distribution of predicted values and corresponding samples
The code below uses synthetic binomial data modeled in brms with a beta-binomial distribution, but the same questions could be asked with other data and models.
linux mint 21.2 (Victoria)
brms version 2.20.1
library(brms)
library(loo)
library(MASS)
library(extraDistr)
#synthetic data
set.seed(1)
h<-function(t) (1+exp(-t))^-1
N=100
x<-runif(N)
lp<-0.5+sin(12*x)*exp(-3*x)
p<-h(lp)
plot(x,p)
size=253
#say
T<-rep(size,N)
r=1.8
#say
V<-floor(rbinom(N,size=floor(size/r),prob=p)*r)
plot(x,V/T)
which(x>0.8&V/T<0.55)
points(x[77],(V/T)[77],pch=16,col=2)
#is this an exceptional point?
dat1<-data.frame(x,T,V)
bf1<-bf(V|trials(T)~s(x,k=7),family="beta_binomial")
m1<-brm(bf1,data=dat1,family="beta_binomial",cores = getOption("mc.cores", 2),control=list(adapt_delta=0.99))
summary(m1)
points(x,fitted(m1)[,1]/T,pch=16,cex=0.7,col="blue")
points(x,p,pch=16,cex=0.7,col="seagreen")
#method A
plp<-posterior_linpred(m1)
hplp<-h(plp)
dim(hplp)
range(fitted(m1)[,1]/T-apply(hplp,2,mean))
#virtually identical
id=77
pp<-posterior_predict(m1)
samx<-sample(hplp[,id],4000)
samy<-sample(pp[,id],4000)/T[id]
#sampling to remove correlation between samx and samy
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]
plot(apply(hplp,2,mean),V/T,pch=16,col=rgb(0,0,1,0.3), xlab="fitted",ylab="observed proportion", main="method A",xlim=c(0.5,0.8),ylim=c(0.5,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(mean(hplp[,id]),(V/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)
#outside the contour for 0.75 but inside the contour for 0.95, so not "exceptional"
#now mess up that data point
Va<-V
Va[77]<-V[77]-10
plot(x,Va/T)
points(x[77],(Va/T)[77],pch=16,col=2)
dat1a<-data.frame(x,T,Va)
bf1a<-bf(Va|trials(T)~s(x,k=7),family="beta_binomial")
m1a<-brm(bf1a,data=dat1a,family="beta_binomial",cores = getOption("mc.cores",2),control=list(adapt_delta=0.99))
summary(m1a)
points(x,fitted(m1a)[,1]/T,pch=16,cex=0.7,col="blue")
points(x,p,pch=16,cex=0.7,col="seagreen")
plp1a<-posterior_linpred(m1a)
hplp1a<-h(plp1a)
dim(hplp1a)
range(fitted(m1a)[,1]/T-apply(hplp1a,2,mean))
#virtually identical
pp1a<-posterior_predict(m1a)
samx<-sample(hplp1a[,id],4000)
samy<-sample(pp1a[,id],4000)/T[id]
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]
plot(apply(hplp1a,2,mean),Va/T,pch=16,col=rgb(0,0,1,0.3), xlab="fitted",ylab="observed proportion", main="method A, altered data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(mean(hplp1a[,id]),(Va/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)
#now outside the 0.95 contour, so "exceptional".
#method B
#want something similar but using E_loo
#original (unaltered) data and model
log_ratios <- -1 * log_lik(m1)
dim(log_ratios)
r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:4, each = 1000))
psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2)
eloo<-E_loo(hplp1a,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]
sam<-E_loo(hplp1a,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value
samx<-sam[,id]
mean(samx)
mat1<-as.matrix(m1)
cphi<-which(colnames(mat1)=="phi")
phi<-mat1[1:4000,cphi]
samy<-rbbinom(4000,size=T,alpha=samx*phi,beta=(1-samx)*phi)/T
samy<-sample(samy,4000,replace=F)
plot(samx,samy)
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]
plot(eloov,V/T,pch=16,col=rgb(0,0,1,0.2),xlab="fitted",ylab="observed proportion", main="method B, original data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(eloov[id],(V/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)
#within 0.95 contour
#altered data and model
log_ratios <- -1 * log_lik(m1a)
dim(log_ratios)
r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:4, each = 1000))
psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2)
eloo<-E_loo(hplp1a,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]
sam<-E_loo(hplp1a,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value
samx<-sam[,id]
mean(samx)
mat1a<-as.matrix(m1a)
cphi<-which(colnames(mat1a)=="phi")
phi<-mat1a[1:4000,cphi]
samy<-rbbinom(4000,size=T,alpha=samx*phi,beta=(1-samx)*phi)/T
samy<-sample(samy,4000,replace=F)
plot(samx,samy)
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]
plot(eloov,Va/T,pch=16,col=rgb(0,0,1,0.2),xlab="fitted",ylab="observed proportion", main="method B, altered data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(eloov[id],(Va/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)
#exceptional