Contour plots to check for exceptional points

Short summary of the problem
There is a big literature on detecting outliers in a Bayesian framework, which I’ve not read. But here is a proposal anyway using contour plots. Please point me to something relevant to this approach, or if poss comment on the code directly.
The idea is that exceptional points are those outside a 0.95 (say) contour generated from the fitted model. Two methods are used to check whether a particular point is exceptional.

A) contour plots using the distribution of fitted values and posterior predictions.
B) contour plots using the LOO distribution of predicted values and corresponding samples

The code below uses synthetic binomial data modeled in brms with a beta-binomial distribution, but the same questions could be asked with other data and models.

linux mint 21.2 (Victoria)
brms version 2.20.1

library(brms)
library(loo)
library(MASS)
library(extraDistr)

#synthetic data
set.seed(1)
h<-function(t) (1+exp(-t))^-1
N=100
x<-runif(N)
lp<-0.5+sin(12*x)*exp(-3*x)
p<-h(lp)
plot(x,p)
size=253
#say
T<-rep(size,N)
r=1.8
#say
V<-floor(rbinom(N,size=floor(size/r),prob=p)*r)
plot(x,V/T)
which(x>0.8&V/T<0.55)
points(x[77],(V/T)[77],pch=16,col=2)
#is this an exceptional point?

dat1<-data.frame(x,T,V)
bf1<-bf(V|trials(T)~s(x,k=7),family="beta_binomial")
m1<-brm(bf1,data=dat1,family="beta_binomial",cores = getOption("mc.cores", 2),control=list(adapt_delta=0.99))
summary(m1)

points(x,fitted(m1)[,1]/T,pch=16,cex=0.7,col="blue")
points(x,p,pch=16,cex=0.7,col="seagreen")

#method A

plp<-posterior_linpred(m1)
hplp<-h(plp)
dim(hplp)
range(fitted(m1)[,1]/T-apply(hplp,2,mean))
#virtually identical

id=77
pp<-posterior_predict(m1)
samx<-sample(hplp[,id],4000)
samy<-sample(pp[,id],4000)/T[id]
#sampling to remove correlation between samx and samy
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(apply(hplp,2,mean),V/T,pch=16,col=rgb(0,0,1,0.3), xlab="fitted",ylab="observed proportion", main="method A",xlim=c(0.5,0.8),ylim=c(0.5,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(mean(hplp[,id]),(V/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#outside the contour for 0.75 but inside the contour for 0.95, so not "exceptional"

#now mess up that data point

Va<-V
Va[77]<-V[77]-10
plot(x,Va/T)
points(x[77],(Va/T)[77],pch=16,col=2)

dat1a<-data.frame(x,T,Va)
bf1a<-bf(Va|trials(T)~s(x,k=7),family="beta_binomial")
m1a<-brm(bf1a,data=dat1a,family="beta_binomial",cores = getOption("mc.cores",2),control=list(adapt_delta=0.99))
summary(m1a)

points(x,fitted(m1a)[,1]/T,pch=16,cex=0.7,col="blue")
points(x,p,pch=16,cex=0.7,col="seagreen")

plp1a<-posterior_linpred(m1a)
hplp1a<-h(plp1a)
dim(hplp1a)
range(fitted(m1a)[,1]/T-apply(hplp1a,2,mean))
#virtually identical

pp1a<-posterior_predict(m1a)
samx<-sample(hplp1a[,id],4000)
samy<-sample(pp1a[,id],4000)/T[id]
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(apply(hplp1a,2,mean),Va/T,pch=16,col=rgb(0,0,1,0.3), xlab="fitted",ylab="observed proportion", main="method A, altered data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(mean(hplp1a[,id]),(Va/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#now outside the 0.95 contour, so "exceptional".


#method B
#want something similar but using E_loo

#original (unaltered) data and model

log_ratios <- -1 * log_lik(m1)
dim(log_ratios)
r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:4, each = 1000))
psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2)

eloo<-E_loo(hplp1a,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]

sam<-E_loo(hplp1a,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value
samx<-sam[,id]
mean(samx)

mat1<-as.matrix(m1)
cphi<-which(colnames(mat1)=="phi")
phi<-mat1[1:4000,cphi]

samy<-rbbinom(4000,size=T,alpha=samx*phi,beta=(1-samx)*phi)/T
samy<-sample(samy,4000,replace=F)
plot(samx,samy)

den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(eloov,V/T,pch=16,col=rgb(0,0,1,0.2),xlab="fitted",ylab="observed proportion", main="method B, original data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(eloov[id],(V/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#within 0.95 contour

#altered data and model

log_ratios <- -1 * log_lik(m1a)
dim(log_ratios)
r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:4, each = 1000))
psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2)

eloo<-E_loo(hplp1a,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]

sam<-E_loo(hplp1a,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value
samx<-sam[,id]
mean(samx)

mat1a<-as.matrix(m1a)
cphi<-which(colnames(mat1a)=="phi")
phi<-mat1a[1:4000,cphi]

samy<-rbbinom(4000,size=T,alpha=samx*phi,beta=(1-samx)*phi)/T
samy<-sample(samy,4000,replace=F)
plot(samx,samy)

den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(eloov,Va/T,pch=16,col=rgb(0,0,1,0.2),xlab="fitted",ylab="observed proportion", main="method B, altered data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(eloov[id],(Va/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#exceptional

nb the “exceptional” point with the altered data still has pareto<0.7

fixing code in the section of Method B re the original (unaltered) data

I can see you are using brms here but I am not quite sure what you are looking for. Does the brms model work? Are you looking for feedback on the method you are using for plotting? This topic might get more responses on something like https://stats.stackexchange.com/

thanks Ara

there’s no problem I can see with the (corrected) code per se, i.e. it runs ok. The questions are: a) is this method of identifying exceptional points covered in the literature on outliers or similar to something in the literature, if so please point me to it; b) does it, and particularly method B, make sense to people here?

re method B, with the data I’m actually analysing rather than the synthetic example, the E_loo quantiles are not always well behaved. Can people point me to anything on the behaviour of these quantiles? The quantiles appear to be the sum of exp(lw) over the portion of hplp below a given value, where lw are the normalised log weights. The odd behaviour occurs when exp(lw) = 0 in a particular range of hplp. If I can construct a synthetic example to illustrate this, I’ll send it. But maybe this is already discussed somewhere?

thanks

Greg