Contour plots to check for exceptional points

gregd · January 15, 2024, 10:20pm

Short summary of the problem
There is a big literature on detecting outliers in a Bayesian framework, which I’ve not read. But here is a proposal anyway using contour plots. Please point me to something relevant to this approach, or if poss comment on the code directly.
The idea is that exceptional points are those outside a 0.95 (say) contour generated from the fitted model. Two methods are used to check whether a particular point is exceptional.

A) contour plots using the distribution of fitted values and posterior predictions.
B) contour plots using the LOO distribution of predicted values and corresponding samples

The code below uses synthetic binomial data modeled in brms with a beta-binomial distribution, but the same questions could be asked with other data and models.

linux mint 21.2 (Victoria)
brms version 2.20.1

library(brms)
library(loo)
library(MASS)
library(extraDistr)

#synthetic data
set.seed(1)
h<-function(t) (1+exp(-t))^-1
N=100
x<-runif(N)
lp<-0.5+sin(12*x)*exp(-3*x)
p<-h(lp)
plot(x,p)
size=253
#say
T<-rep(size,N)
r=1.8
#say
V<-floor(rbinom(N,size=floor(size/r),prob=p)*r)
plot(x,V/T)
which(x>0.8&V/T<0.55)
points(x[77],(V/T)[77],pch=16,col=2)
#is this an exceptional point?

dat1<-data.frame(x,T,V)
bf1<-bf(V|trials(T)~s(x,k=7),family="beta_binomial")
m1<-brm(bf1,data=dat1,family="beta_binomial",cores = getOption("mc.cores", 2),control=list(adapt_delta=0.99))
summary(m1)

points(x,fitted(m1)[,1]/T,pch=16,cex=0.7,col="blue")
points(x,p,pch=16,cex=0.7,col="seagreen")

#method A

plp<-posterior_linpred(m1)
hplp<-h(plp)
dim(hplp)
range(fitted(m1)[,1]/T-apply(hplp,2,mean))
#virtually identical

id=77
pp<-posterior_predict(m1)
samx<-sample(hplp[,id],4000)
samy<-sample(pp[,id],4000)/T[id]
#sampling to remove correlation between samx and samy
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(apply(hplp,2,mean),V/T,pch=16,col=rgb(0,0,1,0.3), xlab="fitted",ylab="observed proportion", main="method A",xlim=c(0.5,0.8),ylim=c(0.5,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(mean(hplp[,id]),(V/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#outside the contour for 0.75 but inside the contour for 0.95, so not "exceptional"

#now mess up that data point

Va<-V
Va[77]<-V[77]-10
plot(x,Va/T)
points(x[77],(Va/T)[77],pch=16,col=2)

dat1a<-data.frame(x,T,Va)
bf1a<-bf(Va|trials(T)~s(x,k=7),family="beta_binomial")
m1a<-brm(bf1a,data=dat1a,family="beta_binomial",cores = getOption("mc.cores",2),control=list(adapt_delta=0.99))
summary(m1a)

points(x,fitted(m1a)[,1]/T,pch=16,cex=0.7,col="blue")
points(x,p,pch=16,cex=0.7,col="seagreen")

plp1a<-posterior_linpred(m1a)
hplp1a<-h(plp1a)
dim(hplp1a)
range(fitted(m1a)[,1]/T-apply(hplp1a,2,mean))
#virtually identical

pp1a<-posterior_predict(m1a)
samx<-sample(hplp1a[,id],4000)
samy<-sample(pp1a[,id],4000)/T[id]
den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(apply(hplp1a,2,mean),Va/T,pch=16,col=rgb(0,0,1,0.3), xlab="fitted",ylab="observed proportion", main="method A, altered data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(mean(hplp1a[,id]),(Va/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#now outside the 0.95 contour, so "exceptional".


#method B
#want something similar but using E_loo

#original (unaltered) data and model

log_ratios <- -1 * log_lik(m1)
dim(log_ratios)
r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:4, each = 1000))
psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2)

eloo<-E_loo(hplp1a,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]

sam<-E_loo(hplp1a,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value
samx<-sam[,id]
mean(samx)

mat1<-as.matrix(m1)
cphi<-which(colnames(mat1)=="phi")
phi<-mat1[1:4000,cphi]

samy<-rbbinom(4000,size=T,alpha=samx*phi,beta=(1-samx)*phi)/T
samy<-sample(samy,4000,replace=F)
plot(samx,samy)

den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(eloov,V/T,pch=16,col=rgb(0,0,1,0.2),xlab="fitted",ylab="observed proportion", main="method B, original data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(eloov[id],(V/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#within 0.95 contour

#altered data and model

log_ratios <- -1 * log_lik(m1a)
dim(log_ratios)
r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:4, each = 1000))
psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2)

eloo<-E_loo(hplp1a,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]

sam<-E_loo(hplp1a,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value
samx<-sam[,id]
mean(samx)

mat1a<-as.matrix(m1a)
cphi<-which(colnames(mat1a)=="phi")
phi<-mat1a[1:4000,cphi]

samy<-rbbinom(4000,size=T,alpha=samx*phi,beta=(1-samx)*phi)/T
samy<-sample(samy,4000,replace=F)
plot(samx,samy)

den<-kde2d(samx,samy,n=50)
sz<-sort(den$z)
csz<-cumsum(sz)/sum(sz)
lo<-which(csz>0.05)[1]
q1<-which(csz>0.25)[1]
q2<-which(csz>0.5)[1]
q3<-which(csz>0.75)[1]
hi<-which(csz>0.95)[1]
lev<-sz[c(lo,q1,q2,q3,hi)]

plot(eloov,Va/T,pch=16,col=rgb(0,0,1,0.2),xlab="fitted",ylab="observed proportion", main="method B, altered data point 77",xlim=c(0.45,0.8),ylim=c(0.45,0.8))
lines(c(0,1),c(0,1),col="seagreen",lty=2,lwd=2)
points(eloov[id],(Va/T)[id],pch=16,col=2,cex=1.5)
points(samx,samy,cex=0.3,col=rgb(0.8,0,0.8,0.1))
contour(den$x,den$y,den$z,levels=lev,add=TRUE,drawlabels=F,lty=2)

#exceptional

gregd · January 17, 2024, 7:27am

nb the “exceptional” point with the altered data still has pareto<0.7

fixing code in the section of Method B re the original (unaltered) data

gregd:

eloo<-E_loo(hplp,psis_object,type="mean",log_ratios=log_ratios)
eloov<-eloo$value
eloop<-eloo$pareto
eloov[id]
eloop[id]

sam<-E_loo(hplp,psis_object,log_ratios=log_ratios,type="quantile",probs=(1:4000-0.5)/4000)$value

Ara_Winter · January 22, 2024, 8:25pm

I can see you are using brms here but I am not quite sure what you are looking for. Does the brms model work? Are you looking for feedback on the method you are using for plotting? This topic might get more responses on something like https://stats.stackexchange.com/

gregd · January 23, 2024, 9:28am

thanks Ara

there’s no problem I can see with the (corrected) code per se, i.e. it runs ok. The questions are: a) is this method of identifying exceptional points covered in the literature on outliers or similar to something in the literature, if so please point me to it; b) does it, and particularly method B, make sense to people here?

re method B, with the data I’m actually analysing rather than the synthetic example, the E_loo quantiles are not always well behaved. Can people point me to anything on the behaviour of these quantiles? The quantiles appear to be the sum of exp(lw) over the portion of hplp below a given value, where lw are the normalised log weights. The odd behaviour occurs when exp(lw) = 0 in a particular range of hplp. If I can construct a synthetic example to illustrate this, I’ll send it. But maybe this is already discussed somewhere?

thanks

Greg

Topic		Replies	Views
Loo_moment_match failure beta_binomial example brms loo , brms	4	196	June 27, 2024
Modelling from brms output brms	1	389	April 22, 2022
Quantitative Posterior Predictive Check Data brms	1	577	August 22, 2022
Using Pearson residuals to validate models brms	3	1979	September 24, 2020
Evaluating a brms model with a Gaussian Process brms techniques , ecology , gaussian-process , visualisation	5	1384	June 14, 2023

Contour plots to check for exceptional points

Related topics