Here is a simple model that mimics the problem I am struggling with. NegBin denotes a Negative Binomial distribution parameterized by its mean and by the shape parameter of the associated gamma distribution.
\eqalign{ & {k_i}\sim{\rm{NegBin}}\left( {{\mu _{g\left( i \right)}},\beta {\mu _{g\left( i \right)}}} \right) \cr & {x_g}\sim{\rm{Poisson}}\left( {q\mu _g^c} \right) \cr & {\mu _g} = \exp (M + {E_g}) \cr & {E_g} \sim {\rm{Normal}}\left( {0,S} \right) \cr & {\rm{Some more priors}} \cr}
Here, g\left( i \right) denotes the group to which observation i belongs. For each group g, there is a single data observation of type x and several observations of type k. There is a potentially non-linear relationship between the expected values of the two data types, modelled by the scaling parameter q and the power parameter c.
Say I wish to test whether c \neq 1 through model comparison. A straightforward approach would be to treat all observations as equal and compare models by PSIS-LOO-CV. There are two reasons why I don’t want to do this. First, in my real example there are ~1,000,000 observations, leading to very large files and potentially a large number of re-runs for exact LOO, even if, say, only 0.1% of observations are flagged as unreliable by the Pareto \hat k diagnostic (\hat k > 0.7). Second, I am not interested in model performance at the level of individual observations of k.
In the focal model, I can derive an expression for the sum of all k belonging to a specific group g with n_g observations:
\sum\limits_{i \in g} {{k_i}} \sim {\rm{NegBin}}\left( {{n_g}{\mu _g},{n_g}\beta {\mu _g}} \right)
So, an alternative could be to instead include the likelihood of \sum\limits_{i \in g} {{k_i}} . But is this any better? And does it violate any assumptions of the PSIS approximation of leave one (group) out? I am a bit at loss for how to think about this problem and would appreciate any pointers.
Here are two figures (sorry about the ugly legend) plotting the pointwise log predictive difference for the two models. Left of the dashed lines are k and \sum\limits_{i \in g} {{k_i}} , respectively, and right of the dashed lines are the corresponding x observations. As you can see, the two data types differ on the scale at which models differ in predictive performance. I think this should matter, at least for the SE of loo, but I am not quite sure how…
Below is stan code and r script to generate data and run the analysis.
Stan code:
// Data block: counts, group structure, and a compile-time-style switch (incc)
// selecting between the power model (c estimated) and the linear model (c = 1).
// NOTE(review): `int x[N]` is the old array syntax, deprecated in recent Stan
// releases — consider `array[N] int x` if the Stan version allows it.
data {
int<lower=0> N; // number of groups
int<lower=0> n[N]; // number of k-observations in each group
int<lower=0> nobs; // total number of k-observations (sum of n)
int<lower=1, upper=N> G[nobs]; // group index g(i) for each k-observation
int<lower=0> k[nobs]; // individual NegBin count observations
int<lower=0> x[N]; // single Poisson observation per group
int<lower=0,upper=1> incc; // 1: estimate power parameter c; 0: fix c = 1
int<lower=0> ksum[N]; // group-wise sums of k (used in generated quantities only)
}
// Parameters: non-centered group effects plus the model's scalar parameters.
parameters {
vector[N] E_raw; // standard-normal group effects; scaled by S below (non-centered)
real<lower=0> beta; // NegBin shape multiplier: shape = beta * mu
real<lower=0> S; // between-group standard deviation on the log scale
real M; // grand mean on the log scale
real<lower=0> c[incc ? 1 :0]; // power parameter; zero-length array when incc == 0
real<lower=0> q; // scaling parameter in x ~ Poisson(q * mu^c)
}
// Transformed parameters: resolve the effective power cc (estimated or fixed
// at 1), and build the group means mu on the natural scale.
transformed parameters{
real mu[N];
vector[N] E;
real<lower=0> cc; // effective power: c[1] when estimated, otherwise exactly 1
if (incc){
cc=c[1];
}else{
cc=1;
}
E=E_raw*S; // non-centered parameterization: E ~ Normal(0, S)
for (g in 1:N){
mu[g]=exp(M+E[g]); // group mean: mu_g = exp(M + E_g)
}
}
// Model: Poisson likelihood for the per-group x, NegBin (mean/shape
// parameterization) for the individual k, plus priors.
model {
// x_g ~ Poisson(q * mu_g^cc)
for(g in 1:N){
x[g]~poisson(mu[g]^cc*q);
}
// k_i ~ NegBin(mu_{g(i)}, shape = beta * mu_{g(i)})
for (i in 1:nobs){
k[i]~neg_binomial_2(mu[G[i]],mu[G[i]]*beta);
}
E_raw~normal(0,1); // implies E ~ Normal(0, S) via E = E_raw * S
S~cauchy(0,2.5); // half-Cauchy (S constrained positive)
q~lognormal(0,5); // weakly informative, positive scale
// prior on c only when it is actually estimated
if(incc){
c[1]~lognormal(0,5);
}
}
// Generated quantities: pointwise log-likelihoods for PSIS-LOO, at three
// granularities: individual k, group sums of k, and the per-group x.
generated quantities{
real loglik_k[nobs];
real loglik_ksum[N];
real loglik_x[N];
for(i in 1:nobs){
loglik_k[i] = neg_binomial_2_lpmf(k[i] | mu[G[i]],mu[G[i]]*beta);
}
for (g in 1:N){
loglik_x[g] = poisson_lpmf(x[g] | mu[g]^cc*q);
// Sum of n_g iid NegBin(mu_g, beta*mu_g) draws is NegBin(n_g*mu_g, n_g*beta*mu_g)
loglik_ksum[g] = neg_binomial_2_lpmf(ksum[g] | n[g]*mu[g],mu[g]*beta*n[g]);
}
}
R script
# Libraries ----
library(rstan)
library(tidyverse) # ggplot2, dplyr, tibble (loaded once; was duplicated)
library(abind)     # abind(): bind log-likelihood arrays along dim 3
library(loo)       # PSIS-LOO model comparison

# BUG FIX: `set.seed = 1` merely created a variable named `set.seed` and never
# seeded the RNG, so the simulation was not reproducible. Call the function.
set.seed(1)
# Simulation settings ----
N <- 50       # number of groups
meann <- 10   # average number of k-observations per group
M <- 5        # log-mean of mu
S <- 1        # between-group standard deviation (log scale)
beta <- 1.5   # NegBin shape multiplier (shape = beta * mu)
c <- 0.7      # true power parameter (non-linear: c != 1)
q <- 0.5      # true scaling parameter

# Simulate group means and observations ----
logmu <- rnorm(N, M, S)
mu <- exp(logmu)
n <- rpois(N, meann)   # k-observations per group (can be 0 for some groups)
nobs <- sum(n)
G <- rep(1:N, n)       # group index g(i) for each k-observation
k <- rnbinom(n = nobs, mu = mu[G], size = beta * mu[G])
x <- rpois(N, mu^c * q)

# Group-wise sums of k (vectorized; replaces the accumulation loop).
# split() over a factor with explicit levels keeps groups that received zero
# observations, and sum(numeric(0)) == 0 matches the original loop's result.
ksum <- unname(vapply(split(k, factor(G, levels = seq_len(N))), sum, numeric(1)))
# Parallel computing for the MCMC chains ----
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)

# Fit the focal (non-linear) model: incc = TRUE estimates the power c.
data <- list(N = N, n = n, ksum = ksum, G = G, nobs = nobs, k = k, x = x,
             incc = TRUE)
fit <- stan("lgo_examle_stan.stan", data = data, iter = 10000, warmup = 2000)

# Extract pointwise log-likelihood draws (iterations x chains x quantities)
# for the individual k, the per-group x, and the group sums of k.
extract_ll <- function(object, par) {
  rstan::extract(object, par, permuted = FALSE, inc_warmup = FALSE,
                 include = TRUE)
}
ll_k <- extract_ll(fit, "loglik_k")
ll_x <- extract_ll(fit, "loglik_x")
ll_ksum <- extract_ll(fit, "loglik_ksum")

# Observation-level (k then x) and group-level (ksum then x) arrays.
ll <- abind(ll_k, ll_x, along = 3)
llg <- abind(ll_ksum, ll_x, along = 3)

# PSIS-LOO for the focal model, stored for comparison with the linear fit.
LOOlist <- list()
LOOlistg <- list()
LOOlist[[1]] <- loo(ll)
LOOlistg[[1]] <- loo(llg)
# Refit with the linear response: incc = FALSE fixes c = 1 ----
data <- list(N = N, n = n, ksum = ksum, G = G, nobs = nobs, k = k, x = x,
             incc = FALSE)
fit2 <- stan("lgo_examle_stan.stan", data = data, iter = 10000, warmup = 2000)

# Same pointwise log-likelihood extraction as for the focal model.
get_draws <- function(object, par) {
  rstan::extract(object, par, permuted = FALSE, inc_warmup = FALSE,
                 include = TRUE)
}
ll_k2 <- get_draws(fit2, "loglik_k")
ll_x2 <- get_draws(fit2, "loglik_x")
ll_ksum2 <- get_draws(fit2, "loglik_ksum")

ll2 <- abind(ll_k2, ll_x2, along = 3)
llg2 <- abind(ll_ksum2, ll_x2, along = 3)

# PSIS-LOO for the linear model, alongside the focal model's results.
LOOlist[[2]] <- loo(ll2)
LOOlistg[[2]] <- loo(llg2)
# Pointwise ELPD difference at the observation level (focal - linear).
LOODiff <- LOOlist[[1]]$pointwise[, 1] - LOOlist[[2]]$pointwise[, 1]

# One letter label per group (26 upper- plus lower-case covers N = 50).
Letters <- c(LETTERS, letters)

df <- tibble(
  Diff = LOODiff,
  group = Letters[c(G, 1:N)], # k-observations first, then one x per group
  ind = 1:(nobs + N)
)

# k-observations left of the dashed line, x-observations right of it.
p <- ggplot(df, aes(y = Diff, x = ind, color = group)) +
  geom_point() +
  xlab("Obs") +
  ylab("Pointwise Difference") + # FIX: label typo ("Differnce")
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = nobs + 0.5, linetype = "dashed") +
  scale_colour_discrete()
print(p)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("PWDiff1.png", plot = p,
       scale = 1, width = 16, height = 10, units = "cm",
       dpi = 600)
# Pointwise ELPD difference at the group level (focal - linear).
LOODiffg <- LOOlistg[[1]]$pointwise[, 1] - LOOlistg[[2]]$pointwise[, 1]

df <- tibble(
  Diff = LOODiffg,
  group = Letters[c(1:N, 1:N)], # group sums of k first, then one x per group
  ind = 1:(N + N)
)

# Group sums left of the dashed line, x-observations right of it.
p2 <- ggplot(df, aes(y = Diff, x = ind, color = group)) +
  geom_point() +
  xlab("Obs") +
  ylab("Pointwise Difference") + # FIX: label typo ("Differnce")
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = N + 0.5, linetype = "dashed") +
  scale_colour_discrete()
print(p2)

# Save the plot object explicitly rather than relying on last_plot().
ggsave("PWDiff2.png", plot = p2,
       scale = 1, width = 16, height = 10, units = "cm",
       dpi = 600)