Dear all,

I have a question about a multilevel multinomial model in Stan. In particular, the model includes several fixed covariates as well as multiple random effects (both cross-classified and hierarchical), some of which have quite large sigmas.

The problem arises when doing posterior predictive checking, during which I convert estimates from log-odds to probability scale (using the softmax function). If I run a model excluding random effects with large variances, the model appears to perform well in the posterior predictive check (that is, the predicted results align closely with the proportions in the empirical data). In the model with the random effects, however, some of the random effects exhibit vary large variances, which seemingly leads to non-sensical results when I calculate predictions from the model. That is, almost all the probability is assigned to one outcome, which happens to be the most common outcome and the reference level for the analysis. This contrasts with the empirical data, which suggest that all 10 outcomes occur with non-trivial probabilities.

My suspicion is that this behavior has to do with the transformation to the probability scale when the variances of the random effects are large. Whereas the random effects are distributed symmetrically on the log-odds scale, they are not necessarily on the probability scale.

So I’m wondering if anyone here has experienced something similar, and if so, whether there is some advice or literature that you could point me to in an effort to better understand and resolve the problem. I have attempted to simulate data to use with a simplified version of the model to recreate the problem. The R code is presented below. Many thanks in advance!

library(rstan)

library(rethinking)

```
model_code <- "
data{
int N;
int N_id;
int y[N]; // 1=behavior A, 2=behavior B, 3=behavior C
int id[N];
}
parameters{
real a[2]; // intercepts for each behavior
vector[2] v[N_id]; // random effects for each individual
vector<lower=0>[2] sigma; // stddev of random effects
corr_matrix[2] Rho; // correlation matrix of random effects
}
model{
matrix[2,2] Sigma;
// priors
a ~ normal(0,1);
sigma ~ cauchy(0,2); // implicitly half-Cauchy
Rho ~ lkj_corr(2);
// hyper-prior
Sigma <- quad_form_diag(Rho,sigma); // = diag(sigma)*Rho*diag(sigma)
v ~ multi_normal( rep_vector(0,2) , Sigma );
// likelihood
for ( i in 1:N ) {
vector[3] p;
p[1] <- a[1] + v[id[i],1];
p[2] <- a[2] + v[id[i],2];
p[3] <- 0;
y[i] ~ categorical_logit( p );
}
}
"
```

N_id <- 50

N_per_id <- 60

sigma <- c(0.5,1) ## These can be adjusted up and down.

Rho <- matrix(c(1,-0.5,-0.5,1),2,2)

v <- rmvnorm2( N_id , Mu=c(0,0) , sigma=sigma , Rho=Rho )

a <- c(0.5,-0.5) ## These are intercepts for the two non-reference level outcomes.

dat <- data.frame(y=0,id=0)

row <- 1

for ( i in 1:N_id ) {

for ( j in 1:N_per_id ) {

y <- sample(1:3,size=1,prob=softmax( a[1]+v[i,1] , a[2]+v[i,2] , 0 ))

dat[row,] <- c(y,i)

row <- row + 1

}

}

dat_list <- list(

N = nrow(dat),

N_id = N_id,

y = dat$y,

id = rep(1:N_id,each=N_per_id)

)

mfit <- stan( model_code=model_code , data=dat_list , chains=3 , cores=3 , warmup=1000, iter=2000 )

summary(mfit, pars=“a”)$summary

traceplot(mfit)

post <- extract.samples(mfit)

link.t <- function( data ) {

K <- dim(post$v)[3] + 1

ns <- dim(post$v)[1]

if ( missing(data) ) stop( “BOOM: Need data argument” )

n=1

softmax2 <- function(x) {

x <- max(x) - x

exp(-x)/sum(exp(-x))

}

p <- list()

for ( i in 1:n ) {

p[[i]] <- sapply( 1:K , function(k) {

if ( k < K ) {

ptemp <- post$a[,k]

if ( data$id[i]>0 ) ptemp <- ptemp + post$v[,data$id[i],k]

} else {

ptemp <- rep(0,ns)

}

return(ptemp)

})

## The values are converted to probabilities using the softmax function

## which ensures that the predicted values across categories sum to

## 100% probabilities.

for ( s in 1:ns ) p[[i]][s,] <- softmax2( p[[i]][s,] )

}

return§

}

predat <- data.frame(a = c(1,2),

id = c(0,0))

out <- link.t(predat)

apply(out[[1]], 2, mean) #model estimates

table(dat$y)/nrow(dat) #empirical proportions

## With high variance (note that the prior has also been widened)

```
model_code <- "
data{
int N;
int N_id;
int y[N]; // 1=behavior A, 2=behavior B, 3=behavior C
int id[N];
}
parameters{
real a[2]; // intercepts for each behavior
vector[2] v[N_id]; // random effects for each individual
vector<lower=0>[2] sigma; // stddev of random effects
corr_matrix[2] Rho; // correlation matrix of random effects
}
model{
matrix[2,2] Sigma;
// priors
a ~ normal(0,1);
sigma ~ cauchy(0,15); // implicitly half-Cauchy
Rho ~ lkj_corr(2);
// hyper-prior
Sigma <- quad_form_diag(Rho,sigma); // = diag(sigma)*Rho*diag(sigma)
v ~ multi_normal( rep_vector(0,2) , Sigma );
// likelihood
for ( i in 1:N ) {
vector[3] p;
p[1] <- a[1] + v[id[i],1];
p[2] <- a[2] + v[id[i],2];
p[3] <- 0;
y[i] ~ categorical_logit( p );
}
}
"
```

N_id <- 50

N_per_id <- 60

sigma <- c(0.5,20) ## These can be adjusted up and down.

Rho <- matrix(c(1,-0.5,-0.5,1),2,2)

v <- rmvnorm2( N_id , Mu=c(0,0) , sigma=sigma , Rho=Rho )

a <- c(0.5,-0.5) ## These are intercepts for the two non-reference level outcomes.

dat <- data.frame(y=0,id=0)

row <- 1

for ( i in 1:N_id ) {

for ( j in 1:N_per_id ) {

y <- sample(1:3,size=1,prob=softmax( a[1]+v[i,1] , a[2]+v[i,2] , 0 ))

dat[row,] <- c(y,i)

row <- row + 1

}

}

dat_list <- list(

N = nrow(dat),

N_id = N_id,

y = dat$y,

id = rep(1:N_id,each=N_per_id)

)

mfit2 <- stan( model_code=model_code , data=dat_list , chains=3 , cores=3 , warmup=1000, iter=2000 )

summary(mfit2, pars=“a”)$summary

traceplot(mfit2)

post <- extract.samples(mfit2)

out <- link.t(predat)

apply(out[[1]], 2, mean) #model estimates

table(dat$y)/nrow(dat) #empirical proportions