In nested multilevel models we model the variability between experimental units within, rather than across, the groups of interest (hence nested). In the simplest case one group is known a priori to have much lower between-unit variability than another, and pooling this variability across both groups would inflate the estimate for the low-variability group.
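As a sketch of the structure I have in mind (this simply mirrors the gamma model in the reprex below, with groups g indexing units j nested within them):

$$
\begin{aligned}
\text{value}_i &\sim \operatorname{Gamma}\!\left(\frac{\mu_i^2}{\sigma_{g[i]}^2},\ \frac{\mu_i}{\sigma_{g[i]}^2}\right),
\qquad \mu_i = \exp\!\left(\alpha_{g[i]} + \alpha^{u}_{g[i],\,j[i]}\right) \\
\alpha^{u}_{g,j} &\sim \operatorname{Normal}\!\left(0,\ \sigma^{u}_{g}\right)
\end{aligned}
$$

so each group g gets its own between-unit scale sigma_u.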
Usually I would use a parameter matrix for simplicity, but an array of parameter vectors seems the only appropriate method for sum_to_zero_vector, as mentioned here. I am looking for a simple syntactic guideline to follow for such multilevel models.
Here is my reprex in R and Stan. First let’s generate some data according to my description above.
require(tidyverse)
set.seed(101)
data <- tibble(
group = 1:2 %>% rep(each = 15 * 5) %>% str_c() %>% fct(),
unit = 1:15 %>% rep(each = 5) %>% rep(2) %>% str_c() %>% fct()
) %>%
mutate(
value_mean = if_else(group == 1,
0.1, 2),
value_sd = if_else(group == 1,
0.05, 0.5)
) %>%
group_by(unit) %>%
mutate(
value_dev = if_else(group == 1,
rnorm( 1 , 0 , 0.02 ),
rnorm( 1 , 0 , 0.2 ))
) %>%
ungroup() %>%
mutate(
value = rgamma( n() ,
(value_mean + value_dev)^2 / value_sd^2 ,
(value_mean + value_dev) / value_sd^2 )
) %>%
select(group, unit, value)
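As an optional sanity check (not part of the original reprex), the per-group summaries should reflect the intended difference in variability:

# Group 1 should show a much smaller mean and spread than group 2
data %>%
  group_by(group) %>%
  summarise(mean_value = mean(value), sd_value = sd(value))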
Here’s what they look like.
data %>%
ggplot(aes(unit, value)) +
geom_point(shape = 16, size = 3, alpha = 0.2) +
facet_grid(~ group) +
theme_minimal()
Now let's look at the centred (stan_c) and non-centred (stan_nc) parameterisations without sum_to_zero_vector. alpha_u, the intercept term capturing variation across unit within group, is a matrix in both cases.
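Just to spell out what the two models below encode for alpha_u (the standard centred and non-centred forms, written in the notation of the Stan code):

$$
\text{centred:}\quad \alpha^{u}_{g,j} \sim \operatorname{Normal}(0,\ \sigma^{u}_g)
\qquad\qquad
\text{non-centred:}\quad \alpha^{u}_{g,j} = \sigma^{u}_g\, z_{g,j},\quad z_{g,j} \sim \operatorname{Normal}(0,\ 1)
$$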
stan_c <- "
data{
int n;
vector[n] value;
array[n] int group;
int n_group;
array[n] int unit;
int n_unit;
}
parameters{
// Intercepts in log space
vector[n_group] alpha;
matrix[n_group, n_unit] alpha_u;
// Inter-unit variability
vector<lower=0>[n_group] sigma_u;
// Likelihood uncertainty
vector<lower=0>[n_group] sigma;
}
model{
// Hyperpriors
sigma_u ~ exponential( 1 );
// Priors
alpha ~ normal( log(1) , 0.2 );
for (i in 1:n_group) {
alpha_u[i,] ~ normal( 0 , sigma_u[i] );
}
sigma ~ exponential( 1 );
// Model with link function
vector[n] mu;
for ( i in 1:n ) {
mu[i] = exp( alpha[ group[i] ] +
alpha_u[ group[i] , unit[i] ] );
}
// Gamma likelihood
value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
mu ./ square( sigma[group] ) );
}
"
stan_nc <- "
data{
int n;
vector[n] value;
array[n] int group;
int n_group;
array[n] int unit;
int n_unit;
}
parameters{
// Intercepts in log space
vector[n_group] alpha;
matrix[n_group, n_unit] z_u; // z-score
// Inter-unit variability
vector<lower=0>[n_group] sigma_u;
// Likelihood uncertainty
vector<lower=0>[n_group] sigma;
}
model{
// Hyperpriors
sigma_u ~ exponential( 1 );
// Priors
alpha ~ normal( log(1) , 0.2 );
to_vector(z_u) ~ normal( 0 , 1 );
sigma ~ exponential( 1 );
// Convert z_u to alpha_u
matrix[n_group, n_unit] alpha_u;
for (i in 1:n_group) {
for (j in 1:n_unit) {
alpha_u[i,j] = z_u[i,j] * sigma_u[i] + 0;
}
}
// Model with link function
vector[n] mu;
for ( i in 1:n ) {
mu[i] = exp( alpha[ group[i] ] +
alpha_u[ group[i] , unit[i] ] );
}
// Gamma likelihood
value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
mu ./ square( sigma[group] ) );
}
generated quantities{
// Save alpha_u
matrix[n_group, n_unit] alpha_u;
for (i in 1:n_group) {
for (j in 1:n_unit) {
alpha_u[i,j] = z_u[i,j] * sigma_u[i] + 0;
}
}
}
"
require(cmdstanr)
model_c <- stan_c %>%
write_stan_file() %>%
cmdstan_model()
model_nc <- stan_nc %>%
write_stan_file() %>%
cmdstan_model()
require(tidybayes)
samples_c <- model_c$sample(
data = data %>% compose_data(),
chains = 8,
parallel_chains = parallel::detectCores(),
iter_warmup = 1e4,
iter_sampling = 1e4,
)
samples_nc <- model_nc$sample(
data = data %>% compose_data(),
chains = 8,
parallel_chains = parallel::detectCores(),
iter_warmup = 1e4,
iter_sampling = 1e4,
)
Let’s compare Rhat.
samples_c$summary() %>%
left_join(samples_nc$summary(),
by = "variable") %>%
rename(rhat_c = rhat.x, rhat_nc = rhat.y) %>%
ggplot(aes(rhat_c, rhat_nc)) +
geom_abline(slope = 1) +
geom_point() +
theme_minimal()
Both models converged well but the non-centred model is slightly better.
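Sampler diagnostics beyond Rhat can also be pulled straight from the cmdstanr fits (this is just cmdstanr's built-in summary, not part of the original comparison):

# Divergences, max-treedepth hits and E-BFMI for each fit
samples_c$diagnostic_summary()
samples_nc$diagnostic_summary()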
But when we look at the pairs plot we see that alpha_u is far from zero: it absorbs a substantial part of the effect of alpha, deflating it while inflating sigma_u, whose prior expects alpha_u to be centred on zero. In other words, alpha and alpha_u are non-identifiable. The results are pretty much identical for both parameterisations, so here's the one for the non-centred model.
require(bayesplot)
samples_nc$draws(format = "df") %>%
mcmc_pairs(pars = c("alpha[1]", "sigma[1]",
"alpha_u[1,1]", "sigma_u[1]"))
Enter sum_to_zero_vector. Now we force the mean effect of alpha_u to be zero. stan_c_stz and stan_nc_stz are versions of the previous models where alpha_u is a pair of sum_to_zero_vectors. Here I am unsure of my notation using array[n_group] sum_to_zero_vector[n_unit] instead of matrix[n_group, n_unit].
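A note on the scale correction flagged in the comments below: if I read the Stan User's Guide recommendation correctly, the n_unit elements of a sum_to_zero_vector carry only n_unit - 1 degrees of freedom, so the prior scale is inflated by sqrt(n_unit / (n_unit - 1)) to keep the marginal standard deviation of each element at sigma_u:

$$
\alpha^{u}_{g} \sim \operatorname{Normal}\!\left(0,\ \sigma^{u}_g \sqrt{\frac{n_\text{unit}}{n_\text{unit}-1}}\right)
$$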
stan_c_stz <- "
data{
int n;
vector[n] value;
array[n] int group;
int n_group;
array[n] int unit;
int n_unit;
}
parameters{
// Intercepts in log space
vector[n_group] alpha;
array[n_group] sum_to_zero_vector[n_unit] alpha_u;
// Inter-unit variability
vector<lower=0>[n_group] sigma_u;
// Likelihood uncertainty
vector<lower=0>[n_group] sigma;
}
model{
// Hyperpriors
sigma_u ~ exponential( 1 );
// Priors
alpha ~ normal( log(1) , 0.2 );
for (i in 1:n_group) { // Note correction of sigma_u
alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
}
sigma ~ exponential( 1 );
// Model with link function
vector[n] mu;
for ( i in 1:n ) {
mu[i] = exp( alpha[ group[i] ] +
alpha_u[ group[i] ][ unit[i] ] );
}
// Gamma likelihood
value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
mu ./ square( sigma[group] ) );
}
"
stan_nc_stz <- "
data{
int n;
vector[n] value;
array[n] int group;
int n_group;
array[n] int unit;
int n_unit;
}
parameters{
// Intercepts in log space
vector[n_group] alpha;
array[n_group] sum_to_zero_vector[n_unit] z_u; // z-score
// Inter-unit variability
vector<lower=0>[n_group] sigma_u;
// Likelihood uncertainty
vector<lower=0>[n_group] sigma;
}
model{
// Hyperpriors
sigma_u ~ exponential( 1 );
// Priors
alpha ~ normal( log(1) , 0.2 );
for (i in 1:n_group) {
z_u[i][] ~ normal( 0 , 1 );
}
sigma ~ exponential( 1 );
// Convert z_u to alpha_u
matrix[n_group, n_unit] alpha_u;
for (i in 1:n_group) {
for (j in 1:n_unit) { // Note correction of z_u
alpha_u[i,j] = z_u[i][j] * sqrt( n_unit * inv( n_unit - 1 ) ) * sigma_u[i] + 0;
}
}
// Model with link function
vector[n] mu;
for ( i in 1:n ) {
mu[i] = exp( alpha[ group[i] ] +
alpha_u[ group[i] , unit[i] ] );
}
// Gamma likelihood
value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
mu ./ square( sigma[group] ) );
}
generated quantities{
// Save alpha_u
matrix[n_group, n_unit] alpha_u;
for (i in 1:n_group) {
for (j in 1:n_unit) { // Note correction of z_u
alpha_u[i,j] = z_u[i][j] * sqrt( n_unit * inv( n_unit - 1 ) ) * sigma_u[i] + 0;
}
}
}
"
model_c_stz <- stan_c_stz %>%
write_stan_file() %>%
cmdstan_model()
model_nc_stz <- stan_nc_stz %>%
write_stan_file() %>%
cmdstan_model()
samples_c_stz <- model_c_stz$sample(
data = data %>% compose_data(),
chains = 8,
parallel_chains = parallel::detectCores(),
iter_warmup = 1e4,
iter_sampling = 1e4,
)
samples_nc_stz <- model_nc_stz$sample(
data = data %>% compose_data(),
chains = 8,
parallel_chains = parallel::detectCores(),
iter_warmup = 1e4,
iter_sampling = 1e4,
)
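As a quick check of sampling behaviour before comparing Rhat (cmdstanr's built-in diagnostics and timing, not part of the original write-up):

# Divergences / treedepth hits and total sampling time per fit
samples_c_stz$diagnostic_summary()
samples_nc_stz$diagnostic_summary()
samples_c_stz$time()$total
samples_nc_stz$time()$total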
The centred model samples far less efficiently and the non-centred model samples like a breeze. Let’s compare Rhat again.
samples_c_stz$summary() %>%
left_join(samples_nc_stz$summary(),
by = "variable") %>%
rename(rhat_c_stz = rhat.x, rhat_nc_stz = rhat.y) %>%
ggplot(aes(rhat_c_stz, rhat_nc_stz)) +
geom_abline(slope = 1) +
geom_point() +
theme_minimal()
The centred model's performance has dramatically degraded, and the non-centred model is now much better. The most likely explanation is a syntactic error on my part. Is array[n_group] sum_to_zero_vector[n_unit] supported? Am I doing something else wrong?
The pairs plot for the centred model confirms the sampling issues. But in the non-centred model sum_to_zero_vector seems to have mostly resolved the non-identifiability between alpha and alpha_u, albeit introducing some correlation between alpha and sigma, because this is a gamma likelihood and the mean and sd aren't independent.
samples_nc_stz$draws(format = "df") %>%
mcmc_pairs(pars = c("alpha[1]", "sigma[1]",
"alpha_u[1,1]", "sigma_u[1]"))
It helps to look at the predictions of all models over the data.
samples_c %>%
recover_types(data %>% select(group, unit)) %>%
spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
ungroup() %>%
mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
model = "c" %>% fct()) %>%
select(group, obs, model) %>%
bind_rows(
samples_nc %>%
recover_types(data %>% select(group, unit)) %>%
spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
ungroup() %>%
mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
model = "nc" %>% fct()) %>%
select(group, obs, model),
samples_c_stz %>%
recover_types(data %>% select(group, unit)) %>%
spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
ungroup() %>%
mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
model = "c_stz" %>% fct()) %>%
select(group, obs, model),
samples_nc_stz %>%
recover_types(data %>% select(group, unit)) %>%
spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
ungroup() %>%
mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
model = "nc_stz" %>% fct()) %>%
select(group, obs, model)
) %>%
ggplot() +
geom_point(data = data, aes(value, group),
shape = 16, size = 3, alpha = 0.2) +
ggridges::geom_density_ridges(aes(obs, group, colour = model),
from = 0, to = 4, fill = NA, scale = 0.8) +
# geom_density(data = tibble(value = rnorm( 1e5 , log(1) , 0.2 ) %>% exp()),
# aes(value), colour = "grey") + # prior on alpha
scale_x_continuous(limits = c(0, 4), oob = scales::oob_keep) +
scale_colour_manual(values = c("orange", "goldenrod", "lightblue", "purple")) +
theme_minimal() +
theme(legend.position = "top",
legend.justification = 0)
In each case the centred and non-centred parameterisations actually produce comparable predictions, but the sum_to_zero_vector models clearly outperform the previous ones for group == 1.
I think I'm on the right track with sum_to_zero_vector because my stz models clearly yield better predictions given the same priors, but I would appreciate it if someone could validate my syntax. Or is there an easy solution that does not involve sum_to_zero_vector? Why is the c_stz model having such convergence issues? I would appreciate any other advice on such hierarchical models.
I am taking the liberty of tagging people I know to be involved with sum_to_zero_vector: @mitzimorris @Bob_Carpenter @spinkney @martinmodrak