Proper use of sum_to_zero_vector in nested multilevel models

In nested multilevel models we model variability between experimental units within, rather than across, the groups of interest (hence "nested"). In the simplest case, one group is known a priori to have much lower between-unit variability than another, and estimating a single variability across both groups would inflate the estimate for the first group.

Usually I would use a parameter matrix for simplicity, but an array of parameter vectors seems the only appropriate method for sum_to_zero_vector, as mentioned here. I am looking for a simple syntactic guideline to follow for such multilevel models.

Here is my reprex in R and Stan. First let’s generate some data according to my description above.

require(tidyverse)
set.seed(101)
data <- tibble(
  group = 1:2 %>% rep(each = 15 * 5) %>% str_c() %>% fct(),
  unit = 1:15 %>% rep(each = 5) %>% rep(2) %>% str_c() %>% fct()
  ) %>%
  mutate(
    value_mean = if_else(group == 1,
                         0.1, 2),
    value_sd = if_else(group == 1,
                       0.05, 0.5)
  ) %>%
  group_by(unit) %>%
  mutate(
    value_dev = if_else(group == 1,
                        rnorm( 1 , 0 , 0.02 ),
                        rnorm( 1 , 0 , 0.2 ))
  ) %>%
  ungroup() %>%
  mutate(
    value = rgamma( n() , 
                    (value_mean + value_dev)^2 / value_sd^2 ,
                    (value_mean + value_dev) / value_sd^2 )
  ) %>%
  select(group, unit, value)

Here’s what they look like.

data %>%
  ggplot(aes(unit, value)) +
    geom_point(shape = 16, size = 3, alpha = 0.2) +
    facet_grid(~ group) +
    theme_minimal()


Now let’s look at the centred (stan_c) and non-centred (stan_nc) parameterisations without sum_to_zero_vector. alpha_u, the intercept term capturing variation across unit within group, is a matrix in both cases.

stan_c <- "
    data{ 
      int n;
      vector[n] value;
      array[n] int group;
      int n_group;
      array[n] int unit;
      int n_unit;
    }

    parameters{
      // Intercepts in log space
      vector[n_group] alpha; 
      matrix[n_group, n_unit] alpha_u;
      
      // Inter-unit variability
      vector<lower=0>[n_group] sigma_u;
      
      // Likelihood uncertainty
      vector<lower=0>[n_group] sigma; 
    }

    model{
      // Hyperpriors
      sigma_u ~ exponential( 1 );
      
      // Priors
      alpha ~ normal( log(1) , 0.2 );
      for (i in 1:n_group) {
        alpha_u[i,] ~ normal( 0 , sigma_u[i] );
      }
      sigma ~ exponential( 1 );
      
      // Model with link function
      vector[n] mu;
      for ( i in 1:n ) {
        mu[i] = exp( alpha[ group[i] ] + 
                     alpha_u[ group[i] , unit[i] ] );
      }

      // Gamma likelihood
      value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
                     mu ./ square( sigma[group] ) );
    }
"

stan_nc <- "
    data{ 
      int n;
      vector[n] value;
      array[n] int group;
      int n_group;
      array[n] int unit;
      int n_unit;
    }

    parameters{
      // Intercepts in log space
      vector[n_group] alpha; 
      matrix[n_group, n_unit] z_u; // z-score
      
      // Inter-unit variability
      vector<lower=0>[n_group] sigma_u;
      
      // Likelihood uncertainty
      vector<lower=0>[n_group] sigma; 
    }

    model{
      // Hyperpriors
      sigma_u ~ exponential( 1 );
      
      // Priors
      alpha ~ normal( log(1) , 0.2 );
      to_vector(z_u) ~ normal( 0 , 1 );
      sigma ~ exponential( 1 );
      
      // Convert z_u to alpha_u
      matrix[n_group, n_unit] alpha_u;
      for (i in 1:n_group) {
        for (j in 1:n_unit) {
          alpha_u[i,j] = z_u[i,j] * sigma_u[i] + 0;
        }
      }
      
      // Model with link function
      vector[n] mu;
      for ( i in 1:n ) {
        mu[i] = exp( alpha[ group[i] ] + 
                     alpha_u[ group[i] , unit[i] ] );
      }

      // Gamma likelihood
      value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
                     mu ./ square( sigma[group] ) );
    }
    
    generated quantities{
      // Save alpha_u
      matrix[n_group, n_unit] alpha_u;
      for (i in 1:n_group) {
        for (j in 1:n_unit) {
          alpha_u[i,j] = z_u[i,j] * sigma_u[i] + 0;
        }
      }
    }
"

require(cmdstanr)
model_c <- stan_c %>%
  write_stan_file() %>%
  cmdstan_model()

model_nc <- stan_nc %>%
  write_stan_file() %>%
  cmdstan_model()

require(tidybayes)
samples_c <- model_c$sample(
          data = data %>% compose_data(),
          chains = 8,
          parallel_chains = parallel::detectCores(),
          iter_warmup = 1e4,
          iter_sampling = 1e4,
        )

samples_nc <- model_nc$sample(
          data = data %>% compose_data(),
          chains = 8,
          parallel_chains = parallel::detectCores(),
          iter_warmup = 1e4,
          iter_sampling = 1e4,
        )

Let’s compare Rhat.

samples_c$summary() %>%
  left_join(samples_nc$summary(),
            by = "variable") %>%
  rename(rhat_c = rhat.x, rhat_nc = rhat.y) %>%
  ggplot(aes(rhat_c, rhat_nc)) +
    geom_abline(slope = 1) +
    geom_point() +
    theme_minimal()


Both models converged well but the non-centred model is slightly better.

But when we look at the pairs plot we see that alpha_u is far from zero: it absorbs a substantial part of the effect of alpha, deflating alpha while inflating sigma_u, whose prior expects alpha_u to be centred on zero. In other words, alpha and alpha_u are non-identifiable. The results are pretty much identical for both parameterisations, so here is the pairs plot for the non-centred model.

require(bayesplot)
samples_nc$draws(format = "df") %>%
  mcmc_pairs(pars = c("alpha[1]", "sigma[1]",
                      "alpha_u[1,1]", "sigma_u[1]"))


Enter sum_to_zero_vector. Now we force the mean effect of alpha_u to be zero. stan_c_stz and stan_nc_stz are versions of the previous models where alpha_u is a pair of sum_to_zero_vectors. Here I am unsure of my notation using array[group] sum_to_zero_vector[unit] instead of matrix[group, unit].

stan_c_stz <- "
    data{ 
      int n;
      vector[n] value;
      array[n] int group;
      int n_group;
      array[n] int unit;
      int n_unit;
    }

    parameters{
      // Intercepts in log space
      vector[n_group] alpha; 
      array[n_group] sum_to_zero_vector[n_unit] alpha_u;
      
      // Inter-unit variability
      vector<lower=0>[n_group] sigma_u;
      
      // Likelihood uncertainty
      vector<lower=0>[n_group] sigma; 
    }

    model{
      // Hyperpriors
      sigma_u ~ exponential( 1 );
      
      // Priors
      alpha ~ normal( log(1) , 0.2 );
      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
      }
      sigma ~ exponential( 1 );
      
      // Model with link function
      vector[n] mu;
      for ( i in 1:n ) {
        mu[i] = exp( alpha[ group[i] ] + 
                     alpha_u[ group[i] ][ unit[i] ] );
      }

      // Gamma likelihood
      value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
                     mu ./ square( sigma[group] ) );
    }
"

stan_nc_stz <- "
    data{ 
      int n;
      vector[n] value;
      array[n] int group;
      int n_group;
      array[n] int unit;
      int n_unit;
    }

    parameters{
      // Intercepts in log space
      vector[n_group] alpha; 
      array[n_group] sum_to_zero_vector[n_unit] z_u; // z-score
      
      // Inter-unit variability
      vector<lower=0>[n_group] sigma_u;
      
      // Likelihood uncertainty
      vector<lower=0>[n_group] sigma; 
    }

    model{
      // Hyperpriors
      sigma_u ~ exponential( 1 );
      
      // Priors
      alpha ~ normal( log(1) , 0.2 );
      for (i in 1:n_group) {
        z_u[i][] ~ normal( 0 , 1 );
      }
      sigma ~ exponential( 1 );
      
      // Convert z_u to alpha_u
      matrix[n_group, n_unit] alpha_u;
      for (i in 1:n_group) {
        for (j in 1:n_unit) { // Note correction of z_u
          alpha_u[i,j] = z_u[i][j] * sqrt( n_unit * inv( n_unit - 1 ) ) * sigma_u[i] + 0;
        }
      }
      
      // Model with link function
      vector[n] mu;
      for ( i in 1:n ) {
        mu[i] = exp( alpha[ group[i] ] + 
                     alpha_u[ group[i] , unit[i] ] );
      }

      // Gamma likelihood
      value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
                     mu ./ square( sigma[group] ) );
    }
    
    generated quantities{
      // Save alpha_u
      matrix[n_group, n_unit] alpha_u;
      for (i in 1:n_group) {
        for (j in 1:n_unit) { // Note correction of z_u
          alpha_u[i,j] = z_u[i][j] * sqrt( n_unit * inv( n_unit - 1 ) ) * sigma_u[i] + 0;
        }
      }
    }
"

model_c_stz <- stan_c_stz %>%
  write_stan_file() %>%
  cmdstan_model()

model_nc_stz <- stan_nc_stz %>%
  write_stan_file() %>%
  cmdstan_model()

samples_c_stz <- model_c_stz$sample(
          data = data %>% compose_data(),
          chains = 8,
          parallel_chains = parallel::detectCores(),
          iter_warmup = 1e4,
          iter_sampling = 1e4,
        )

samples_nc_stz <- model_nc_stz$sample(
          data = data %>% compose_data(),
          chains = 8,
          parallel_chains = parallel::detectCores(),
          iter_warmup = 1e4,
          iter_sampling = 1e4,
        )
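
One way to quantify the difference in sampling efficiency mentioned below is cmdstanr's built-in diagnostics (a quick sketch, output not shown):

# Divergences, treedepth saturation, E-BFMI, and total wall time per model
samples_c_stz$diagnostic_summary()
samples_nc_stz$diagnostic_summary()
samples_c_stz$time()$total
samples_nc_stz$time()$total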

The centred model samples far less efficiently and the non-centred model samples like a breeze. Let’s compare Rhat again.

samples_c_stz$summary() %>%
  left_join(samples_nc_stz$summary(),
            by = "variable") %>%
  rename(rhat_c_stz = rhat.x, rhat_nc_stz = rhat.y) %>%
  ggplot(aes(rhat_c_stz, rhat_nc_stz)) +
    geom_abline(slope = 1) +
    geom_point() +
    theme_minimal()


The centred model’s performance has dramatically degraded, so that the non-centred model is now much better. The most likely explanation is a syntactic error on my part. Is array[group] sum_to_zero_vector[unit] supported? Am I doing something else wrong?

The pairs plot for the centred model confirms the sampling issues. But in the non-centred model sum_to_zero_vector seems to mostly have resolved the non-identifiability between alpha and alpha_u, albeit introducing some correlation between alpha and sigma because this is a gamma likelihood and mean and sd aren’t independent.

samples_nc_stz$draws(format = "df") %>%
  mcmc_pairs(pars = c("alpha[1]", "sigma[1]",
                      "alpha_u[1,1]", "sigma_u[1]"))


It helps to look at the predictions of all models over the data.

samples_c %>%
  recover_types(data %>% select(group, unit)) %>%
  spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
  ungroup() %>%
  mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
         obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
         model = "c" %>% fct()) %>%
  select(group, obs, model) %>%
  bind_rows(
    samples_nc %>%
      recover_types(data %>% select(group, unit)) %>%
      spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
      ungroup() %>%
      mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
             obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
             model = "nc" %>% fct()) %>%
      select(group, obs, model),
    samples_c_stz %>%
      recover_types(data %>% select(group, unit)) %>%
      spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
      ungroup() %>%
      mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
             obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
             model = "c_stz" %>% fct()) %>%
      select(group, obs, model),
    samples_nc_stz %>%
      recover_types(data %>% select(group, unit)) %>%
      spread_draws(alpha[group], sigma_u[group], sigma[group]) %>%
      ungroup() %>%
      mutate(mu = exp( alpha + rnorm( n() , 0 , sigma_u ) ),
             obs = rgamma( n() , mu^2 / sigma^2 , mu / sigma^2 ),
             model = "nc_stz" %>% fct()) %>%
      select(group, obs, model)
  ) %>%
  ggplot() +
    geom_point(data = data, aes(value, group),
               shape = 16, size = 3, alpha = 0.2) +
    ggridges::geom_density_ridges(aes(obs, group, colour = model),
                                  from = 0, to = 4, fill = NA, scale = 0.8) +
    # geom_density(data = tibble(value = rnorm( 1e5 , log(1) , 0.2 ) %>% exp()),
    #              aes(value), colour = "grey") + # prior on alpha
    scale_x_continuous(limits = c(0, 4), oob = scales::oob_keep) +
    scale_colour_manual(values = c("orange", "goldenrod", "lightblue", "purple")) +
    theme_minimal() +
    theme(legend.position = "top",
          legend.justification = 0)


In each case the centred and non-centred parameterisations actually produce comparable predictions but the sum_to_zero_vector models clearly outperform the previous ones for group == 1.

I think I’m on the right track with using sum_to_zero_vector because my stz models clearly yield better predictions given the same priors, but I would appreciate if someone could validate my syntax. Or is there an easy solution that does not involve sum_to_zero_vector? Why is the c_stz model having such convergence issues? I would appreciate any other advice on such hierarchical models.

I am taking the liberty to tag people I know to be involved with sum_to_zero_vector: @mitzimorris @Bob_Carpenter @spinkney @martinmodrak

The two terms of art are “hierarchical” for when things are properly nested (e.g., students in classes in schools in school districts in states) and “multilevel” when they’re not (e.g., age group effects, ethnicity effects, state of residence effects, etc. in a polling model). The notion of a multilevel model includes hierarchical models as a subclass and they all work the same way. The key is identifying them, hence the sum-to-zero you found.

That’s how you have to code multiple sum-to-zero vectors for now. We also don’t have ragged arrays or matrices, so it’s challenging to do any kind of multilevel modeling with a matrix without blowing up the matrix, as for example, lme4 does.

Yes. You should be able to verify that the elements of the array sum to zero in the output.
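
For example, a quick sketch of such a check, assuming the samples_c_stz fit from the reprex above (tidyverse loaded):

samples_c_stz$draws(format = "df") %>%
  select(starts_with("alpha_u[1,")) %>%  # all units of group 1
  rowSums() %>%
  abs() %>%
  max()  # should be ~0 up to floating point error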

Gamma regressions can be challenging to fit in my experience from reading these forums.

That’s a good safety check. It also highlights the point that you don’t need really great sampling for pretty good inference.

The traditional approach is to pin one of the values to zero. That also identifies the model. The main trick is to make sure you only introduce a single intercept. Sum-to-zero enforces that because the values can’t all shift without breaking the sum-to-zero constraint.

This is splitting hairs. Inference will be fine for even much larger R-hat values. We don’t usually recommend running so long that you get these kinds of R-hat values as the real uncertainty in your posterior is the posterior variance, not the standard error in estimating expectations.

You’re on the right track looking at pairs plots in the posterior and look at what happens to the step size in adaptation. It pretty much always comes down to varying geometry causing a problem. Centered vs. non-centered efficiency can vary based on how informative the data and prior are together—as data and prior get stronger, centered starts to work better.

The following code is problematic if both alpha and alpha_u are set to sum to zero because it doesn’t provide an intercept.

for (i in 1:n) {
  mu[i] = exp(alpha[group[i]] + alpha_u[group[i], unit[i]]);
}
value ~ gamma(square(mu) ./ square(sigma[group]), mu ./ square(sigma[group]));

You’re also going to want to make sure that the exponentiation doesn’t overflow. I would recommend introducing a new parameter beta with an appropriately scaled prior, keeping both alpha and alpha_u centered by group, and then using

mu[i] = exp(beta + alpha[group[i]] + alpha_u[group[i], unit[i]]);

I recommend this because as models get more complicated, it’s easiest to just keep all the varying effects zero-centered with a single intercept. Otherwise, one of the varying effects has to be allowed to be non-centered to account for the intercept. But it’s almost always more efficient to do it with a single parameter.

P.S. I’d strongly suggest storing Stan programs in their own files. You get syntax highlighting while editing, compiler error line numbers that actually correspond to the file, and you can use either double quotes for printing or apostrophes for transposition without worrying about which quote symbol delimits the string. It also saves on indentation.
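
For instance, a minimal sketch of that workflow with cmdstanr (the filename is illustrative):

require(cmdstanr)
# Keep the Stan program in its own file, e.g. stan_nc_stz.stan
model_nc_stz <- cmdstan_model("stan_nc_stz.stan")
samples_nc_stz <- model_nc_stz$sample(data = data %>% compose_data())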


Thanks for the thorough reply @Bob_Carpenter!

I did not know this. I always assumed the terms to be interchangeable. There seems to be some confusion on this, e.g.

This results in a model with multiple levels of uncertainty, each feeding into the next—a multilevel model. Multilevel models—also known as hierarchical, random effects, varying effects, or mixed effects models… (Richard McElreath 2019 Statistical rethinking: a Bayesian course with examples in R and Stan).

Thanks for confirming I’m on the right track with array[] sum_to_zero_vector[].

I usually don’t have issues with the mean-sd parameterisation of the gamma likelihood and I know mean-scale is also used successfully. What may be the issue here is that I am trying to model heteroscedasticity even though mu and sigma are not independent in the gamma likelihood. Not stratifying sigma by group would definitely simplify the model. Should modelling heteroscedasticity generally be avoided in generalised linear models? The obvious cases are Poisson and Bernoulli where there is only one parameter. Any thoughts?

Yes I am guilty of that. I usually look for R-hat < 1.001 with 8*1e4 iterations. Would you say R-hat is less important to look out for than E-BFMI? In my experience E-BFMI is at risk of being low when I try to model variance separately from the mean in the gamma likelihood.

But given that the data and priors stayed the same, I’m afraid I still don’t understand why the centred parameterisation went from reasonable to terrible convergence when introducing sum_to_zero_vector?

I see. I simply chose the normal( log(1) , 0.2 ) prior for alpha based on the fact that on the response scale the midpoint between the two group means (0.1 vs. 2) is approximately 1, hence log(1). I did not consider that since log(1) = 0 this implies two zero-centred priors.

I typically don’t include a global intercept by default since it is an additional parameter that often holds no intrinsic value (e.g. in my reprex an average across groups may be meaningless) and it is sometimes harder to think of a sensible prior for deviations from the global intercept than for the intercept itself. Following your suggestion the priors would become beta ~ normal( log(1) , 0.2 ) and alpha ~ normal( 0 , 0.2 ) etc. or similar, so this doesn’t really solve the issue because now there are three instead of two parameters centred on zero, with one summing to zero. Am I missing something?

Why is having a non-stratified global intercept with deviations more efficient than a stratified intercept despite introducing the need to estimate an additional parameter?

Is there an easy way to do this in RStudio? I noticed that there is a Stan File option in the New File dropdown, but I thought it was only to be used with rstan. I would be grateful for an easy solution for cmdstanr in RStudio. It would definitely make my R code more readable.

You can use the same files with cmdstanr. Any file ending in .stan can be used with any interface. You can create the file with RStudio or just put your Stan code in a text file and give it a .stan extension yourself. The only issue is that the Stan parser used by RStudio to check syntax uses rstan. So if you use RStudio to check the syntax it will use rstan, which is several versions of Stan behind cmdstanr and so may not recognize some newer additions to the language. But it’s certainly fine to create and edit the file using RStudio and then give it to cmdstanr even if you’re using something in the language not supported by rstan yet (I do this all the time). Trying to compile the model with cmdstanr will then check the syntax without using rstan. Or you could also use the check_syntax method if you prefer to create the model object without compiling it.
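
For example (a minimal sketch, the filename is illustrative):

# Create or edit model.stan in RStudio, then:
mod <- cmdstan_model("model.stan", compile = FALSE)
mod$check_syntax()  # uses CmdStan's parser, not rstan
mod$compile()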


Not related to the original post, but this info about RStudio using rstan to parse Stan files explains some odd things I’ve noticed for a while, like std_normal_lpdf() not getting highlighted “appropriately”. I am glad to know this now.

Is this factoid publicly documented anywhere? If not, should it be?


Hmm, I guess it’s not documented. It’s just that it’s always used rstan since way before cmdstanr existed, but you’re right that we should put this information somewhere. Where do you think would be a helpful place to document this? Maybe in the main vignette for cmdstanr? Somewhere else?

If I recall correctly I think at one point either @mike-lawrence or @ssp3nc3r (I can’t remember which, sorry!) wrote a script that overrides it and allows it to use cmdstanr. Does that ring a bell, Mike and/or Scott?


I think this is what you’re referring to: Rstudio Stan syntax checker is outdated


You can also install rstan from the develop branch for reasonably up-to-date syntax highlighting.


Since using Stan files in RStudio was just an aside, I’d like to move this thread back to its original question, which remains unresolved for me.

I have now tried centred and non-centred sum-to-zero parameterisations for various likelihoods and data and the centred sum-to-zero parametrisation never samples smoothly for me, even when it samples nicely without sum_to_zero_vector despite non-identifiability. I cannot comprehend how the centred model always gets worse when I fix non-identifiability with sum_to_zero_vector. The most likely explanation for me is still that my syntax is wrong.

I appreciate that my reprex is long so here are the relevant code chunks from the centred (1) and non-centred (2) sum-to-zero models:

(1)

      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
      }

(2)

      for (i in 1:n_group) {
        z_u[i][] ~ normal( 0 , 1 );
      }
      ...
      // Convert z_u to alpha_u
      matrix[n_group, n_unit] alpha_u;
      for (i in 1:n_group) {
        for (j in 1:n_unit) { // Note correction of z_u
          alpha_u[i,j] = z_u[i][j] * sqrt( n_unit * inv( n_unit - 1 ) ) * sigma_u[i] + 0;
        }
      }

The general trend is that (2) always improves inference while (1) always makes it worse, regardless of how the models compare before sum_to_zero_vector is introduced. Any further insight would be greatly appreciated!


In the centered zero sum case you’re doing

z \sim \mathcal{N}\!\left(0,\; \sigma \sqrt{\tfrac{N}{N - 1}}\right), \qquad z \in \mathbb{R}^N, \quad \sum_{i=1}^N z_i = 0.

The issue here is that both z and \sigma are parameters. The sum to zero constraint has a Jacobian correction of 0 but now we need to take into account the implicit multiplication by \sigma.

You need to add target += log(sigma_u[i]);

      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
        target += log(sigma_u[i]);
      }

Not sure the previous argument is quite right as stated. We have -N * log(sigma) from the N calls to the normal density; adding log(sigma) leaves -(N - 1) * log(sigma). The result is the same, but this framing is a bit clearer: the sum-to-zero constraint puts a singular covariance on z, so there are only N - 1 degrees of freedom for the normal to account for. When sigma is data this doesn’t matter, because the density only needs to be known up to proportionality, but when sigma is a parameter it does.
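
In symbols, the bookkeeping is just

-N \log \sigma + \log \sigma = -(N - 1) \log \sigma,

which is the \sigma-dependence of a normal over the N - 1 free degrees of freedom (the constant \sqrt{N / (N - 1)} factor only shifts the target by a constant).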

I think if we work out the direct jacobian it will come out to this as well (up to a constant). I can see about working this out when I get a moment.

I was out sick and then we had a long weekend, so I’m a bit late responding.

You’re right. The terminology, like most terminology in stats, is very confused. To follow @andrewgelman’s terminology from his multilevel and hierarchical regression books, “multilevel model” is the general term that encompasses “hierarchical model”. But a lot of people use the latter when there is not a hierarchy and they strictly mean “multilevel model” and Wikipedia is just confusing in saying that a multilevel model is a kind of hierarchical model!

No, you can do both. GLMs are just general ways of representing transformed linear effects and their mechanisms get used all over the place.

I don’t even know what E-BFMI is in the sense that I couldn’t write down its definition. About the only thing I look at is effective sample size estimates, as those won’t be high if R-hat is high in Stan, because we discount ESS for failed convergence.

You really have to look at the entire posterior geometry. The sum-to-zero vector imposes pretty strong constraints.

The issue is that you can only have one varying effect with a non-zero prior mean. Otherwise, you get non-identifiability because each one acts like an intercept. That is, if you introduce a hierarchical parameter alpha and give it the usual prior with a mean and scale, you get

vector[K] alpha;
...
alpha ~ normal(mu, sigma);
mu ~ ...;
sigma ~ ...;
y[n] ~ normal(alpha[group[n]], sigma);

This model is identical to the one formulated with an intercept directly.

vector[K] alpha_ctr;
...
alpha_ctr ~ normal(0, sigma);
mu ~ ...;
sigma ~ ...;
y[n] ~ normal(mu + alpha_ctr[group[n]], sigma);

Exact same likelihood, but the second one is expressed with a global intercept. Keeping to a single intercept is necessary because if you do this, it’s not identified.

vector[K] alpha;
vector[J] beta;
...
alpha ~ normal(mu, sigma);
beta ~ normal(nu, tau);
mu ~ ...;  nu ~ ...; sigma ~ ...;  tau ~ ...;  upsilon ~ ...;
y[n] ~ normal(alpha[group1[n]] + beta[group2[n]], upsilon);

Breaking this down we get the equivalent version, where it’s clear the existence of both mu and nu introduces non-identifiability.

vector[K] alpha_ctr;
vector[J] beta_ctr;
...
alpha_ctr ~ normal(0, sigma);
beta_ctr ~ normal(0, tau);
mu ~ ...;  nu ~ ...; sigma ~ ...;  tau ~ ...;  upsilon ~ ...;
y[n] ~ normal(mu + nu + alpha_ctr[group1[n]] + beta_ctr[group2[n]], upsilon);

As you can see above, there’s not an additional parameter. The difference is that when you have alpha ~ normal(mu, sigma), then all of the alpha have to move when the mu moves, which introduces a lot of correlation and tends to be a harder geometry to sample.

I have no idea. I never used RStudio even when I was using R. I don’t like third-party systems getting in the way and mucking with I/O the way RStudio does.


Here’s why that Jacobian adjustment is needed when using s2z in the centered model.

Let’s start at why the sqrt(N / (N - 1)) thing comes up and then what adding \sigma does.

Why \sqrt{N / (N - 1)}?

The sum-to-zero constraint is effectively pre-multiplying x by an orthogonal basis. The basis used in Stan is the Helmert basis (though any orthogonal basis will do, such as a Householder basis). This is

H \in \mathbb{R}^{N \times (N - 1)}, \qquad H^{\mathsf T} H = I_{N-1}, \qquad H^{\mathsf T} \mathbf{1} = 0.

When we project x using H as z = H x, we map it into an (N - 1)-dimensional linear subspace of \mathbb{R}^N, so z \in \mathbb{R}^N with \sum_{i=1}^N z_i = 0. We have

z = \underbrace{\sigma \;H}_{N \times (N - 1)} x.

If we assume that x \sim \mathcal N(0, 1) what is the distribution of z?

When we push the standard normal forward through H, the mean stays 0 but the covariance matrix is now

\sigma^2 \begin{pmatrix} 1-\tfrac{1}{N} & -\tfrac{1}{N} & \cdots&-\tfrac{1}{N} \\ -\tfrac{1}{N} & 1-\tfrac{1}{N} & \cdots & -\tfrac{1}{N} \\ \vdots & \vdots & \ddots & \vdots \\ -\tfrac{1}{N} & -\tfrac{1}{N} & \cdots &1-\tfrac{1}{N} \end{pmatrix}

where \sigma^2 = 1 for now.

We see that every pair of elements of z is negatively correlated and that the marginal variances are 1 - \frac{1}{N} = \frac{N - 1}{N}. If we scale z by \sqrt{\frac{N}{N-1}} (equivalently, multiply the variance by \frac{N}{N-1}), the marginal variances will all be 1. In Stan the normal distribution is parameterized by the standard deviation, so this scaling is

z = \underbrace{\sqrt{\frac{N}{N-1}} \;H}_{N \times (N - 1)} x,

implies that each z_i is marginally distributed as standard normal. The difficulty in Stan is that we put the prior on the z_i directly, so by writing

sum_to_zero_vector[N] z;
z ~ normal(0, sqrt(N %/% (N - 1)));

the backend is doing the H multiplication in a clever and fast way so that it never creates the H matrix and loops through x only once. Now z_i is marginally standard normal.
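
As a quick numerical sanity check of the (N - 1) / N marginal variance, here is a small simulation (an illustration only: it builds an orthonormal sum-to-zero basis via QR rather than Stan's actual Helmert-based implementation):

set.seed(1)
N <- 10
Q <- qr.Q(qr(matrix(1, N, 1)), complete = TRUE)  # first column spans the 1 vector
H <- Q[, -1]                                     # N x (N-1), H'H = I, H'1 = 0
x <- matrix(rnorm((N - 1) * 1e5), nrow = N - 1)  # x ~ N(0, 1)
z <- H %*% x                                     # sum-to-zero draws in each column
max(abs(colSums(z)))                             # ~0: the constraint holds
round(apply(z, 1, var), 2)                       # each ~ (N - 1) / N = 0.9
round(apply(sqrt(N / (N - 1)) * z, 1, var), 2)   # each ~ 1 after the rescaling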

We hope to put in something like

z ~ normal_sum_to_zero(sigma);

where each marginal z is distributed as normal with mean zero and standard deviation sigma.

Adding in arbitrary \sigma > 0

When we add \sigma it is the same as

z = \underbrace{\sigma\sqrt{\frac{N}{N-1}}\;H}_{\text{ size of } N \times (N - 1)}\,x.

If we first multiply x by \sigma\sqrt{\frac{N}{N-1}} it means that

x \sim \mathcal N \bigg(0, \sigma\sqrt{\frac{N}{N-1}} \bigg).

Crucially, there are only N - 1 dimensions in x so the target log probability of this normal increments by

target += -(N - 1) * log(sigma * sqrt(N %/% (N - 1)));

The problem with the original centered zero-sum model is that the target is

target += -N * log(sigma * sqrt(N %/% (N - 1)));

to make it equivalent we need to add \log\big(\sigma \sqrt{\tfrac{N}{N - 1}}\big). Since the square-root factor is a constant, we only need to increment by

target += log(sigma);
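
Putting the two targets side by side, with c = \sqrt{N / (N - 1)}, the difference is

-(N - 1)\log(\sigma c) - \big( -N\log(\sigma c) \big) = \log(\sigma c) = \log \sigma + \tfrac{1}{2}\log\tfrac{N}{N - 1},

and the last term is constant, which is why incrementing by log(sigma) alone is enough.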

Thanks @spinkney! That’s exactly what I was after. For completeness, I’ll show how @spinkney’s solution works in the context of my reprex.

My original misspecified model (wrong).

    data{ 
      int n;
      vector[n] value;
      array[n] int group;
      int n_group;
      array[n] int unit;
      int n_unit;
    }

    parameters{
      // Intercepts in log space
      vector[n_group] alpha; 
      array[n_group] sum_to_zero_vector[n_unit] alpha_u;
      
      // Inter-unit variability
      vector<lower=0>[n_group] sigma_u;
      
      // Likelihood uncertainty
      vector<lower=0>[n_group] sigma; 
    }

    model{
      // Hyperpriors
      sigma_u ~ exponential( 1 );
      
      // Priors
      alpha ~ normal( log(1) , 0.2 );
      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
      }
      sigma ~ exponential( 1 );
      
      // Model with link function
      vector[n] mu;
      for ( i in 1:n ) {
        mu[i] = exp( alpha[ group[i] ] + 
                     alpha_u[ group[i] ][ unit[i] ] );
      }

      // Gamma likelihood
      value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
                     mu ./ square( sigma[group] ) );
    }

The corrected version (right).

    data{ 
      int n;
      vector[n] value;
      array[n] int group;
      int n_group;
      array[n] int unit;
      int n_unit;
    }

    parameters{
      // Intercepts in log space
      vector[n_group] alpha; 
      array[n_group] sum_to_zero_vector[n_unit] alpha_u;
      
      // Inter-unit variability
      vector<lower=0>[n_group] sigma_u;
      
      // Likelihood uncertainty
      vector<lower=0>[n_group] sigma; 
    }

    model{
      // Hyperpriors
      sigma_u ~ exponential( 1 );
      
      // Priors
      alpha ~ normal( log(1) , 0.2 );
      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
        target += log(sigma_u[i]); // Note this addition 
      }
      sigma ~ exponential( 1 );
      
      // Model with link function
      vector[n] mu;
      for ( i in 1:n ) {
        mu[i] = exp( alpha[ group[i] ] + 
                     alpha_u[ group[i] ][ unit[i] ] );
      }

      // Gamma likelihood
      value ~ gamma( square( mu ) ./ square( sigma[group] ) ,
                     mu ./ square( sigma[group] ) );
    }

The relevant change is from this

      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
      }

to this

      for (i in 1:n_group) { // Note correction of sigma_u
        alpha_u[i][] ~ normal( 0 , sigma_u[i] * sqrt( n_unit * inv( n_unit - 1 ) ) );
        target += log(sigma_u[i]); // Note this addition 
      }

Clearly the difference is enormous:


[Plots comparing the wrong and right models omitted (shown in the original post, alternating wrong, right, wrong, right).]

I assume your sqrt( N %/% (N - 1) ) is equivalent to sqrt( N * inv( N - 1 ) )? I have never come across %/% in Stan.

Now I am questioning if my non-centred model is correctly specified. Do I need to put this prior on my z-scores?

As you can see in the original post, I currently have

    parameters{
      ...
      array[n_group] sum_to_zero_vector[n_unit] z_u; // z-score
      ...
    }

    model{
      ...
      for (i in 1:n_group) {
        z_u[i][] ~ normal( 0 , 1 );
      }
      ...
      // Convert z_u to alpha_u
      matrix[n_group, n_unit] alpha_u;
      for (i in 1:n_group) {
        for (j in 1:n_unit) { // Note correction of z_u
          alpha_u[i,j] = z_u[i][j] * sqrt( n_unit * inv( n_unit - 1 ) ) * sigma_u[i] + 0;
        }
      }
      ...
    }

In other words, I just have a standard normal prior on the s2z z-scores and then only apply the correction when reversing the standardisation.


Yes, this is equivalent. The integer quotient is documented at Integer-Valued Basic Functions. I actually prefer the sqrt( N * inv(N - 1)) syntax.

I don’t think so. The non-centered version implies what the centered version is doing. Just as with the regular (non-s2z) hierarchical model, you may have better inference with one version or the other depending on how much data you have per group.


Hi, could you please show how to adjust the code so that alpha_u is drawn from a multivariate normal distribution instead of independent normals? In other words, how would we model correlated random effects?

@spinkney If you add a normal_sum_to_zero(sigma), I’d consider defining it such that the marginal variance is \sigma^2 (1 - \tfrac{1}{N}). That is how pymc and numpyro define the ZeroSumNormal distribution, and having slightly different definitions around in the libraries could be annoying in the long term. I think that’s also usually what’s needed in applications of it. (At least it is in pretty much all cases where I’m using it, not sure how well that generalizes). It also feels quite natural to me if you look at the eigenvalues of the covariance matrix. One of those eigenvalues is zero, all others are \sigma in this parametrization.

@lukaseamus
With a little bit of work you can make sure you get the exact same results with the zero-sum-normal parametrization as with the original model. To me, that feels a lot cleaner.

Unless I’m missing something, we can simplify your model, because the two groups don’t share any parameters. Essentially, it fits two separate models, one for group 1 and one for group 2. If we only fit one of those models at a time, it becomes easier to understand how this reparametrization works. If we do this, the model reduces to a multilevel GLM, data ~ 1 + (1|unit), with a log link and gamma likelihood:

data {
    int n_observations;
    int n_units;
    vector[n_observations] values;
    array[n_observations] int<lower=1, upper=n_units> unit_idx;
}
parameters {
    // Intercepts in log space
    real intercept;
    vector[n_units] unit_effect;

    // Inter-unit variability
    real<lower=0> unit_sigma;

    // Likelihood uncertainty
    real<lower=0> sigma;
}
transformed parameters {
    vector[n_observations] mu = exp(
        intercept
        + unit_effect[unit_idx]
    );
}
model {
    intercept ~ normal(log(1), 0.2);
    unit_sigma ~ normal(0, 1);
    unit_effect ~ normal(0, unit_sigma);
    sigma ~ normal(0, 1);

    // Gamma likelihood
    values ~ gamma(
        square(mu) ./ square(sigma),
        mu ./ square(sigma)
    );
}

The problem that the zero-sum-normal reparametrization fixes also isn’t all that pronounced in your dataset, so I’ll change the number of observations a bit to make it easier to see (sorry for switching to Python, that’s just way easier for me…):

import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import arviz
import nutpie

def make_data(rng, n_units, n_obs_per_unit, *, intercept, unit_sigma, sigma):
    units = [f"unit_{i}" for i in range(n_units)]
    # Unit-level effects (unique unit IDs across groups)
    unit_effects = pl.DataFrame({
        'unit': units,
        'unit_effect': np.concatenate([
            rng.normal(0, unit_sigma, size=n_units),
        ])
    })

    # Observation-level structure
    observations = pl.DataFrame({
        "observation": [f"obs_{i}" for i in range(n_units * n_obs_per_unit)],
        'unit': np.repeat(units, n_obs_per_unit),
    })


    df = observations.join(unit_effects, on='unit')

    # Generate final values
    adjusted_mean = np.exp(intercept + df['unit_effect'])
    shape = adjusted_mean**2 / sigma**2
    rate = adjusted_mean / sigma**2

    df = df.with_columns([
        pl.Series('value', rng.gamma(shape, 1/rate))
    ])

    # Keep just what we need
    data = df.select(["observation", 'unit', 'value', "unit_effect"])
    return data.to_pandas()
true_values =  {
    "intercept": 0.15,
    "unit_sigma": 0.2,
    "sigma": 0.1,
}

rng = np.random.default_rng(42)
data = make_data(rng, n_units=5, n_obs_per_unit=500, **true_values)

sns.catplot(
    data=data,
    kind="strip",
    x='unit',
    y='value',
)

If we sample this, the sampler (here nutpie, but it is similar with Stan) will struggle quite a bit and won’t be particularly efficient. It uses very long trajectories quite often (a depth of 8, for instance, means that it used ~2^8 = 256 gradient evaluations for a single draw).

unit_idx, units = pd.factorize(data["unit"])

coords = {
    "observation": data["observation"],
    "unit": units,
}

data_stan = {
    "n_observations": len(coords["observation"]),
    "n_units": len(coords["unit"]),
    "values": data["value"].values,
    "unit_idx": unit_idx + 1,
}

dims = {
    "unit_effect": ["unit"],
    "mu": ["observation"],
}
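# Note: `compiled` is not defined in the snippet as posted; presumably the Stan
# program above was compiled beforehand, e.g. with something along the lines of
# `compiled = nutpie.compile_stan_model(code=stan_code)` (an assumption, not from the post).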

trace = nutpie.sample(
    compiled
    .with_data(**data_stan)
    .with_coords(**coords)
    .with_dims(**dims),
    tune=1000,
    draws=1000,
    chains=10,
    seed=42,
)

sns.countplot(trace.sample_stats.depth.to_dataframe(), x="depth", hue="chain")

So why does the sampler struggle with such a simple model and a still pretty small dataset? It is not because of the usual centered / non-centered parametrization. If we were to switch to a non-centered parametrization, it performs even worse.

The problem is that all parameters except for sigma are still quite uncertain, but some linear combinations of the parameters are very certain, which leads to high correlations in the posterior.

Those linear combinations have actually quite nice interpretations here. The way I chose the parameters, we have very precise information about the expected value within each unit. But since we only ever observe 5 units, we never learn much about the distribution of the units.

For instance, we know quite well how big the expected outcome in unit 1 is:

(trace.posterior.intercept + trace.posterior.unit_effect).sel(unit="unit_0").std()
# 0.0036

But at the same time, we are very uncertain about what the unit effect of the first unit itself is:

trace.posterior.unit_effect.sel(unit="unit_0").std()
# 0.13

This is because unit_effect measures how much larger the expected value of the first unit is relative to the population mean of the units! So for instance, if each unit were a medical treatment, it would specify how much better the first treatment is compared to some (slightly hypothetical) infinite population of treatments. But since we only ever observed 5 units, most of the uncertainty in unit_effect is due to the uncertainty in the population mean:

trace.posterior.intercept.std()
# 0.13

We can make use of a quantity that’s closely related to the population mean of the units though: the sample mean of the 5 units that we did observe. The basic idea of the zero-sum-normal parametrization is to parameterize our model in terms of differences to this sample mean instead of differences to the population mean:

unit_effect_sample_mean = trace.posterior.unit_effect.mean("unit")
unit_effect_rel = trace.posterior.unit_effect - unit_effect_sample_mean

Instead of using unit_effect as a parameter, we would like to use unit_effect_rel instead: the difference between the mean within a unit and the sample mean of the 5 observed units. As a small side note: I think there are many cases where this difference is what we want to know in the first place, and not really the difference to the population mean. This is almost always true if the population in question isn’t infinite to begin with. So if you have a model where the unit is a US state, I think we should really look at the difference to the other existing states, not the difference to some (hypothetical and arguably nonsensical) infinite population of states.

For some more details about how that works, see for instance here.

Here is an implementation of this reparametrization, that also reconstructs the original values for unit_effect and intercept:

data {
    int n_observations;
    int n_units;
    vector[n_observations] values;
    array[n_observations] int<lower=1, upper=n_units> unit_idx;
}
parameters {
    // Intercepts in log space
    real intercept_plus_pop_mean;
    sum_to_zero_vector[n_units] unit_effect_rel;

    // Inter-unit variability
    real<lower=0> unit_sigma;

    // Likelihood uncertainty
    real<lower=0> sigma;
}
transformed parameters {
    vector[n_observations] mu = exp(
        intercept_plus_pop_mean
        + unit_effect_rel[unit_idx]
    );

    real intercept_sigma = 0.2;
    real intercept_plus_pop_mean_sigma = sqrt(intercept_sigma ^ 2 + unit_sigma ^ 2 / n_units);
}
model {
    // Priors
    intercept_plus_pop_mean ~ normal(0, intercept_plus_pop_mean_sigma);
    unit_sigma ~ normal(0, 1);

    unit_effect_rel ~ normal(0, unit_sigma);
    target += log(unit_sigma);

    sigma ~ normal(0, 1);

    // Gamma likelihood
    values ~ gamma(
        square(mu) ./ square(sigma),
        mu ./ square(sigma)
    );
}
generated quantities {
    real intercept;
    {
        real var1 = intercept_sigma ^ 2;
        real var2 = unit_sigma ^ 2 / n_units;
        real total_var = var1 + var2;
        intercept = normal_rng(
            intercept_plus_pop_mean * var1 / total_var,
            sqrt(var1 * var2 / total_var)
        );
    }

    real unit_sample_mean = intercept_plus_pop_mean - intercept;
    vector[n_units] unit_effect = unit_effect_rel + unit_sample_mean;
}

This samples very well, with much smaller treedepth and pretty close to ideal effective sample size. It also gives the exact same posterior as the first model.


This is something the Stan devs will need to discuss. I can see the benefit of what you’re saying although it seems harder to look at the code and know what’s going on. One benefit, and you mention this, is that we’d end up “backing-out” that sqrt(N * inv(N - 1)) to recover the population stuff, which is annoying. Initially the discussion was around how to say that each marginal is explicitly given the intended prior distribution. When a user writes z ~ sum_to_zero_normal(sigma) where z is a sum-to-zero vector it’s convenient to know that marginally you’re getting a standard deviation of sigma.