Centered parameterization throws errors in zero-inflated Poisson regression

I am trying to estimate a random-intercept zero-inflated Poisson model. The non-centered parameterization seems to work fine in the sense that it completes estimation, but it is not able to recover the true parameters. The centered parameterization, on the other hand, does not mix even with adapt_delta set to 0.99 and 10,000 iterations, and it also throws low-BFMI warnings.
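For reference, this is a minimal sketch of how one might invoke the sampler with those settings, assuming rstan (the file name and data list are placeholders, not my exact setup):

fit_centered <- rstan::stan(
  file = "zip_centered.stan",         # placeholder name for the model below
  data = stan_data,                   # list with N, K, J, x1, x2, y, L, ll
  iter = 10000,
  control = list(adapt_delta = 0.99)
)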

Here is the code I am using for the centered parameterization:

data {
  int<lower=0> N;
  int<lower=0> K; // number of predictors in zero-inflation model - homogeneous
  int<lower=0> J; // number of predictors in count model - homogeneous
  matrix[N,K] x1; // data for logit (zero-inflation) model - homogeneous
  matrix[N,J] x2; // data for count model - homogeneous
  int<lower=0> y[N]; // count outcome
  int<lower=1> L; // number of customers/groups
  int<lower=1,upper=L> ll[N]; // group index for each observation
}
transformed data {
  matrix[N, K] Q_ast1;
  matrix[K, K] R_ast1;
  matrix[K, K] R_ast_inverse1;
  matrix[N, J] Q_ast2;
  matrix[J, J] R_ast2;
  matrix[J, J] R_ast_inverse2;
  // thin and scale the QR decomposition
  Q_ast1 = qr_thin_Q(x1) * sqrt(N - 1);
  R_ast1 = qr_thin_R(x1) / sqrt(N - 1);
  R_ast_inverse1 = inverse(R_ast1);
  
  Q_ast2 = qr_thin_Q(x2) * sqrt(N - 1);
  R_ast2 = qr_thin_R(x2) / sqrt(N - 1);
  R_ast_inverse2 = inverse(R_ast2);
}
parameters {
  vector[K] zeta; // zero-inflation (logit) model coefficients - homogeneous
  vector[L] rbeta; // zero-inflation model random intercepts - heterogeneous
  real rbeta_mu; // mean of the distribution of the rbeta parameters
  real<lower=0> rbeta_sigma; // std dev of the distribution of the rbeta parameters
  vector[J] omega; // count model coefficients - homogeneous
  vector[L] rgamma; // count model random intercepts - heterogeneous
  real rgamma_mu; // mean of the distribution of the rgamma parameters
  real<lower=0> rgamma_sigma; // std dev of the distribution of the rgamma parameters
}
transformed parameters {
  vector[N] theta;
  vector[N] lambda;
  vector[N] eta1;
  vector[N] eta2;
  eta1 = Q_ast1 * zeta; // logit-scale linear predictor (homogeneous part)
  eta2 = Q_ast2 * omega; // log-scale linear predictor (homogeneous part)
  for (n in 1:N) {
    theta[n] = inv_logit(eta1[n] + rbeta[ll[n]]); // zero-inflation probability
    lambda[n] = exp(eta2[n] + rgamma[ll[n]]); // Poisson rate
  }
}
model {
  zeta ~ normal(0, 100);
  rbeta_mu ~ normal(0, 100);
  rbeta_sigma ~ cauchy(0, 10);
  omega ~ normal(0, 100);
  rgamma_mu ~ normal(0, 100);
  rgamma_sigma ~ cauchy(0, 10);
  rbeta ~ normal(rbeta_mu, rbeta_sigma);
  rgamma ~ normal(rgamma_mu, rgamma_sigma);
  for (n in 1:N) {
    if (y[n] == 0)
      target += log_sum_exp(bernoulli_lpmf(1 | theta[n]),
                            bernoulli_lpmf(0 | theta[n])
                              + poisson_lpmf(y[n] | lambda[n]));
    else
      target += bernoulli_lpmf(0 | theta[n])
                  + poisson_lpmf(y[n] | lambda[n]);
  }
}
generated quantities {
  vector[K] beta;
  vector[J] gamma;
  beta = R_ast_inverse1 * zeta; // coefficients on x1
  gamma = R_ast_inverse2 * omega; // coefficients on x2
}
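As an aside, the QR reparameterization used in transformed data and generated quantities can be sanity-checked in R; this is a minimal sketch with an arbitrary matrix standing in for x1:

N <- 2000
X <- matrix(runif(N * 2), N, 2)             # stand-in for x1
qrX <- qr(X)
Q_ast <- qr.Q(qrX) * sqrt(N - 1)            # matches qr_thin_Q(x1) * sqrt(N - 1)
R_ast <- qr.R(qrX) / sqrt(N - 1)            # matches qr_thin_R(x1) / sqrt(N - 1)
beta_true <- c(0.5, -1)
zeta <- R_ast %*% beta_true                 # coefficients on the Q scale
max(abs(Q_ast %*% zeta - X %*% beta_true))  # ~0: the linear predictor is unchanged
solve(R_ast) %*% zeta                       # recovers beta_true, as in generated quantities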

This is the code for the non-centered parameterization:

data {
  int<lower=0> N;
  int<lower=0> K; // number of predictors in zero-inflation model - homogeneous
  int<lower=0> J; // number of predictors in count model - homogeneous
  matrix[N,K] x1; // data for logit (zero-inflation) model - homogeneous
  matrix[N,J] x2; // data for count model - homogeneous
  int<lower=0> y[N]; // count outcome
  int<lower=1> L; // number of customers/groups
  int<lower=1,upper=L> ll[N]; // group index for each observation
}
transformed data {
  matrix[N, K] Q_ast1;
  matrix[K, K] R_ast1;
  matrix[K, K] R_ast_inverse1;
  matrix[N, J] Q_ast2;
  matrix[J, J] R_ast2;
  matrix[J, J] R_ast_inverse2;
  // thin and scale the QR decomposition
  Q_ast1 = qr_thin_Q(x1) * sqrt(N - 1);
  R_ast1 = qr_thin_R(x1) / sqrt(N - 1);
  R_ast_inverse1 = inverse(R_ast1);
  
  Q_ast2 = qr_thin_Q(x2) * sqrt(N - 1);
  R_ast2 = qr_thin_R(x2) / sqrt(N - 1);
  R_ast_inverse2 = inverse(R_ast2);
}
parameters {
  vector[K] zeta; // zero-inflation (logit) model coefficients - homogeneous
  vector[L] beta_raw; // standardized zero-inflation random intercepts - heterogeneous
  real rbeta_mu; // mean of the distribution of the rbeta parameters
  real<lower=0> rbeta_sigma; // std dev of the distribution of the rbeta parameters
  vector[J] omega; // count model coefficients - homogeneous
  vector[L] gamma_raw; // standardized count model random intercepts - heterogeneous
  real rgamma_mu; // mean of the distribution of the rgamma parameters
  real<lower=0> rgamma_sigma; // std dev of the distribution of the rgamma parameters
}
transformed parameters {
  vector[N] theta;
  vector[N] lambda;
  vector[L] rbeta;
  vector[L] rgamma;
  vector[N] eta1;
  vector[N] eta2;
  rbeta = rbeta_mu + beta_raw * rbeta_sigma; // non-centered zero-inflation intercepts
  rgamma = rgamma_mu + gamma_raw * rgamma_sigma; // non-centered count intercepts
  eta1 = Q_ast1 * zeta; // logit-scale linear predictor (homogeneous part)
  eta2 = Q_ast2 * omega; // log-scale linear predictor (homogeneous part)
  for (n in 1:N) {
    theta[n] = inv_logit(eta1[n] + rbeta[ll[n]]); // zero-inflation probability
    lambda[n] = exp(eta2[n] + rgamma[ll[n]]); // Poisson rate
  }
}
model {
  zeta ~ normal(0, 100);
  rbeta_mu ~ normal(0, 100);
  rbeta_sigma ~ cauchy(0, 10);
  omega ~ normal(0, 100);
  rgamma_mu ~ normal(0, 100);
  rgamma_sigma ~ cauchy(0, 10);
  beta_raw ~ normal(0, 1);
  gamma_raw ~ normal(0, 1);
  for (n in 1:N) {
    if (y[n] == 0)
      target += log_sum_exp(bernoulli_lpmf(1 | theta[n]),
                            bernoulli_lpmf(0 | theta[n])
                              + poisson_lpmf(y[n] | lambda[n]));
    else
      target += bernoulli_lpmf(0 | theta[n])
                  + poisson_lpmf(y[n] | lambda[n]);
  }
}
generated quantities {
  vector[K] beta;
  vector[J] gamma;
  beta = R_ast_inverse1 * zeta; // coefficients on x1
  gamma = R_ast_inverse2 * omega; // coefficients on x2
}

Any suggestions as to what is going wrong? All help is appreciated.

If the non-centered parameterization is consistently fitting without diagnostic warnings, then there is no reason to switch to the centered parameterization. It’s not surprising that the centered parameterization yields non-convergence. So I think the key question is why you are seeing poor parameter recovery from the non-centered version. There are five options:

  1. There’s a bug in your Stan code
  2. There’s a bug in your data generation code
  3. You have hit on a very unusual model where fitting problems consistently occur without getting flagged by Stan’s diagnostics (I think this is the least likely option of the five).
  4. The posterior is highly uncertain or dominated by the prior. Stan is sampling from the “correct” posterior, but this posterior doesn’t tightly concentrate near the true parameter values (your super-weak priors on logit-scale parameters might encourage this behavior!).
  5. You actually are getting good parameter recovery, but something is preventing you from noticing it. For example, maybe you’ve tested on too few replicate synthetic datasets to have a robust sense of how good or bad your parameter recovery is (see the sketch after this list).
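Here is a hypothetical sketch of what checking recovery across many replicates might look like; simulate_data() is a placeholder for your data-generating script wrapped in a function, and the file name is made up:

n_reps <- 20
covered <- logical(n_reps)
for (r in 1:n_reps) {
  sim <- simulate_data()  # hypothetical: returns stan_data plus the true rbeta_mu
  fit <- rstan::stan("zip_noncentered.stan", data = sim$stan_data, refresh = 0)
  ci <- summary(fit, pars = "rbeta_mu")$summary[, c("2.5%", "97.5%")]
  covered[r] <- ci[1] < sim$true_rbeta_mu && sim$true_rbeta_mu < ci[2]
}
mean(covered)  # should be near 0.95 if the 95% intervals are well calibrated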

Could you share the code you’re using to generate data under known parameter values? This could help us spot whether/where the non-centered model differs from the data generation.

Thank you for your response.

I agree that the weak priors on the logit-scale parameters might be the issue, as most of the discrepancies in the estimates involve the logit-scale parameters.

Here is the code I used to generate the data:

#generating data

# sample size
n <- 2000
# variable x1
x1 <- runif(n)
# variable x2
x2 <- runif(n)
# variable x3
x3 <- runif(n)
# variable x4
x4 <- runif(n)

g <- 500
groups <- rep(1:g,each=4)

#beta
beta <- rnorm(2,0,5)

#gamma
gamma <- rnorm(2,0,5)

x <- as.matrix(data.frame("x1"=x1,"x2"=x2))
z <- as.matrix(data.frame("z1"=x3,"z2"=x4))

rbeta_mu <- rnorm(1,0,5)
rbeta_sigma <- runif(1,0,1)
rgamma_mu <- rnorm(1,0,5)
rgamma_sigma <- runif(1,0,1)
rbeta <- data.frame(matrix(ncol = 2,nrow = length(unique(groups))))
colnames(rbeta) <- c("group","rbeta")

rgamma <- data.frame(matrix(ncol = 2,nrow = length(unique(groups))))
colnames(rgamma) <- c("group","rgamma")


for(i in 1:length(unique(groups))){
  rbeta$group[i] <- i
  rbeta$rbeta[i] <- rnorm(1,rbeta_mu,rbeta_sigma)
  
  rgamma$group[i] <- i
  rgamma$rgamma[i] <- rnorm(1,rgamma_mu,rgamma_sigma)
}

xb_beta <- x %*% beta
xb_gamma <- z %*% gamma


# add the group-level intercepts to each observation's linear predictors
for(i in 1:n){
  alpha1 <- rbeta$rbeta[which(rbeta$group==groups[i])]
  xb_beta[i] <- xb_beta[i]+alpha1
  
  alpha2 <- rgamma$rgamma[which(rgamma$group==groups[i])]
  xb_gamma[i] <- xb_gamma[i]+alpha2
}

lambda <- exp(xb_gamma)
theta <- plogis(xb_beta)

y <- rpois(n, lambda = lambda) * rbinom(n, prob = 1 - theta, size = 1) # zero whenever the Bernoulli draw is 0

I understand this is counterintuitive, but these priors are not weak; they are extremely strong. Just try running the following code in R:

mu <- rnorm(1000, 0, 100)           # hyper-means drawn from the normal(0, 100) prior
sigma <- abs(rcauchy(1000, 0, 10))  # group std devs from the half-Cauchy(0, 10) prior
theta <- rnorm(1000, mu, sigma)     # implied logit-scale intercepts
hist(plogis(theta))                 # implied prior on the probability scale

You’ll see that these priors imply that the model (+ priors) is really sure that the probability will be lower than .1 or higher than .9. If the actual data imply that the probability is more towards the middle and do not overwhelm the prior, it can be hard for Stan to compute the posterior. More importantly, given that you think of the priors as weak, I don’t think you want these priors, i.e. they don’t conform to your knowledge of the problem you are dealing with.

In my personal opinion, once parameters enter the model through a logit or exp transformation, Cauchy priors and normal priors with a scale of more than 10 are probably not what most people want. So it’s the first thing that jumps out at me when I see a model like yours.
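For comparison, here is the same exercise with tighter priors (the scales 1.5 and 1 are purely illustrative choices); the implied probabilities spread across (0, 1) instead of piling up at the extremes:

mu <- rnorm(1000, 0, 1.5)        # tighter prior on the hyper-mean
sigma <- abs(rnorm(1000, 0, 1))  # half-normal(0, 1) instead of half-Cauchy(0, 10)
theta <- rnorm(1000, mu, sigma)
hist(plogis(theta))              # mass spread over (0, 1), not stacked at 0 and 1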

I understand, but I have also tried tighter priors, normal(0, 5) and normal(0, 1), with no luck. What surprises me more is that the same model minus the random intercepts, that is, the fixed-effects model, works absolutely fine and recovers the true parameters.

One thing that I notice from your data generation script is that it has a non-trivial probability of generating data where every observation is in the zero-inflation state (and likewise a non-trivial probability of generating data where every observation is in the non-zero-inflation state). For example, here’s your script modified only to set the seed to 1 and to replace the last line with a count of how many observations escape the zero-inflation state (i.e., how many of the Bernoulli draws equal 1). On my computer, anyway, this returns 0, meaning every observation is a structural zero. Other seeds give other behavior; for example, a seed of 2 gives 625. There is potential for pathological behavior here.

set.seed(1)

#generating data

# sample size
n <- 2000
# variable x1
x1 <- runif(n)
# variable x2
x2 <- runif(n)
# variable x3
x3 <- runif(n)
# variable x4
x4 <- runif(n)

g <- 500
groups <- rep(1:g,each=4)

#beta
beta <- rnorm(2,0,5)

#gamma
gamma <- rnorm(2,0,5)

x <- as.matrix(data.frame("x1"=x1,"x2"=x2))
z <- as.matrix(data.frame("z1"=x3,"z2"=x4))

rbeta_mu <- rnorm(1,0,5)
rbeta_sigma <- runif(1,0,1)
rgamma_mu <- rnorm(1,0,5)
rgamma_sigma <- runif(1,0,1)
rbeta <- data.frame(matrix(ncol = 2,nrow = length(unique(groups))))
colnames(rbeta) <- c("group","rbeta")

rgamma <- data.frame(matrix(ncol = 2,nrow = length(unique(groups))))
colnames(rgamma) <- c("group","rgamma")


for(i in 1:length(unique(groups))){
  rbeta$group[i] <- i
  rbeta$rbeta[i] <- rnorm(1,rbeta_mu,rbeta_sigma)
  
  rgamma$group[i] <- i
  rgamma$rgamma[i] <- rnorm(1,rgamma_mu,rgamma_sigma)
}

xb_beta <- x %*% beta
xb_gamma <- z %*% gamma


# add the group-level intercepts to each observation's linear predictors
for(i in 1:n){
  alpha1 <- rbeta$rbeta[which(rbeta$group==groups[i])]
  xb_beta[i] <- xb_beta[i]+alpha1
  
  alpha2 <- rgamma$rgamma[which(rgamma$group==groups[i])]
  xb_gamma[i] <- xb_gamma[i]+alpha2
}

lambda <- exp(xb_gamma)
theta <- plogis(xb_beta)

zi <- rbinom(n, prob = 1 - theta, size = 1) # 1 = Poisson state, 0 = structural zero
sum(zi) # number of observations NOT in the zero-inflation state

Thanks, @jsocolar, I understand.