Hi everyone,
I try to run non-centered parameterization of a random effects model without Cholesky factorization.
The random effects are well identified. Nevertheless, the estimates for the random effects’ standard deviation and correlation matrix differ. For example consider the following DGP:
set.seed(123)
options(max.print = 10000)
library(MASS)
library(tidyverse)
library(rstan)
S <- 30 #number of subjects
T <- 10 #number of time observations
N <- S * T
K <- 3 # number of predictors incl. intercept
id <- rep(1:S, each=T)
X <- data.frame(rep(1, N), matrix(rnorm(N*(K-1)), nrow = N, ncol=K-1)) %>% data.matrix()
e <- rnorm(N,0,0.1)
beta_bar = rep(2,K)
beta_sigma = 1:K
beta_omega = 0.5+0.5*diag(K)
beta_vcov <- diag(beta_sigma) %*% beta_omega %*% diag(beta_sigma)
beta_i = mvrnorm(S,beta_bar,beta_vcov)
Y <- numeric()
for(i in 1:S){
Y[which(id==i)] <- X[which(id==i),] %*% beta_i[i,] + e[which(id==i)]
}
stan_data = list(N = NROW(Y),
S = S,
T = T,
id = id,
K = K,
Y = Y,
X = X)
c(beta_i %>% data.frame() %>% summarise_all(list(mean=mean, sd=sd)))
$X1_mean
[1] 1.75432
$X2_mean
[1] 1.890897
$X3_mean
[1] 0.9262343
$X1_sd
[1] 1.086134
$X2_sd
[1] 1.811626
$X3_sd
[1] 3.303104
cor(beta_i)
[,1] [,2] [,3]
[1,] 1.0000000 0.3323812 0.6027953
[2,] 0.3323812 1.0000000 0.5710059
[3,] 0.6027953 0.5710059 1.0000000
I can now run the model without non-centered parameterization as follows:
code_multinorm="data {
int<lower=0> N;
int<lower=0> S;
int<lower=0> T;
int<lower=0> K;
int<lower=0> id[N];
matrix[N,K] X; //predictor
vector[N] Y; //outcome
}
parameters {
vector [K] beta_bar;
vector <lower=0> [K] beta_sigma;
corr_matrix[K] beta_Omega;
matrix[S,K] beta_i;
real<lower=0> sigma;
}
model {
Y ~ normal(rows_dot_product(X,beta_i[id]), sigma);
beta_bar ~ normal(0,5);
beta_sigma ~ cauchy(0,2.5);
beta_Omega ~ lkj_corr(1);
for(i in 1:S)
beta_i[i,] ~ multi_normal(beta_bar, quad_form_diag(beta_Omega,beta_sigma));
sigma ~ cauchy(0,2.5);
}
"
fit_regression_multinorm = stan(model_code = code_multinorm,
data = stan_data,
chains = 4,
cores = 4,
iter = 1000,
warmup = 500)
I get the following results:
Inference for Stan model: e1e5ba2cc1336320a9ec6e9459618ead.
4 chains, each with iter=1000; warmup=500; thin=1;
post-warmup draws per chain=500, total post-warmup draws=2000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
beta_bar[1] 1.76 0.00 0.20 1.37 1.62 1.76 1.89 2.16 2259 1
beta_bar[2] 1.86 0.01 0.34 1.20 1.64 1.85 2.08 2.52 2673 1
beta_bar[3] 0.90 0.01 0.59 -0.25 0.52 0.91 1.30 2.04 2113 1
beta_sigma[1] 1.14 0.00 0.15 0.90 1.03 1.13 1.23 1.47 2353 1
beta_sigma[2] 1.88 0.00 0.25 1.47 1.71 1.85 2.03 2.46 2697 1
beta_sigma[3] 3.37 0.01 0.44 2.64 3.06 3.32 3.62 4.36 2441 1
beta_Omega[1,1] 1.00 NaN 0.00 1.00 1.00 1.00 1.00 1.00 NaN NaN
beta_Omega[1,2] 0.26 0.00 0.17 -0.08 0.15 0.27 0.38 0.55 2250 1
beta_Omega[1,3] 0.54 0.00 0.13 0.24 0.46 0.55 0.62 0.75 2123 1
beta_Omega[2,1] 0.26 0.00 0.17 -0.08 0.15 0.27 0.38 0.55 2250 1
beta_Omega[2,2] 1.00 0.00 0.00 1.00 1.00 1.00 1.00 1.00 1958 1
beta_Omega[2,3] 0.50 0.00 0.14 0.19 0.42 0.52 0.60 0.72 2264 1
beta_Omega[3,1] 0.54 0.00 0.13 0.24 0.46 0.55 0.62 0.75 2123 1
beta_Omega[3,2] 0.50 0.00 0.14 0.19 0.42 0.52 0.60 0.72 2264 1
beta_Omega[3,3] 1.00 0.00 0.00 1.00 1.00 1.00 1.00 1.00 1019 1
sigma 0.11 0.00 0.01 0.10 0.10 0.11 0.11 0.12 1719 1
beta_i[1,1] 3.13 0.00 0.03 3.07 3.11 3.13 3.16 3.20 4112 1
beta_i[1,2] 3.02 0.00 0.04 2.95 3.00 3.02 3.05 3.10 4429 1
beta_i[1,3] 5.02 0.00 0.03 4.95 4.99 5.02 5.04 5.08 3570 1
beta_i[2,1] 4.53 0.00 0.03 4.46 4.51 4.53 4.55 4.59 5678 1
...
beta_i[29,3] -6.22 0.00 0.05 -6.32 -6.25 -6.22 -6.18 -6.11 3431 1
beta_i[30,1] 2.85 0.00 0.04 2.77 2.82 2.85 2.88 2.94 3110 1
beta_i[30,2] -0.25 0.00 0.04 -0.32 -0.27 -0.25 -0.23 -0.18 2768 1
beta_i[30,3] 1.33 0.00 0.03 1.28 1.31 1.33 1.35 1.38 3470 1
Samples were drawn using NUTS(diag_e) at Sun Jan 30 23:20:04 2022.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
Now I run the same model with non-centered parameterization:
code_multinorm_nc="data {
int<lower=0> N;
int<lower=0> S;
int<lower=0> T;
int<lower=0> K;
int<lower=0> id[N];
matrix[N,K] X; //predictor
vector[N] Y; //outcome
}
parameters {
vector [K] beta_bar;
vector <lower=0> [K] beta_sigma;
corr_matrix[K] beta_Omega;
matrix[K,S] beta_i_z;
real<lower=0> sigma;
}
transformed parameters {
matrix[S,K] beta_i;
for (i in 1:S)
beta_i[i,] = beta_bar' + (quad_form_diag(beta_Omega, beta_sigma) * beta_i_z[,i])';
}
model {
Y ~ normal(rows_dot_product(X,beta_i[id]), sigma);
beta_bar ~ normal(0,5);
beta_sigma ~ cauchy(0,2.5);
beta_Omega ~ lkj_corr(1);
to_vector(beta_i_z) ~ normal(0,1);
sigma ~ cauchy(0,2.5);
}
"
fit_regression_multinorm_nc = stan(model_code = code_multinorm_nc,
data = stan_data,
chains = 4,
cores = 4,
iter = 1000,
warmup = 500)
And these are the results:
Inference for Stan model: 025a15f69ba019c1fb1854955fbfe0b6.
4 chains, each with iter=1000; warmup=500; thin=1;
post-warmup draws per chain=500, total post-warmup draws=2000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
beta_bar[1] 1.76 0.01 0.22 1.34 1.61 1.76 1.90 2.18 610 1.00
beta_bar[2] 1.89 0.01 0.34 1.24 1.66 1.89 2.11 2.56 587 1.00
beta_bar[3] 0.95 0.03 0.64 -0.28 0.49 0.96 1.40 2.17 472 1.00
beta_sigma[1] 1.01 0.00 0.07 0.89 0.96 1.00 1.05 1.16 700 1.01
beta_sigma[2] 1.33 0.00 0.09 1.18 1.27 1.32 1.38 1.52 718 1.00
beta_sigma[3] 1.83 0.00 0.12 1.62 1.75 1.82 1.91 2.09 844 1.00
beta_Omega[1,1] 1.00 NaN 0.00 1.00 1.00 1.00 1.00 1.00 NaN NaN
beta_Omega[1,2] 0.08 0.00 0.09 -0.09 0.03 0.09 0.14 0.25 1042 1.00
beta_Omega[1,3] 0.28 0.00 0.08 0.14 0.23 0.28 0.33 0.42 998 1.00
beta_Omega[2,1] 0.08 0.00 0.09 -0.09 0.03 0.09 0.14 0.25 1042 1.00
beta_Omega[2,2] 1.00 0.00 0.00 1.00 1.00 1.00 1.00 1.00 2082 1.00
beta_Omega[2,3] 0.29 0.00 0.08 0.12 0.24 0.29 0.34 0.45 863 1.00
beta_Omega[3,1] 0.28 0.00 0.08 0.14 0.23 0.28 0.33 0.42 998 1.00
beta_Omega[3,2] 0.29 0.00 0.08 0.12 0.24 0.29 0.34 0.45 863 1.00
beta_Omega[3,3] 1.00 0.00 0.00 1.00 1.00 1.00 1.00 1.00 1955 1.00
sigma 0.11 0.00 0.01 0.10 0.10 0.11 0.11 0.12 1030 1.00
beta_i[1,1] 3.13 0.00 0.03 3.07 3.11 3.14 3.16 3.20 2190 1.00
beta_i[1,2] 3.02 0.00 0.04 2.95 3.00 3.03 3.05 3.10 1958 1.00
beta_i[1,3] 5.02 0.00 0.03 4.96 5.00 5.02 5.04 5.08 1992 1.00
beta_i[2,1] 4.53 0.00 0.03 4.46 4.51 4.53 4.55 4.60 1765 1.00
...
beta_i[29,3] -6.22 0.00 0.05 -6.32 -6.25 -6.22 -6.18 -6.11 1844 1.00
beta_i[30,1] 2.86 0.00 0.04 2.78 2.83 2.86 2.88 2.93 2185 1.00
beta_i[30,2] -0.25 0.00 0.03 -0.32 -0.27 -0.25 -0.23 -0.19 2149 1.00
beta_i[30,3] 1.33 0.00 0.03 1.28 1.31 1.33 1.35 1.38 1989 1.00
Samples were drawn using NUTS(diag_e) at Sun Jan 30 23:24:45 2022.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
We see that the results for beta_bar and beta_i are identical. But why do the results for beta_sigma and beta_Omega differ? I played around with the estimates a little bit and I found that the results for beta_sigma and beta_Omega from the model without non-centered parameterization reflect something like the square root of the result from the model with non-centered parameterization.
Is there anything wrong with my coding of the non-centered parameterization?
Thanks!