While using mean-field ADVI to estimate a penalised regression model with a horseshoe prior, I obtain posteriors for the regression coefficients that are skewed.
Below is the code to generate some data and fit the same model with two algorithms, HMC and mean-field VI:
library(brms)

# generate some data
Ntrain = 100
P = 45
rho = 0.1
# generate covariance matrix
V <- rho + (1-rho) * diag(P)
# we generate P parameters of which 1/3 are negative, 1/3 zero, and 1/3 positive
b = c(rep(-10,P/3),
rep(0,P/3),
rep(10,P/3))
# generate random noise, add the covariance structure
Xtrain <- matrix(rnorm(Ntrain * (P)), nrow = Ntrain) %*% chol(V)
# generate the outcome
Ytrain <- Xtrain %*% b + rnorm(Ntrain, sd = 2)
df_train = data.frame(X = Xtrain, Y = Ytrain)
# set HS prior
priors_self <- set_prior(horseshoe(df = 1, par_ratio = 2/3))
m1 = brm(Y ~ ., data = df_train, algorithm = "sampling", prior = priors_self, backend = "cmdstanr", cores = 4,
iter = 4000)
m2 = brm(Y ~ ., data = df_train, algorithm = "meanfield", prior = priors_self, backend = "cmdstanr",
draws = 40000)
If we now plot the posterior draws for three of the regression coefficients, we see the following (note that the x-axes are not the same):
HMC yields posteriors that look fairly normal for all parameters, while mean-field VI produces very long tails towards zero for the negative and positive parameters.
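The plots can be reproduced with something along these lines (a minimal sketch using the posterior and bayesplot packages; the names b_X.1, b_X.16, b_X.31 are assumptions based on how data.frame(X = Xtrain, Y = Ytrain) names the columns, picking one negative, one zero, and one positive true coefficient):
library(posterior)
library(bayesplot)
# pick one negative, one zero, and one positive true coefficient (names are assumed)
pars <- c("b_X.1", "b_X.16", "b_X.31")
mcmc_hist(as_draws_matrix(m1), pars = pars)  # HMC posteriors
mcmc_hist(as_draws_matrix(m2), pars = pars)  # mean-field VI posteriors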
I read this post discussing that ADVI fits a normal approximation in the unconstrained space, and that the back-transformation can lead to non-normality. A regression coefficient is unconstrained, so I would assume this is not the issue here. Does anyone know why mean-field VI with the horseshoe prior leads to these skewed posteriors?
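One thing I am not sure is relevant: in the generated Stan code below, b is not a sampled parameter but is computed in transformed parameters as b = zb .* sdb, so the mean-field normals sit on zb and on the log-transformed scale parameters rather than on b itself. A toy product of such factors (purely illustrative, with made-up means and SDs) already gives a clearly skewed, non-normal marginal even though each factor is (log-)normal:
# toy check, not the brms model itself: a normal factor for the unconstrained zb
# and a normal factor on log(sdb), i.e. a lognormal for the positive scale
set.seed(1)
zb  <- rnorm(4e4, mean = 2, sd = 0.3)
sdb <- exp(rnorm(4e4, mean = 0, sd = 0.8))
hist(zb * sdb, breaks = 100, main = "product of normal and lognormal draws")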
Below is the Stan code in case it helps:
// generated with brms 2.22.0
functions {
/* Efficient computation of the horseshoe scale parameters
* see Appendix C.1 in https://projecteuclid.org/euclid.ejs/1513306866
* Args:
* lambda: local shrinkage parameters
* tau: global shrinkage parameter
* c2: slab regularization parameter
* Returns:
* scale parameter vector of the horseshoe prior
*/
vector scales_horseshoe(vector lambda, real tau, real c2) {
int K = rows(lambda);
vector[K] lambda2 = square(lambda);
vector[K] lambda_tilde = sqrt(c2 * lambda2 ./ (c2 + tau^2 * lambda2));
return lambda_tilde * tau;
}
/* integer sequence of values
* Args:
* start: starting integer
* end: ending integer
* Returns:
* an integer sequence from start to end
*/
array[] int sequence(int start, int end) {
array[end - start + 1] int seq;
for (n in 1:num_elements(seq)) {
seq[n] = n + start - 1;
}
return seq;
}
// compute partial sums of the log-likelihood
real partial_log_lik_lpmf(array[] int seq, int start, int end, data vector Y, data matrix Xc, vector b, real Intercept, real sigma) {
real ptarget = 0;
int N = end - start + 1;
ptarget += normal_id_glm_lpdf(Y[start:end] | Xc[start:end], Intercept, b, sigma);
return ptarget;
}
}
data {
int<lower=1> N; // total number of observations
vector[N] Y; // response variable
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
int<lower=1> Kc; // number of population-level effects after centering
int<lower=1> Kscales; // number of local scale parameters
// data for the horseshoe prior
real<lower=0> hs_df; // local degrees of freedom
real<lower=0> hs_df_global; // global degrees of freedom
real<lower=0> hs_df_slab; // slab degrees of freedom
real<lower=0> hs_scale_global; // global prior scale
real<lower=0> hs_scale_slab; // slab prior scale
int grainsize; // grainsize for threading
int prior_only; // should the likelihood be ignored?
}
transformed data {
matrix[N, Kc] Xc; // centered version of X without an intercept
vector[Kc] means_X; // column means of X before centering
array[N] int seq = sequence(1, N);
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}
parameters {
vector[Kc] zb; // unscaled coefficients
real Intercept; // temporary intercept for centered predictors
// horseshoe shrinkage parameters
real<lower=0> hs_global; // global shrinkage parameter
real<lower=0> hs_slab; // slab regularization parameter
vector<lower=0>[Kscales] hs_local; // local parameters for the horseshoe prior
real<lower=0> sigma; // dispersion parameter
}
transformed parameters {
vector[Kc] b; // scaled coefficients
vector<lower=0>[Kc] sdb; // SDs of the coefficients
vector<lower=0>[Kscales] scales; // local horseshoe scale parameters
real lprior = 0; // prior contributions to the log posterior
// compute horseshoe scale parameters
scales = scales_horseshoe(hs_local, hs_global, hs_scale_slab^2 * hs_slab);
sdb = scales[(1):(Kc)];
b = zb .* sdb; // scale coefficients
lprior += student_t_lpdf(Intercept | 3, -13.6, 63.9);
lprior += student_t_lpdf(hs_global | hs_df_global, 0, hs_scale_global * sigma)
- 1 * log(0.5);
lprior += inv_gamma_lpdf(hs_slab | 0.5 * hs_df_slab, 0.5 * hs_df_slab);
lprior += student_t_lpdf(sigma | 3, 0, 63.9)
- 1 * student_t_lccdf(0 | 3, 0, 63.9);
}
model {
// likelihood including constants
if (!prior_only) {
target += reduce_sum(partial_log_lik_lpmf, seq, grainsize, Y, Xc, b, Intercept, sigma);
}
// priors including constants
target += lprior;
target += std_normal_lpdf(zb);
target += student_t_lpdf(hs_local | hs_df, 0, 1)
- rows(hs_local) * log(0.5);
}
generated quantities {
// actual population-level intercept
real b_Intercept = Intercept - dot_product(means_X, b);
}