# User-friendly implementation of regularised horseshoe

Hello @avehtari and all,

even though I have used the regularised horseshoe before, I am getting a bit lost in my code. I was wondering if there is a user-friendly implementation, not much more complex than

beta ~ double_exponential(0, 1);

The goal is to control better

• the percentage of “outliers”
• the scale for outliers

Thanks

1 Like

Hi!

I like Paul’s function in brms! You can get it when specifying a formula with horseshoe prior and displaying the code with make_stancode.

Cheers,
Lucas

Thanks Lucas, I don’t use brms much — would you mind, when you have some time, pasting a minimal example?

No problem, I’ll do that when I open my computer
:)

1 Like

If I define the following data and model:

## Minimal example: simulate data and display the Stan code that brms
## generates for a regularised-horseshoe prior.
## Requires the (third-party) brms and rethinking packages.
library(brms)
library(rethinking)

## Sample the predictors: an intercept column plus 100 correlated predictors
## (correlation matrix drawn from an LKJ(1) prior)
X <- cbind(1, rmvnorm2(100, Mu = rep(0, 100), sigma = rep(1, 100), Rho = rlkjcorr(1, 100, 1)))
## Sample the parameters (one coefficient per column of X)
beta <- rnorm(101)

## Compute the linear predictor
mu <- X %*% beta
## Sample the observed values
y <- rnorm(100, mu, sd = 0.5)

## Store everything in a data.frame
Dat <- data.frame(y = y)
Dat$X <- X  # fixed: "Dat\$X" was invalid R syntax (markdown escape artifact)

## Define the formula
formul <- bf(y ~ X)
## Define the priors: horseshoe with df = 1 and an expected ratio of
## non-zero to zero coefficients of 0.5
priors <- prior(horseshoe(df = 1, par_ratio = 0.5), class = b)

## Display the Stan code
make_stancode(formul, data = Dat, prior = priors)

I obtain the following Stan code, with the global scale of the prior being multiplied by the residual standard deviation (sigma).

// generated with brms 2.8.8
// Regularised-horseshoe model from make_stancode() with
// horseshoe(df = 1, par_ratio = 0.5) and autoscale = TRUE (the default):
// the global scale is multiplied by sigma in transformed parameters.
functions {

/* Efficient computation of the horseshoe prior
* Args:
*   zb: standardized population-level coefficients
*   local: local horseshoe parameters
*   global: global horseshoe parameters
*   scale_global: global scale of the horseshoe prior
*   c2: positive real number for regularization
* Returns:
*   population-level coefficients following the horseshoe prior
*/
vector horseshoe(vector zb, vector[] local, real[] global,
real scale_global, real c2) {
int K = rows(zb);
// per-coefficient scales lambda_k: half-normal times sqrt(inv-gamma) component
vector[K] lambda = local[1] .* sqrt(local[2]);
vector[K] lambda2 = square(lambda);
// global shrinkage tau, same half-normal x sqrt(inv-gamma) construction
real tau = global[1] * sqrt(global[2]) * scale_global;
// "regularised" local scales: large lambdas are softly capped by the slab c2
vector[K] lambda_tilde = sqrt(c2 * lambda2 ./ (c2 + tau^2 * lambda2));
// non-centered parameterization: b = zb .* lambda_tilde * tau
return zb .* lambda_tilde * tau;
}
}
data {
int<lower=1> N;  // number of observations
vector[N] Y;  // response variable
int<lower=1> K;  // number of population-level effects
matrix[N, K] X;  // population-level design matrix
// horseshoe hyperparameters are supplied as data by brms
real<lower=0> hs_df;
real<lower=0> hs_df_global;
real<lower=0> hs_df_slab;
real<lower=0> hs_scale_global;
real<lower=0> hs_scale_slab;
int prior_only;  // should the likelihood be ignored?
}
transformed data {
int Kc = K - 1;
matrix[N, Kc] Xc;  // centered version of X
vector[Kc] means_X;  // column means of X before centering
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}
parameters {
// local parameters for horseshoe prior
vector[Kc] zb;
vector<lower=0>[Kc] hs_local[2];
real temp_Intercept;  // temporary intercept
// horseshoe shrinkage parameters
real<lower=0> hs_global[2];
real<lower=0> hs_c2;
real<lower=0> sigma;  // residual SD
}
transformed parameters {
vector[Kc] b;  // population-level effects
// autoscale = TRUE: the global scale hs_scale_global is multiplied by sigma
b = horseshoe(zb, hs_local, hs_global, hs_scale_global * sigma, hs_scale_slab^2 * hs_c2);
}
model {
vector[N] mu = temp_Intercept + Xc * b;
// priors including all constants
target += normal_lpdf(zb | 0, 1);
// the subtracted constant is the half-normal truncation correction,
// one log(0.5) per local scale (101 presumably equals Kc for this
// particular dataset -- brms bakes the number in)
target += normal_lpdf(hs_local[1] | 0, 1)
- 101 * log(0.5);
target += inv_gamma_lpdf(hs_local[2] | 0.5 * hs_df, 0.5 * hs_df);
target += student_t_lpdf(temp_Intercept | 3, 3, 10);
// same truncation correction for the single global half-normal
target += normal_lpdf(hs_global[1] | 0, 1)
- 1 * log(0.5);
target += inv_gamma_lpdf(hs_global[2] | 0.5 * hs_df_global, 0.5 * hs_df_global);
target += inv_gamma_lpdf(hs_c2 | 0.5 * hs_df_slab, 0.5 * hs_df_slab);
target += student_t_lpdf(sigma | 3, 0, 10)
- 1 * student_t_lccdf(0 | 3, 0, 10);
// likelihood including all constants
if (!prior_only) {
target += normal_lpdf(Y | mu, sigma);
}
}
generated quantities {
// actual population-level intercept
real b_Intercept = temp_Intercept - dot_product(means_X, b);
}

If I remove the autoscaling

## Define the priors: same horseshoe, but with autoscale = FALSE so the
## global scale is NOT multiplied by the residual SD sigma
## (use TRUE/FALSE rather than T/F, which are reassignable)
priors <- prior(horseshoe(df = 1, par_ratio = 0.5, autoscale = FALSE), class = b)

## Display the Stan code
make_stancode(formul, data = Dat, prior = priors)

I obtain this code

// generated with brms 2.8.8
// Same model as the autoscaled version, generated with autoscale = FALSE:
// the ONLY functional difference is in transformed parameters, where
// hs_scale_global is used directly (not multiplied by sigma).
functions {

/* Efficient computation of the horseshoe prior
* Args:
*   zb: standardized population-level coefficients
*   local: local horseshoe parameters
*   global: global horseshoe parameters
*   scale_global: global scale of the horseshoe prior
*   c2: positive real number for regularization
* Returns:
*   population-level coefficients following the horseshoe prior
*/
vector horseshoe(vector zb, vector[] local, real[] global,
real scale_global, real c2) {
int K = rows(zb);
// per-coefficient scales lambda_k
vector[K] lambda = local[1] .* sqrt(local[2]);
vector[K] lambda2 = square(lambda);
// global shrinkage tau
real tau = global[1] * sqrt(global[2]) * scale_global;
// "regularised" local scales, softly capped by the slab c2
vector[K] lambda_tilde = sqrt(c2 * lambda2 ./ (c2 + tau^2 * lambda2));
// non-centered parameterization
return zb .* lambda_tilde * tau;
}
}
data {
int<lower=1> N;  // number of observations
vector[N] Y;  // response variable
int<lower=1> K;  // number of population-level effects
matrix[N, K] X;  // population-level design matrix
// horseshoe hyperparameters are supplied as data by brms
real<lower=0> hs_df;
real<lower=0> hs_df_global;
real<lower=0> hs_df_slab;
real<lower=0> hs_scale_global;
real<lower=0> hs_scale_slab;
int prior_only;  // should the likelihood be ignored?
}
transformed data {
int Kc = K - 1;
matrix[N, Kc] Xc;  // centered version of X
vector[Kc] means_X;  // column means of X before centering
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}
parameters {
// local parameters for horseshoe prior
vector[Kc] zb;
vector<lower=0>[Kc] hs_local[2];
real temp_Intercept;  // temporary intercept
// horseshoe shrinkage parameters
real<lower=0> hs_global[2];
real<lower=0> hs_c2;
real<lower=0> sigma;  // residual SD
}
transformed parameters {
vector[Kc] b;  // population-level effects
// autoscale = FALSE: hs_scale_global enters without the sigma factor
b = horseshoe(zb, hs_local, hs_global, hs_scale_global, hs_scale_slab^2 * hs_c2);
}
model {
vector[N] mu = temp_Intercept + Xc * b;
// priors including all constants
target += normal_lpdf(zb | 0, 1);
// half-normal truncation correction, one log(0.5) per local scale
// (101 presumably equals Kc for this particular dataset)
target += normal_lpdf(hs_local[1] | 0, 1)
- 101 * log(0.5);
target += inv_gamma_lpdf(hs_local[2] | 0.5 * hs_df, 0.5 * hs_df);
target += student_t_lpdf(temp_Intercept | 3, 3, 10);
target += normal_lpdf(hs_global[1] | 0, 1)
- 1 * log(0.5);
target += inv_gamma_lpdf(hs_global[2] | 0.5 * hs_df_global, 0.5 * hs_df_global);
target += inv_gamma_lpdf(hs_c2 | 0.5 * hs_df_slab, 0.5 * hs_df_slab);
target += student_t_lpdf(sigma | 3, 0, 10)
- 1 * student_t_lccdf(0 | 3, 0, 10);
// likelihood including all constants
if (!prior_only) {
target += normal_lpdf(Y | mu, sigma);
}
}
generated quantities {
// actual population-level intercept
real b_Intercept = temp_Intercept - dot_product(means_X, b);
}

Cheers,
Lucas

Thanks,

I wish there was a one line implementation of this ;)

The one-line implementation tends to yield divergent transitions.

1 Like

brms implementation follows what I recommend, too. It has the additional benefit of using non-centered parameterization, which reduces the probability of getting funnel shaped posterior which is more difficult to sample.

1 Like

Just to confirm,

the above implementation of reg_horseshoe does (or does not) have arguments as parameters?

It is analogous of (?)

~ double exponential(0,1)

or

~ double exponential(alpha, beta);
alpha ~ ..
beta ~ ..

I am looking for the case (1) where I impose the arguments and not trying to infer them.

For example, can I set the following parameters?, based on:

• expected fraction of “outliers”
• expected scale of outliers
// local parameters for horseshoe prior
vector<lower=0>[Kc] hs_local[2];

// horseshoe shrinkage parameters
real<lower=0> hs_global[2];
real<lower=0> hs_c2;

I have compressed the code, moved it into functions, and kept just the three key tuning arguments (in my opinion, but I could be wrong), reducing the number of code lines in the main body of the model by more than half.

functions{

/* Build the regularised-horseshoe coefficients (non-centered parameterization).
 * Args:
 *   zb: standardized coefficients
 *   local: local (per-coefficient) scale components [half-normal, inv-gamma]
 *   global: global scale components [half-normal, inv-gamma]
 *   scale_global: scale of the global shrinkage (e.g. par_ratio / sqrt(N))
 *   c2: slab term (pass hs_scale_slab^2 * hs_c2), regularises large coefficients
 * Returns: coefficients following the regularised horseshoe prior
 */
vector horseshoe_get_tp(vector zb, vector[] local, real[] global, real scale_global, real c2) {
int K = rows(zb);
vector[K] lambda = local[1] .* sqrt(local[2]);
vector[K] lambda2 = square(lambda);
real tau = global[1] * sqrt(global[2]) * scale_global;
vector[K] lambda_tilde = sqrt(c2 * lambda2 ./ (c2 + tau^2 * lambda2));
return zb .* lambda_tilde * tau;
}

/* Joint log-prior of all horseshoe components; add the result to target.
 * Typical hyperparameter values:
 *   df == 1        // if divergences occur, increase this
 *   df_global == 1
 *   df_slab == 4   // df of the slab ("outlier") component
 */
real horseshoe_get_lp(vector zb, vector[] local, real df, real[] global, real df_global, real c2, real df_slab){

vector[6] lp;

lp[1] = normal_lpdf(zb | 0, 1);
// half-normal truncation correction: one log(0.5) PER local scale.
// Bug fix: the original hard-coded 101, copied from a model with 101
// coefficients; this model has num_elements(zb) of them.
lp[2] = normal_lpdf(local[1] | 0, 1) - num_elements(zb) * log(0.5);
lp[3] = inv_gamma_lpdf(local[2] | 0.5 * df, 0.5 * df);
// single global half-normal, hence a single log(0.5)
lp[4] = normal_lpdf(global[1] | 0, 1)  - 1 * log(0.5);
lp[5] = inv_gamma_lpdf(global[2] | 0.5 * df_global, 0.5 * df_global);
lp[6] = inv_gamma_lpdf(c2 | 0.5 * df_slab, 0.5 * df_slab);

return(sum(lp));
}
}
data{
int<lower=1> N;  // number of observations
/// Horseshoe tuning
real<lower=0> hs_df; // == 1  // if divergences occur, increase this
// ratio of expected non-zero to zero coefficients; can exceed 1, so the
// old upper=1 bound was removed                                          !! KEY PARAMETER
real<lower=0> par_ratio;
real<lower=0> hs_scale_slab; // == 2 // regularisation/scale of outliers  !! KEY PARAMETER
}
parameters{
vector[1] zbeta; // standardized coefficient(s); widen to [K] for K predictors
// Horseshoe
vector<lower=0>[1] hs_local[2]; // local parameters for horseshoe prior
real<lower=0> hs_global[2]; // horseshoe shrinkage parameters
real<lower=0> hs_c2; // horseshoe shrinkage parameters
}
transformed parameters {
// Horseshoe: scale_global = par_ratio / sqrt(N), as recommended by
// Piironen & Vehtari (2017)
vector[1] beta= horseshoe_get_tp(zbeta, hs_local, hs_global, par_ratio / sqrt(N), hs_scale_slab^2 * hs_c2);

}
model{
// Horseshoe prior (df_global fixed at 1, df_slab at 4)
target += horseshoe_get_lp(zbeta, hs_local, hs_df, hs_global, 1, hs_c2, 4);
// NOTE: prior-only skeleton -- add your likelihood term(s) here in a
// real model, e.g. target += normal_lpdf(y | X * beta, sigma);
}

In the above code the parameters are given as data

real<lower=0> hs_df;
real<lower=0> hs_df_global;
real<lower=0> hs_df_slab;
real<lower=0> hs_scale_global;
real<lower=0> hs_scale_slab;

See https://projecteuclid.org/euclid.ejs/1513306866 how these different parameters affect the distribution.

Yes, see https://projecteuclid.org/euclid.ejs/1513306866 how to set the above parameters based on that information.

1 Like

Thanks for the article. I think brms had a nice description as well (please correct me if there is any misunderstanding)

• par_ratio = Ratio of the expected number of non-zero coefficients to the expected number of zero coefficients. So if i expect 1% of non-zero coefficient this ratio will be 1/99

• hs_scale_slab = the expected value of the non-zero coefficient. For example if I expect the coefficient all being 0 except few being 2; 2 will be the value of this parameter