Skew Normal Model Optimization

Hi everyone,

I’m working on a hierarchical model with multiple user-level parameters and could use some advice on optimization and efficiency improvements. The model estimates parameters for multiple individuals with measurements that depend on both individual-specific parameters and a covariate that affects the location parameter through an asymmetric adjustment mechanism. I have a really hard time forcing the asymmetric parabolic-like relationship between the outcome and the covariate.

Here’s my current Stan code:

data {
int<lower=1> nUsers;
int<lower=0> N;
array[N] int<lower=1,upper=nUsers> userId;
vector[N] outcome;
vector[N] covariate;
}
parameters {
real<lower=-7, upper=-6> mu_skew;
real<lower=110, upper=112> mu_loc;
real<lower=2, upper=4> mu_log_scale;
real<lower=7, upper=15> mu_cov_mu;
real<lower=2.5, upper=4> mu_log_cov_sigma;
real<lower=7, upper=11> mu_ideal_cov;
real<lower=-7, upper=-6> mu_log_gamma_above;
real<lower=-1.5, upper=-1.2> mu_log_gamma_below;
real<lower=.7, upper=1.2> sigma_skew;
real<lower=2.3, upper=2.7> sigma_loc;
real<lower=.06, upper=0.1> sigma_log_scale;
real<lower=3, upper=4.2> sigma_cov_mu;
real<lower=.3, upper=1.0> sigma_log_cov_sigma;
real<lower=4.3, upper=5.3> sigma_ideal_cov;
real<lower=0,upper=.3> sigma_log_gamma_above;
real<lower=0,upper=.3> sigma_log_gamma_below;
vector[nUsers] z_skew;
vector[nUsers] z_loc;
vector[nUsers] z_log_scale;
vector[nUsers] z_cov_mu;
vector[nUsers] z_log_cov_sigma;
vector[nUsers] z_ideal_cov;
vector[nUsers] z_log_gamma_above;
vector[nUsers] z_log_gamma_below;
}
transformed parameters {
vector[nUsers] outcome_skew = mu_skew + sigma_skew .* z_skew;
vector[nUsers] outcome_loc = mu_loc + sigma_loc .* z_loc;
vector[nUsers] outcome_scale = exp( mu_log_scale + sigma_log_scale .* z_log_scale );
vector[nUsers] cov_mu = mu_cov_mu + sigma_cov_mu .* z_cov_mu;
vector[nUsers] cov_sigma = exp( mu_log_cov_sigma + sigma_log_cov_sigma .* z_log_cov_sigma );
vector[nUsers] ideal_cov = mu_ideal_cov + sigma_ideal_cov .* z_ideal_cov;
vector[nUsers] log_gamma_above = mu_log_gamma_above + sigma_log_gamma_above .* z_log_gamma_above;
vector[nUsers] log_gamma_below = mu_log_gamma_below + sigma_log_gamma_below .* z_log_gamma_below;
vector[nUsers] gamma_above = exp(log_gamma_above);
vector[nUsers] gamma_below = exp(log_gamma_below);
}
model {
mu_skew ~ normal(-6.66, 1);
mu_loc ~ normal(110.50, 1);
mu_log_scale ~ normal(2.97, 0.4);
mu_cov_mu ~ normal(11.69, 1);
mu_log_cov_sigma ~ normal(3.28, 0.3);
mu_ideal_cov ~ normal(8.09, 1);
mu_log_gamma_above ~ normal(-1.41, 0.1);
mu_log_gamma_below ~ normal(-1.06, 0.1);

sigma_skew ~ normal(0.90, 0.1);
sigma_loc ~ normal(2.51, 0.05);
sigma_log_scale ~ normal(0.08, 0.005);
sigma_cov_mu ~ normal(3.90, 0.15);
sigma_log_cov_sigma ~ normal(0.06, 0.005);
sigma_ideal_cov ~ normal(4.77, 0.25);
sigma_log_gamma_above ~ normal(0.14, 0.025);
sigma_log_gamma_below ~ normal(0.19, 0.025);

z_skew ~ std_normal();
z_loc ~ std_normal();
z_log_scale ~ std_normal();
z_cov_mu ~ std_normal();
z_log_cov_sigma ~ std_normal();
z_ideal_cov ~ std_normal();
z_log_gamma_above ~ std_normal();
z_log_gamma_below ~ std_normal();

vector[N] cov_diff = covariate - ideal_cov[userId];
vector[N] outcome_loc_treated = outcome_loc[userId]

  • exp(log_gamma_above[userId]) .* log1p_exp(cov_diff)
  • exp(log_gamma_below[userId]) .* (-log1p_exp(-cov_diff));

covariate ~ normal(cov_mu[userId], cov_sigma[userId]);
outcome ~ skew_normal(outcome_loc_treated,
outcome_scale[userId],
outcome_skew[userId]);
}

Nothing to add here, but I’ve made your code more readable by putting it in backticks.

data {
    int<lower=1> nUsers;
    int<lower=0> N;
    array[N] int<lower=1,upper=nUsers> userId;
    vector[N] outcome;
    vector[N] covariate;
}
parameters {
    real<lower=-7, upper=-6> mu_skew;
    real<lower=110, upper=112> mu_loc;
    real<lower=2, upper=4> mu_log_scale;
    real<lower=7, upper=15> mu_cov_mu;
    real<lower=2.5, upper=4> mu_log_cov_sigma;
    real<lower=7, upper=11> mu_ideal_cov;
    real<lower=-7, upper=-6> mu_log_gamma_above;
    real<lower=-1.5, upper=-1.2> mu_log_gamma_below;
    real<lower=.7, upper=1.2> sigma_skew;
    real<lower=2.3, upper=2.7> sigma_loc;
    real<lower=.06, upper=0.1> sigma_log_scale;
    real<lower=3, upper=4.2> sigma_cov_mu;
    real<lower=.3, upper=1.0> sigma_log_cov_sigma;
    real<lower=4.3, upper=5.3> sigma_ideal_cov;
    real<lower=0,upper=.3> sigma_log_gamma_above;
    real<lower=0,upper=.3> sigma_log_gamma_below;
    vector[nUsers] z_skew;
    vector[nUsers] z_loc;
    vector[nUsers] z_log_scale;
    vector[nUsers] z_cov_mu;
    vector[nUsers] z_log_cov_sigma;
    vector[nUsers] z_ideal_cov;
    vector[nUsers] z_log_gamma_above;
    vector[nUsers] z_log_gamma_below;
}
transformed parameters {
    vector[nUsers] outcome_skew = mu_skew + sigma_skew .* z_skew;
    vector[nUsers] outcome_loc = mu_loc + sigma_loc .* z_loc;
    vector[nUsers] outcome_scale = exp( mu_log_scale + sigma_log_scale .* z_log_scale );
    vector[nUsers] cov_mu = mu_cov_mu + sigma_cov_mu .* z_cov_mu;
    vector[nUsers] cov_sigma = exp( mu_log_cov_sigma + sigma_log_cov_sigma .* z_log_cov_sigma );
    vector[nUsers] ideal_cov = mu_ideal_cov + sigma_ideal_cov .* z_ideal_cov;
    vector[nUsers] log_gamma_above = mu_log_gamma_above + sigma_log_gamma_above .* z_log_gamma_above;
    vector[nUsers] log_gamma_below = mu_log_gamma_below + sigma_log_gamma_below .* z_log_gamma_below;
    vector[nUsers] gamma_above = exp(log_gamma_above);
    vector[nUsers] gamma_below = exp(log_gamma_below);
}
model {
    mu_skew ~ normal(-6.66, 1);
    mu_loc ~ normal(110.50, 1);
    mu_log_scale ~ normal(2.97, 0.4);
    mu_cov_mu ~ normal(11.69, 1);
    mu_log_cov_sigma ~ normal(3.28, 0.3);
    mu_ideal_cov ~ normal(8.09, 1);
    mu_log_gamma_above ~ normal(-1.41, 0.1);
    mu_log_gamma_below ~ normal(-1.06, 0.1);

    sigma_skew ~ normal(0.90, 0.1);
    sigma_loc ~ normal(2.51, 0.05);
    sigma_log_scale ~ normal(0.08, 0.005);
    sigma_cov_mu ~ normal(3.90, 0.15);
    sigma_log_cov_sigma ~ normal(0.06, 0.005);
    sigma_ideal_cov ~ normal(4.77, 0.25);
    sigma_log_gamma_above ~ normal(0.14, 0.025);
    sigma_log_gamma_below ~ normal(0.19, 0.025);

    z_skew ~ std_normal();
    z_loc ~ std_normal();
    z_log_scale ~ std_normal();
    z_cov_mu ~ std_normal();
    z_log_cov_sigma ~ std_normal();
    z_ideal_cov ~ std_normal();
    z_log_gamma_above ~ std_normal();
    z_log_gamma_below ~ std_normal();

    vector[N] cov_diff = covariate - ideal_cov[userId];
    vector[N] outcome_loc_treated = outcome_loc[userId]
         * exp(log_gamma_above[userId]) .* log1p_exp(cov_diff)
         * exp(log_gamma_below[userId]) .* (-log1p_exp(-cov_diff));
    covariate ~ normal(cov_mu[userId], cov_sigma[userId]);
    outcome ~ skew_normal(outcome_loc_treated,
       outcome_scale[userId],
       outcome_skew[userId]);
}

Let me know if anything is incorrect here, I had to guess with the bullets that it was a * and not a .*.

1 Like

Are these physically impossible bounda for your parameters to reach? If not then reducing the hard constraints on the model and using priors as soft constraints can sometimes help the algorithm search the space a bit easier

I ran this earlier, but have since got new data — say like 5% of the total data. A colleague suggested these bounds should help with sampling since there is not reasonable possibility they should approach the bounds. Is this counterintuitive?

You shouldn’t restrict probability space to “Nah, it wouldn’t be there.” Restrictions should be for “if it goes here the model breaks” for instance:
y\sim\mathcal N(\mu, \sigma)
\sigma\sim\mathcal N(0,1)
Here, we want to actually put restrictions on \sigma to be:
real<lower=0.001> sigma;
So that it can’t be lower than 0 and throw lots of errors. (Well, we actually make it <0.001 because even 0 will be a problem.)

Just to be clear, in this case the standard thing to write would be real<lower=0> sigma;

1 Like

Oh, I didn’t realize you could do that—my mistake. I’ve always seen/used <lower=0.001>. Thanks for the correction!