Hello everyone,
I am new to Stan and I am trying to implement a hierarchical mixed logit in WTP-space with finite mixing (only two clusters) and some behavioural constraints.
Note that the beta elements 6, 7, and 8 are like that for a reason, as I am trying to implement decreasing marginal utility with the sign concordant with beta elements 1, 2 and 3.
I have tested the elemental model (the one without mixing) with synthetic data and could retrieve the dgp ok. So, I am quite confident that the version without finite mixing works just fine.
In the non-finite versions I use
y[n] ~ categorical_logit(utilities)
But to mix at the individual level in the finite mixture I use:
target += log_sum_exp(log(lambda[id[n]]) + categorical_logit_lpmf(y[n] | utilities),
log1m(lambda[id[n]]) + categorical_logit_lpmf(y[n] | utilities2));
The error I get is:
Initialization between (-2, 2) failed after 100 attempts.
Try specifying initial values, reducing ranges of constrained values, or reparameterizing the model.
[1] "Error in sampler$call_sampler(args_list[[i]]) : Initialization failed."
I am very grateful for any help.
Here is the code:
// ********* declare dimensions of variables to be received by R
data {
int<lower = 1> N ; // number of choices
int<lower = 1> K ; // number of attr., including price
int<lower = 2> J ; // number of alts.
int<lower = 1> I ; // number of resp.
int<lower = 1 , upper = J> y[N] ; // dep. var
int<lower = 1 , upper = I> id[N] ; // resp. index
matrix[N*J, K] x; // ind. vars
}
// ********* declare dimensions of parameters
parameters {
vector<lower=0.01,upper=0.99>[I] lambda;
vector[K-3] mu ; // population means for rnd-U coeff.
vector<lower=0>[3] mu_eta ; // population means for additive effects
cholesky_factor_corr[K-3] Lcorr; // cholesky for correlation in rnd-U coeff.
vector<lower=0>[K-3] sigma; // variances for rnd-U coeff. (diag. of cov.)
cholesky_factor_corr[3] Lcorr_eta; // cholesky for correlation in rnd-U coeff.
vector<lower=0>[3] sigma_eta; // variances for rnd-U coeff. (diag. of cov.)
matrix[K-3,I] zeta ; // rnd part of individual U coef.
matrix<lower=0>[3,I] eta ; // individual additional utility coeff.
vector[K-3] mu2 ; // population means for rnd-U coeff.
vector<lower=0>[3] mu2_eta ; // population means for additive effects
cholesky_factor_corr[K-3] Lcorr2; // cholesky for correlation in rnd-U coeff.
vector<lower=0>[K-3] sigma2; // variances for rnd-U coeff. (diag. of cov.)
cholesky_factor_corr[3] Lcorr2_eta; // cholesky for correlation in rnd-U coeff.
vector<lower=0>[3] sigma2_eta; // variances for rnd-U coeff. (diag. of cov.)
matrix[K-3,I] zeta2 ; // rnd part of individual U coef.
matrix<lower=0>[3,I] eta2 ; // individual additional utility coeff.
}
// ********* priors
model {
matrix[K,I] beta ; // individual utility coeff.
vector[J] utilities;
matrix[K,I] beta2 ; // individual utility coeff.
vector[J] utilities2;
matrix[5, 5] Lcov = diag_pre_multiply(sigma, Lcorr);
matrix[3, 3] Lcov_eta = diag_pre_multiply(sigma_eta, Lcorr_eta);
matrix[5, 5] Lcov2 = diag_pre_multiply(sigma2, Lcorr2);
matrix[3, 3] Lcov_eta2 = diag_pre_multiply(sigma2_eta, Lcorr2_eta);
lambda[I] ~ uniform(0,1) ;
mu[1:4] ~ normal(10,5) ; // priors popul. means of rnd attribute coef.
mu[5] ~ normal(-2.5,.5) ;
mu_eta ~ normal(3,5) ;
Lcorr ~ lkj_corr_cholesky(1); // priors Cholesky of corr. across rnd coef.
sigma ~ cauchy(0,2.5); // priors for variance
Lcorr_eta ~ lkj_corr_cholesky(1); // priors Cholesky of corr. across rnd coef.
sigma_eta ~ cauchy(0,2.5); // priors for variance
// **********************
mu2[1:4] ~ normal(5,2) ; // priors popul. means of rnd attribute coef.
mu2[5] ~ normal(-1.5,.5) ;
mu2_eta ~ normal(2,5) ;
Lcorr2 ~ lkj_corr_cholesky(1); // priors Cholesky of corr. across rnd coef.
sigma2 ~ cauchy(0,3.5); // priors for variance
Lcorr2_eta ~ lkj_corr_cholesky(1); // priors Cholesky of corr. across rnd coef.
sigma2_eta ~ cauchy(0,3.5); // priors for variance
// ****** loop over respondents: each loop generates rnd utility coef.
to_vector(zeta) ~ normal(0, 1);
to_vector(eta) ~ normal(0, 1);
for (i in 1:I) {// priors on beta main effects
beta[1:5,i] = mu[1:5] + Lcov * zeta[1:5, i];
beta[6:8,i] = mu_eta + Lcov_eta * eta[1:3, i];
}
to_vector(zeta2) ~ normal(0, 1);
to_vector(eta2) ~ normal(0, 1);
for (i in 1:I) {// priors on beta main effects
beta[1:5,i] = mu2[1:5] + Lcov2 * zeta2[1:5, i];
beta[6:8,i] = mu2_eta + Lcov_eta2 * eta2[1:3, i];
}
// ****** loop over choices: each loop compute utility and choice probability
//QUESTION: BETA WAS DEFINED WITH DIMENSION K*I AND NOW WE USE beta[1:K-1,id[n]]
// where n>I , HOW IS IT POSSIBLE???
for (n in 1:N) {
utilities = ( x[(((n-1)*J)+1):(n*J),1] * beta[1,id[n]] +
x[(((n-1)*J)+1):(n*J),2] * (beta[1,id[n]] + beta[6,id[n]]) +
x[(((n-1)*J)+1):(n*J),3] * beta[2,id[n]] +
x[(((n-1)*J)+1):(n*J),4] * (beta[2,id[n]] + beta[7,id[n]]) +
x[(((n-1)*J)+1):(n*J),5] * beta[3,id[n]] +
x[(((n-1)*J)+1):(n*J),6] * (beta[3,id[n]] + beta[8,id[n]]) +
x[(((n-1)*J)+1):(n*J),7] * beta[4,id[n]] +
- x[(((n-1)*J)+1):(n*J),K] )
* exp(beta[5,id[n]]) ;
utilities2 = ( x[(((n-1)*J)+1):(n*J),1] * beta2[1,id[n]] +
x[(((n-1)*J)+1):(n*J),2] * (beta2[1,id[n]] + beta2[6,id[n]]) +
x[(((n-1)*J)+1):(n*J),3] * beta2[2,id[n]] +
x[(((n-1)*J)+1):(n*J),4] * (beta2[2,id[n]] + beta2[7,id[n]]) +
x[(((n-1)*J)+1):(n*J),5] * beta2[3,id[n]] +
x[(((n-1)*J)+1):(n*J),6] * (beta2[3,id[n]] + beta2[8,id[n]]) +
x[(((n-1)*J)+1):(n*J),7] * beta2[4,id[n]] +
- x[(((n-1)*J)+1):(n*J),K] )
* exp(beta2[5,id[n]]) ;
target += log_sum_exp(log(lambda[id[n]]) + categorical_logit_lpmf(y[n] | utilities),
log1m(lambda[id[n]]) + categorical_logit_lpmf(y[n] | utilities2));
}
}
"