# Mixed logit model with heterogeneity in means (and measured in WTP space)

This is a model I’ve been working on for a few months now. Some previous discussion on its development and my fitting issues is given here.

The model is a simple mixed logit application, with some slight modifications.

\frac{e^V_i}{\sum_j E^V_j}
where

V_i = -\beta_{cost}cost-\beta_{cost}WTP_{time}time+WTP_i

\beta_{cost} is a combination of a cost parameter and a scale parameter, WTP_{time} and WTP_i are measures of willingness-to-pay (in $) and are category/alternatives-specific. All parameters are hierarchical, equivalent to a mixed logit specification. \beta_{cost} and WTP_{time} are given zero-avoiding positive priors which, when combined with the negation in V_i, mean that they have a negative effect on the probability of choosing an alternative. The basic model follows. I use non-centered parameterizations and gamma priors on the time parameters, which was chosen after testing several models with log-normal priors and finding the gamma prior gave fewer divergences. The alternatives are travel modes. I introduce an availability condition to account for cases when some travel modes are unavailable. functions { int[] sequence(int start, int end) { int seq[end - start + 1]; for (n in 1:num_elements(seq)) { seq[n] = n + start - 1; } return seq; } real partial_log_lik_lpmf(int[] seq, int start, int end, data int ncat, data int nvar, data int ntotvar, data int[] Y, data matrix x1, data matrix x2, data matrix x3, data matrix x4, data int[] J_1, data int[] J_2, vector b_a2, vector b_a3, vector b_a4, vector b_bc, vector b_b1tt, vector b_b2tt, vector b_b3tt, vector b_b4tt, matrix avail) { real ptarget = 0; int N = end - start + 1; // Define matrices/vector for x, alpha, beta matrix[ncat,nvar] x; vector[ncat] alpha; matrix[nvar,ncat] beta; array[N] row_vector[ncat] a = rep_array(rep_row_vector(0,ncat), N); // initialize linear predictor term // Terms are btc, b1tt, b2tt, b3tt, b4tt array[N] row_vector[ntotvar] b = rep_array(rep_row_vector(0,ntotvar), N); for (n in 1:N) { int nn = n + start - 1; // add to linear predictor term a[n] += [0,b_a2[J_2[nn]], b_a3[J_2[nn]], b_a4[J_2[nn]]]; // add to linear predictor term // Terms are btc (0), b1tt, b2tt, b3tt, b4tt b[n] += [b_bc[J_1[nn]], b_b1tt[J_1[nn]], b_b2tt[J_1[nn]], b_b3tt[J_1[nn]], b_b4tt[J_1[nn]]]; // Each x and beta is a matrix with dimensions alts x variables // Our x will be the time/cost coming in as inputs x = to_matrix([x1[nn],x2[nn],x3[nn],x4[nn]]); // Our betas will be the hierarchical slope parameters (2 x 4) beta = -1*[b[n,1] * b[n,2:5], [b[n,1],0,0,b[n,1]]]; // Our alphas will be the hierarchical intercept parameters. ln(avail[nn,i]] gives an availability measure alpha = [a[n,1]+log(avail[nn,1]), b[n,1] * a[n,2]+log(avail[nn,2]), b[n,1] * a[n,3]+log(avail[nn,3]), b[n,1] * a[n,4]+log(avail[nn,4])]'; ptarget += categorical_logit_glm_lupmf(Y[nn] | x, alpha, beta); } return ptarget; } } data { int<lower=1> N; // total number of observations int<lower=2> ncat; // number of categories int<lower=2> nvar; // number of variables for each alternative int<lower=2> ntotvar; // number of total variables int grainsize; // grainsize for threading array[N] int Y; // response variable // covariate matrix for alt1 matrix[N,nvar] x1; // matrix[N,nvar] x1; // covariate matrix for alt2 matrix[N,nvar] x2; // matrix[N,nvar] x2; // covariate matrix for alt3 matrix[N,nvar] x3; // matrix[N,nvar] x3; // covariate matrix for alt4 matrix[N,nvar] x4; // matrix[N,nvar] x4; // data for group-level effects of ID 1 int<lower=1> N_1; // number of grouping levels int<lower=1> M_1; // number of coefficients per level array[N] int<lower=1> J_1; // grouping indicator per observation // data for group-level effects of ID 2 int<lower=1> N_2; // number of grouping levels int<lower=1> M_2; // number of coefficients per level array[N] int<lower=1> J_2; // grouping indicator per observation matrix<lower=0,upper=1>[N,ncat] avail; // whether an alternative is available int prior_only; // should the likelihood be ignored? } transformed data { int seq[N] = sequence(1, N); } parameters { real<lower=0> mu_b1tt; // population-level effects real<lower=0> mu_b2tt; // population-level effects real<lower=0> mu_b3tt; // population-level effects real<lower=0> mu_b4tt; // population-level effects real mu_a2; // population-level effects real mu_a3; // population-level effects real mu_a4; // population-level effects real<lower=0> sd1_bc; // group-level standard deviations real<lower=0> sd1_b1tt; // group-level standard deviations real<lower=0> sd1_b2tt; // group-level standard deviations real<lower=0> sd1_b3tt; // group-level standard deviations real<lower=0> sd1_b4tt; // group-level standard deviations real<lower=0> sd2_a2; // group-level standard deviations real<lower=0> sd2_a3; // group-level standard deviations real<lower=0> sd2_a4; // group-level standard deviations vector<offset=0, multiplier = sd1_bc>[N_1] log_b_bc; vector<offset=mu_b1tt, multiplier = sd1_b1tt>[N_1] b_b1tt; vector<offset=mu_b2tt, multiplier = sd1_b2tt>[N_1] b_b2tt; vector<offset=mu_b3tt, multiplier = sd1_b3tt>[N_1] b_b3tt; vector<offset=mu_b4tt, multiplier = sd1_b4tt>[N_1] b_b4tt; vector<offset=mu_a2, multiplier = sd2_a2>[N_2] b_a2; vector<offset=mu_a3, multiplier = sd2_a3>[N_2] b_a3; vector<offset=mu_a4, multiplier = sd2_a4>[N_2] b_a4; } model { // likelihood including constants if (!prior_only) { target += reduce_sum(partial_log_lik_lpmf, seq, grainsize, ncat, nvar, ntotvar, Y, x1, x2, x3, x4, J_1, J_2, b_a2, b_a3, b_a4, b_bc, b_b1tt, b_b2tt, b_b3tt, b_b4tt, avail); } // priors mu_b1tt ~ normal(3,0.25); mu_b2tt ~ normal(3,0.25); mu_b3tt ~ normal(3,0.25); mu_b4tt ~ normal(2,0.25); mu_a2 ~ normal(-1,0.25); mu_a3 ~ normal(-1,0.25); mu_a4 ~ normal(-1,0.25); sd1_bc ~ student_t(3,0,1); sd1_b1tt ~ student_t(3,0,1); sd1_b2tt ~ student_t(3,0,1); sd1_b3tt ~ student_t(3,0,1); sd1_b4tt ~ student_t(3,0,1); sd2_a2 ~ student_t(3,0,1); sd2_a3 ~ student_t(3,0,1); sd2_a4 ~ student_t(3,0,1); b_a2 ~ normal(mu_a2, sd2_a2); b_a3 ~ normal(mu_a3, sd2_a3); b_a4 ~ normal(mu_a4, sd2_a4); b_bc ~ normal(0, sd1_bc); b_b1tt ~ gamma(mu_b1tt, sd1_b1tt); b_b2tt ~ gamma(mu_b2tt, sd1_b2tt); b_b3tt ~ gamma(mu_b3tt, sd1_b3tt); b_b4tt ~ gamma(mu_b4tt, sd1_b4tt); }  The literature suggests that alternative 4 (transit) should have a lower mean WTP than alternative 1 (auto), which I capture in the priors. If I plot the individual WTP for time mean parameters, I get something in the expected range for auto ($10-\$20) and some very large values for transit. The results are also very narrowly distributed for auto and widely distributed for transit. I have other specifications with more reasonable numbers for transit (with more divergences though), but it is consistent that the distributions are not what I anticipate and the transit mean is consistently higher than auto (This is not a major issue, so long as the distributions are in a reasonable range. I can also introduce wait and walk times for transit that may help here).
AutoWTP

Transit WTP

The posterior plot and frequency plots for sd’s are as below. Note: the final \beta parameters are generated from mu and sd hyperparameters. Also, the r-hat are consistently about 3-4 across all my test specifications.

I am also interested in heterogeneity in the mean effects as a function of race and income. The ultimate goal is to generate a logsum term, which will be translated into a measure of contingent valuation that is heterogenous across the sample and measures equity in accessibility. My current model with heterogeneity (i.e., group variables on the time variables) is given below. This model also includes correlations between the group effects because I anticipate there being a correlation between among racial groups and also income. I have another model where I try including the group variables in the cost effect (rather than time WTP). I’ve tried various specifications with variation in max_treedepth, prior distribution and standard devs., and number of warmup iterations.

functions {
int[] sequence(int start, int end) {
int seq[end - start + 1];
for (n in 1:num_elements(seq)) {
seq[n] = n + start - 1;
}
return seq;
}

real partial_log_lik_lpmf(int[] seq, int start, int end,
data int ncat, data int nvar, data int ntotvar, data int[] Y, data matrix x1, data matrix x2, data matrix x3, data matrix x4, data matrix z1, data int[] J_1, data int[] J_2,
vector b_a2, vector b_a3, vector b_a4, vector b_bc, matrix mu_b1tt_dem, matrix mu_b2tt_dem, matrix mu_b3tt_dem, matrix mu_b4tt_dem, vector b_b1tt, vector b_b2tt, vector b_b3tt, vector b_b4tt, matrix avail) {

real ptarget = 0;
int N = end - start + 1;
// Define matrices/vector for x, alpha, beta
matrix[ncat,nvar] x;
vector[ncat] alpha;
matrix[nvar,ncat] beta;

array[N] row_vector[ncat] a = rep_array(rep_row_vector(0,ncat), N);

// initialize linear predictor term
// Terms are btc, b1tt, b2tt, b3tt, b4tt

array[N] row_vector[ntotvar] b = rep_array(rep_row_vector(0,ntotvar), N);

for (n in 1:N) {
int nn = n + start - 1;
// add to linear predictor term
a[n] += [0,b_a2[J_2[nn]], b_a3[J_2[nn]], b_a4[J_2[nn]]];
// add to linear predictor term
// Terms are btc (0), b1tt, b2tt, b3tt, b4tt
b[n] += [b_bc[J_1[nn]], b_b1tt[J_1[nn]]+sum(mu_b1tt_dem*z1[nn]'), b_b2tt[J_1[nn]]+sum(mu_b2tt_dem*z1[nn]'), b_b3tt[J_1[nn]]+sum(mu_b3tt_dem*z1[nn]'), b_b4tt[J_1[nn]]+sum(mu_b4tt_dem*z1[nn]')];
// Each x and beta is a matrix with dimensions alts x variables
// Our x will be the time/cost coming in as inputs
x = to_matrix([x1[nn],x2[nn],x3[nn],x4[nn]]);

// Our betas will be the hierarchical slope parameters (2 x 4)
beta = -1*[b[n,1] * b[n,2:5], [b[n,1],0,0,b[n,1]]];
// Our alphas will be the hierarchical intercept parameters. ln(avail[nn,i]] gives an availability measure
alpha = [a[n,1]+log(avail[nn,1]), b[n,1] * a[n,2]+log(avail[nn,2]), b[n,1] * a[n,3]+log(avail[nn,3]), b[n,1] * a[n,4]+log(avail[nn,4])]';
ptarget += categorical_logit_glm_lupmf(Y[nn] | x, alpha, beta);
}
return ptarget;
}
}

data {
int<lower=1> N;  // total number of observations
int<lower=2> ncat;  // number of categories
int<lower=2> nvar;  // number of variables for each alternative
int<lower=2> ntotvar;  // number of total variables
int grainsize;  // grainsize for threading
array[N] int Y;  // response variable
// covariate matrix for alt1
matrix[N,nvar] x1; // matrix[N,nvar] x1;
// covariate matrix for alt2
matrix[N,nvar] x2; // matrix[N,nvar] x2;
// covariate matrix for alt3
matrix[N,nvar] x3; // matrix[N,nvar] x3;
// covariate matrix for alt4
matrix[N,nvar] x4; // matrix[N,nvar] x4;
// data for group-level effects of ID 1
int<lower=1> N_1;  // number of grouping levels
int<lower=1> M_1;  // number of coefficients per level
array[N] int<lower=1> J_1;  // grouping indicator per observation
// group-level predictor values
matrix[N, M_1] z1;
// data for group-level effects of ID 2
int<lower=1> N_2;  // number of grouping levels
int<lower=1> M_2;  // number of coefficients per level
array[N] int<lower=1> J_2;  // grouping indicator per observation
matrix<lower=0,upper=1>[N,ncat] avail; // whether an alternative is available
int prior_only;  // should the likelihood be ignored?
}
transformed data {
int seq[N] = sequence(1, N);
}
parameters {
real mu_b1tt; // population-level effects
real mu_b2tt; // population-level effects
real mu_b3tt; // population-level effects
real mu_b4tt; // population-level effects
real mu_a2; // population-level effects
real mu_a3; // population-level effects
real mu_a4; // population-level effects
cholesky_factor_corr[M_1] L1;  // cholesky factor of correlation matrix
cholesky_factor_corr[M_1] L2;  // cholesky factor of correlation matrix
cholesky_factor_corr[M_1] L3;  // cholesky factor of correlation matrix
cholesky_factor_corr[M_1] L4;  // cholesky factor of correlation matrix
real<lower=0> sd1_bc;  // group-level standard deviations
real<lower=0> sd1_b1tt;  // group-level standard deviations
real<lower=0> sd1_b2tt;  // group-level standard deviations
real<lower=0> sd1_b3tt;  // group-level standard deviations
real<lower=0> sd1_b4tt;  // group-level standard deviations
real<lower=0> sd2_a2;  // group-level standard deviations
real<lower=0> sd2_a3;  // group-level standard deviations
real<lower=0> sd2_a4;  // group-level standard deviations
vector<offset=0, multiplier = sd1_bc>[N_1] log_b_bc;
matrix[M_1,M_1] log_mu_b1tt_dem;
matrix[M_1,M_1] log_mu_b2tt_dem;
matrix[M_1,M_1] log_mu_b3tt_dem;
matrix[M_1,M_1] log_mu_b4tt_dem;
vector<offset=mu_b1tt, multiplier = sd1_b1tt>[N_1] log_b_b1tt;
vector<offset=mu_b2tt, multiplier = sd1_b2tt>[N_1] log_b_b2tt;
vector<offset=mu_b3tt, multiplier = sd1_b3tt>[N_1] log_b_b3tt;
vector<offset=mu_b4tt, multiplier = sd1_b4tt>[N_1] log_b_b4tt;
vector<offset=mu_a2, multiplier = sd2_a2>[N_2] b_a2;
vector<offset=mu_a3, multiplier = sd2_a3>[N_2] b_a3;
vector<offset=mu_a4, multiplier = sd2_a4>[N_2] b_a4;
}
transformed parameters{
vector<lower=0>[N_1] b_bc = exp(log_b_bc);
vector<lower=0>[N_1] b_b1tt = exp(log_b_b1tt);
vector<lower=0>[N_1] b_b2tt = exp(log_b_b2tt);
vector<lower=0>[N_1] b_b3tt = exp(log_b_b3tt);
vector<lower=0>[N_1] b_b4tt = exp(log_b_b4tt);
matrix<lower=0>[M_1,M_1] mu_b1tt_dem = exp(log_mu_b1tt_dem*L1);
matrix<lower=0>[M_1,M_1] mu_b2tt_dem = exp(log_mu_b2tt_dem*L2);
matrix<lower=0>[M_1,M_1] mu_b3tt_dem = exp(log_mu_b3tt_dem*L3);
matrix<lower=0>[M_1,M_1] mu_b4tt_dem = exp(log_mu_b4tt_dem*L4);
}
model {
// likelihood including constants
if (!prior_only) {
target += reduce_sum(partial_log_lik_lpmf, seq,
grainsize, ncat, nvar, ntotvar, Y, x1, x2, x3, x4, z1, J_1, J_2,
b_a2, b_a3, b_a4, b_bc, mu_b1tt_dem, mu_b2tt_dem, mu_b3tt_dem, mu_b4tt_dem,b_b1tt, b_b2tt, b_b3tt, b_b4tt, avail);

}
// priors
mu_b1tt ~ normal(3,0.25);
mu_b2tt ~ normal(3,0.25);
mu_b3tt ~ normal(3,0.25);
mu_b4tt ~ normal(2,0.25);
mu_a2 ~ normal(-1,0.25);
mu_a3 ~ normal(-1,0.25);
mu_a4 ~ normal(-1,0.25);
to_vector(log_mu_b1tt_dem) ~ normal(0,0.5);
to_vector(log_mu_b2tt_dem) ~ normal(0,0.5);
to_vector(log_mu_b3tt_dem) ~ normal(0,0.5);
to_vector(log_mu_b4tt_dem) ~ normal(0,0.5);
sd1_bc ~ student_t(3,0,1);
sd1_b1tt ~ student_t(3,2,1);
sd1_b2tt ~ student_t(3,2,1);
sd1_b3tt ~ student_t(3,2,1);
sd1_b4tt ~ student_t(3,2,1);
sd2_a2 ~ student_t(3,2,1);
sd2_a3 ~ student_t(3,2,1);
sd2_a4 ~ student_t(3,2,1);
b_a2 ~ normal(mu_a2, sd2_a2);
b_a3 ~ normal(mu_a3, sd2_a3);
b_a4 ~ normal(mu_a4, sd2_a4);
log_b_bc ~ normal(0, sd1_bc);
log_b_b1tt ~ normal(mu_b1tt, sd1_b1tt);
log_b_b2tt ~ normal(mu_b2tt, sd1_b2tt);
log_b_b3tt ~ normal(mu_b3tt, sd1_b3tt);
log_b_b4tt ~ normal(mu_b4tt, sd1_b4tt);
L1 ~ lkj_corr_cholesky(1);
L2 ~ lkj_corr_cholesky(1);
L3 ~ lkj_corr_cholesky(1);
L4 ~ lkj_corr_cholesky(1);
}


The posterior plot and frequency plots for sd’s are as below for this model.