Hello,
I'm currently testing a hierarchical logistic regression model on synthetic data, and I've noticed a discrepancy between the Stan posteriors and the parameter values used to generate the data.
I implemented the hierarchical logistic regression model in Stan with the code below.
functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1:num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data int ncat, data int nvar, data int ntotvar,
                            data array[] int Y,
                            data matrix x1, data matrix x2, data matrix x3, data matrix x4,
                            data array[] int J_1, data array[] int J_2,
                            vector b_a2, vector b_a3, vector b_a4, vector b_bc,
                            vector b_b1tt, vector b_b2tt, vector b_b3tt, vector b_b4tt,
                            matrix avail) {
    real ptarget = 0;
    int N = end - start + 1;
    // Matrices/vectors for x, alpha, beta
    matrix[ncat, nvar] x;
    vector[ncat] alpha;
    matrix[nvar, ncat] beta;
    array[N] row_vector[ncat] a = rep_array(rep_row_vector(0, ncat), N);
    // Linear predictor terms: btc, b1tt, b2tt, b3tt, b4tt
    array[N] row_vector[ntotvar] b = rep_array(rep_row_vector(0, ntotvar), N);
    for (n in 1:N) {
      int nn = n + start - 1;
      // add to the intercept terms
      a[n] += [0, b_a2[J_2[nn]], b_a3[J_2[nn]], b_a4[J_2[nn]]];
      // add to the slope terms; terms are btc, b1tt, b2tt, b3tt, b4tt
      b[n] += [b_bc[J_1[nn]], b_b1tt[J_1[nn]], b_b2tt[J_1[nn]], b_b3tt[J_1[nn]], b_b4tt[J_1[nn]]];
      // Each x and beta is a matrix with dimensions alts x variables;
      // x holds the time/cost values coming in as inputs
      x = to_matrix([x1[nn], x2[nn], x3[nn], x4[nn]]);
      // The betas are the hierarchical slope parameters (nvar x ncat = 2 x 4)
      beta = -1 * [b[n, 1] * b[n, 2:5], [b[n, 1], 0, 0, b[n, 1]]];
      // The alphas are the hierarchical intercept parameters;
      // log(avail[nn, i]) gives an availability measure
      alpha = [a[n, 1] + log(avail[nn, 1]),
               b[n, 1] * a[n, 2] + log(avail[nn, 2]),
               b[n, 1] * a[n, 3] + log(avail[nn, 3]),
               b[n, 1] * a[n, 4] + log(avail[nn, 4])]';
      ptarget += categorical_logit_glm_lupmf(Y[nn] | x, alpha, beta);
    }
    return ptarget;
  }
}
data {
  int<lower=1> N;  // total number of observations
  int<lower=2> ncat;  // number of categories
  int<lower=2> nvar;  // number of variables for each alternative
  int<lower=2> ntotvar;  // total number of variables
  int grainsize;  // grainsize for threading
  array[N] int Y;  // response variable
  matrix[N, nvar] x1;  // covariate matrix for alt 1
  matrix[N, nvar] x2;  // covariate matrix for alt 2
  matrix[N, nvar] x3;  // covariate matrix for alt 3
  matrix[N, nvar] x4;  // covariate matrix for alt 4
  // data for group-level effects of ID 1
  int<lower=1> N_1;  // number of grouping levels
  int<lower=1> M_1;  // number of coefficients per level
  array[N] int<lower=1> J_1;  // grouping indicator per observation
  // data for group-level effects of ID 2
  int<lower=1> N_2;  // number of grouping levels
  int<lower=1> M_2;  // number of coefficients per level
  array[N] int<lower=1> J_2;  // grouping indicator per observation
  matrix<lower=0, upper=1>[N, ncat] avail;  // whether an alternative is available
  int prior_only;  // should the likelihood be ignored?
}
transformed data {
  array[N] int seq = sequence(1, N);
}
parameters {
  real<lower=0> mu_b1tt;  // population-level effects
  real<lower=0> mu_b2tt;  // population-level effects
  real<lower=0> mu_b3tt;  // population-level effects
  real<lower=0> mu_b4tt;  // population-level effects
  real mu_a2;  // population-level effects
  real mu_a3;  // population-level effects
  real mu_a4;  // population-level effects
  real<lower=0> sd1_bc;  // group-level standard deviations
  real<lower=0> sd1_b1tt;  // group-level standard deviations
  real<lower=0> sd1_b2tt;  // group-level standard deviations
  real<lower=0> sd1_b3tt;  // group-level standard deviations
  real<lower=0> sd1_b4tt;  // group-level standard deviations
  real<lower=0> sd2_a2;  // group-level standard deviations
  real<lower=0> sd2_a3;  // group-level standard deviations
  real<lower=0> sd2_a4;  // group-level standard deviations
  vector<offset=0, multiplier=sd1_bc>[N_1] log_b_bc;
  vector<offset=mu_b1tt, multiplier=sd1_b1tt>[N_1] b_b1tt;
  vector<offset=mu_b2tt, multiplier=sd1_b2tt>[N_1] b_b2tt;
  vector<offset=mu_b3tt, multiplier=sd1_b3tt>[N_1] b_b3tt;
  vector<offset=mu_b4tt, multiplier=sd1_b4tt>[N_1] b_b4tt;
  vector<offset=mu_a2, multiplier=sd2_a2>[N_2] b_a2;
  vector<offset=mu_a3, multiplier=sd2_a3>[N_2] b_a3;
  vector<offset=mu_a4, multiplier=sd2_a4>[N_2] b_a4;
}
transformed parameters {
  vector<lower=0>[N_1] b_bc = exp(log_b_bc);
}
model {
  // likelihood including constants
  if (!prior_only) {
    target += reduce_sum(partial_log_lik_lpmf, seq, grainsize,
                         ncat, nvar, ntotvar, Y, x1, x2, x3, x4, J_1, J_2,
                         b_a2, b_a3, b_a4, b_bc, b_b1tt, b_b2tt, b_b3tt, b_b4tt, avail);
  }
  // priors
  mu_b1tt ~ normal(3, 0.25);
  mu_b2tt ~ normal(3, 0.25);
  mu_b3tt ~ normal(3, 0.25);
  mu_b4tt ~ normal(3, 0.25);
  mu_a2 ~ normal(-1, 0.25);
  mu_a3 ~ normal(-1, 0.25);
  mu_a4 ~ normal(-1, 0.25);
  sd1_bc ~ student_t(3, 0, 1);
  sd1_b1tt ~ student_t(3, 0, 1);
  sd1_b2tt ~ student_t(3, 0, 1);
  sd1_b3tt ~ student_t(3, 0, 1);
  sd1_b4tt ~ student_t(3, 0, 1);
  sd2_a2 ~ student_t(3, 0, 1);
  sd2_a3 ~ student_t(3, 0, 1);
  sd2_a4 ~ student_t(3, 0, 1);
  b_a2 ~ normal(mu_a2, sd2_a2);
  b_a3 ~ normal(mu_a3, sd2_a3);
  b_a4 ~ normal(mu_a4, sd2_a4);
  log_b_bc ~ normal(0, sd1_bc);
  b_b1tt ~ gamma(mu_b1tt, sd1_b1tt);
  b_b2tt ~ gamma(mu_b2tt, sd1_b2tt);
  b_b3tt ~ gamma(mu_b3tt, sd1_b3tt);
  b_b4tt ~ gamma(mu_b4tt, sd1_b4tt);
}
Additionally, I generated the synthetic data with the following R script.
rm(list = ls())
# install.packages("brms", repos='http://cran.us.r-project.org')
library(tidyverse)
library(brms)
library(cmdstanr) # observe startup messages
library(posterior)
library(bayesplot)
set.seed(1232)
n <- 2000 # number of observations
travelTimeDrive <- abs(rnorm(n, 13, 13))
travelCostDrive <- abs(rnorm(n, 136, 202))
travelTimeWalk <- abs(rnorm(n, 123, 197))
travelTimeBike <- abs(rnorm(n, 31, 50))
travelTimeTransit <- abs(rnorm(n, 45, 25))
travelCostTransit <- abs(rnorm(n, 212, 128))
av_car <- rep(1, n)
av_walk <- rep(1, n)
av_bike <- rep(1, n)
av_transit <- rep(1, n)
trippurp <- round(pmax(pmin(rnorm(n, mean=2.77, sd=1.24), 5), 1))
#Creating personid column
# Sample n/2 person IDs between 1000 and 2000 (with replacement), then shuffle them
personid <- sample(seq(1000, 2000), size = n/2, replace = TRUE)
personid <- sample(personid, size = n/2)
# Duplicate the personid vector to fill the column and add 0.1 or 0.2 randomly to each ID
personid <- rep(personid, each = 2)
personid <- personid + sample(c(0.1, 0.2), size = n, replace = TRUE)
# create data frame
fake_data <- data.frame(
travelTimeDrive = travelTimeDrive,
travelCostDrive = travelCostDrive,
travelTimeWalk = travelTimeWalk,
travelTimeBike = travelTimeBike,
travelTimeTransit = travelTimeTransit,
travelCostTransit = travelCostTransit,
av_car = av_car,
av_walk = av_walk,
av_bike = av_bike,
av_transit = av_transit,
personid = personid,
trippurp = trippurp
)
fake_data$av_walk <- ifelse(fake_data$travelTimeDrive < 6, 1, fake_data$av_walk)
# Load the package
library(writexl)
n <- nrow(fake_data)
trippurp <- fake_data$trippurp
travelTimeWalk <- fake_data$travelTimeWalk
travelTimeBike <- fake_data$travelTimeBike
travelTimeDrive <- fake_data$travelTimeDrive
travelTimeTransit <- fake_data$travelTimeTransit
travelCostDrive <- fake_data$travelCostDrive
travelCostTransit <- fake_data$travelCostTransit
# define alpha parameters: one group-level intercept per trip purpose
mu_a2 <- rnorm(1, -1, 0.25)
sd2_a2 <- abs(rt(1, df = 3, ncp = 0) * 1)
alpha_2 <- numeric(n)
for (i in 1:5) {
  alpha_2[trippurp == i] <- rnorm(1, mu_a2, sd2_a2)
}
mu_a3 <- rnorm(1, -1, 0.25)
sd2_a3 <- abs(rt(1, df = 3, ncp = 0) * 1)
alpha_3 <- numeric(n)
for (i in 1:5) {
  alpha_3[trippurp == i] <- rnorm(1, mu_a3, sd2_a3)
}
mu_a4 <- rnorm(1, -1, 0.25)
sd2_a4 <- abs(rt(1, df = 3, ncp = 0) * 1)
alpha_4 <- numeric(n)
for (i in 1:5) {
  alpha_4[trippurp == i] <- rnorm(1, mu_a4, sd2_a4)
}
# define b_cost parameter
sd1_bc <- abs(rt(1, df = 3, ncp = 0) * 1)
log_b_cost <- numeric(n)
for (i in 1:n) {
  log_b_cost[i] <- rnorm(1, 0, sd1_bc)
}
b_cost <- exp(log_b_cost)
# define b_tt parameters (one draw per observation row)
mu_b1tt <- abs(rnorm(1, 3, 0.25))
sd1_b1tt <- abs(rt(1, df = 3, ncp = 0) * 1)
b_tt_1 <- numeric(n)
for (i in 1:n) {
  b_tt_1[i] <- rgamma(1, mu_b1tt, sd1_b1tt) # Generate b_tt_1 from gamma(shape, rate) distribution
}
mu_b2tt <- abs(rnorm(1, 3, 0.25))
sd1_b2tt <- abs(rt(1, df = 3, ncp = 0) * 1)
b_tt_2 <- numeric(n)
for (i in 1:n) {
  b_tt_2[i] <- rgamma(1, mu_b2tt, sd1_b2tt) # Generate b_tt_2 from gamma(shape, rate) distribution
}
mu_b3tt <- abs(rnorm(1, 3, 0.25))
sd1_b3tt <- abs(rt(1, df = 3, ncp = 0) * 1)
b_tt_3 <- numeric(n)
for (i in 1:n) {
  b_tt_3[i] <- rgamma(1, mu_b3tt, sd1_b3tt) # Generate b_tt_3 from gamma(shape, rate) distribution
}
mu_b4tt <- abs(rnorm(1, 3, 0.25))
sd1_b4tt <- abs(rt(1, df = 3, ncp = 0) * 1)
b_tt_4 <- numeric(n)
for (i in 1:n) {
  b_tt_4[i] <- rgamma(1, mu_b4tt, sd1_b4tt) # Generate b_tt_4 from gamma(shape, rate) distribution
}
travelCostDrive1 <- travelCostDrive / 100
travelCostTransit1 <- travelCostTransit / 100
travelTimeDrive1 <- travelTimeDrive / 60
travelTimeWalk1 <- travelTimeWalk / 60
travelTimeBike1 <- travelTimeBike / 60
travelTimeTransit1 <- travelTimeTransit / 60
# Utility functions
V1 <- -1 * b_cost * (b_tt_1 * travelTimeDrive1 + travelCostDrive1)
V2 <- alpha_2 + -1 * b_cost * (b_tt_2 * travelTimeWalk1)
V3 <- alpha_3 + -1 * b_cost * (b_tt_3 * travelTimeBike1)
V4 <- alpha_4 + -1 * b_cost * (b_tt_4 * travelTimeTransit1 + travelCostTransit1)
# Normalize the utilities for numerical stability (the softmax is shift-invariant)
V <- cbind(V1, V2, V3, V4)
V_norm <- V - apply(V, 1, max)
denom <- rowSums(exp(V_norm))
p1 <- exp(V_norm[, 1]) / denom
p2 <- exp(V_norm[, 2]) / denom
p3 <- exp(V_norm[, 3]) / denom
p4 <- exp(V_norm[, 4]) / denom
# initialize trptrans vector
trptrans <- rep(NA, nrow(fake_data))
# generate a vector of random numbers
ra <- runif(nrow(fake_data))
# loop over all rows of fake_data
for (i in 1:nrow(fake_data)) {
# find the index of the first element in the cumulative sum of probabilities that is greater than ra[i]
trptrans[i] <- min(which(cumsum(c(p1[i], p2[i], p3[i], p4[i])) > ra[i]))
}
# add the trptrans vector to the data as a new column
fake_data$trptrans <- trptrans
# Collect the true (generating) parameter values into a reference table.
# Each is a single scalar draw, so statistics like sd() would be NA;
# we just report the values themselves.
true_params <- c(mu_b1tt = mu_b1tt, mu_b2tt = mu_b2tt, mu_b3tt = mu_b3tt, mu_b4tt = mu_b4tt,
                 mu_a2 = mu_a2, mu_a3 = mu_a3, mu_a4 = mu_a4,
                 sd1_bc = sd1_bc, sd1_b1tt = sd1_b1tt, sd1_b2tt = sd1_b2tt,
                 sd1_b3tt = sd1_b3tt, sd1_b4tt = sd1_b4tt,
                 sd2_a2 = sd2_a2, sd2_a3 = sd2_a3, sd2_a4 = sd2_a4)
df <- data.frame(TrueValue = round(true_params, 2))
# Print the reference table of generating values
print(df)
# Convert costs to hundreds of dollars and times to hours
# (the sign flip happens via the -1 factors in the utilities and in the Stan model)
mod = fake_data %>% mutate(travelCostDrive = travelCostDrive/100, travelCostTransit = travelCostTransit/100,
                           travelTimeDrive = travelTimeDrive/60, travelTimeWalk = travelTimeWalk/60,
                           travelTimeBike = travelTimeBike/60, travelTimeTransit = travelTimeTransit/60)
mod = mod[order(mod$personid),]
mod$personid = cumsum(c(0, diff(mod$personid)) != 0) + 1
model.1 = cmdstan_model("mtc_equity_23.stan", cpp_options = list(stan_opencl = TRUE, stan_threads = TRUE), stanc_options = list("O1"))
samp_ct = nrow(mod)
data_list = list(N = samp_ct,
ncat = 4L,
nvar = 2L,
ntotvar = 5L,
grainsize = 1,
Y = as.integer(mod$trptrans),
x1 = as.matrix(mod%>%select(travelTimeDrive,travelCostDrive)),
x2 = cbind(mod$travelTimeWalk,rep(0,samp_ct)),
x3 = cbind(mod$travelTimeBike,rep(0,samp_ct)),
x4 = as.matrix(mod%>%select(travelTimeTransit,travelCostTransit)),
avail = as.matrix(mod%>%select(av_car,av_walk,av_bike,av_transit)),
N_1 = as.integer(max(mod$personid)), # number of households
M_1 = 1L,
J_1 = as.integer(mod$personid),
# Z_1 = cbind(rep(1,samp_ct),mod%>%select(race_2,race_3,race_4,race_5,hhfaminc)),
Z_1 = rep(1,samp_ct),
N_2 = as.integer(max(mod$trippurp)), # number of trip purposes
M_2 = 1L,
J_2 = as.integer(mod$trippurp),
prior_only = 0L)
model_fit = model.1$sample(data = data_list,
refresh = 100,
adapt_delta = 0.90,
seed = 76543,
iter_warmup = 2000,
iter_sampling =1000,
chains = 4,
parallel_chains = 4,
threads_per_chain = 9,
opencl_ids = c(0, 0))
model_fit$save_object(file = "fit_mtc_model-lognorm-v23-2.RDS")
The attached image shows the generating values from the synthetic data (upper table) and the Stan posterior summaries (lower table). It is evident that the posteriors do not recover the synthetic values.
It is important to note that certain parameters in my model must be positive, as indicated by their constraints in the parameters block. One such parameter is b_bc, the cost parameter, which is defined as exp(log_b_bc).
Additionally, I encountered the following warning many times during the warmup stage:
Chain 3 Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
Chain 3 Exception: gamma_lpdf(OpenCL): Random variable[55, 0] = -1.07909e-05, but it must be not NaN! (in '/tmp/RtmpvxeqsQ/model-945ba1d099263.stan', line 138, column 2 to column 36)
Chain 3 If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,
Chain 3 but if this warning occurs often then your model may be either severely ill-conditioned or misspecified.
I also get these warnings at the end of the run:
Warning: 2 chain(s) finished unexpectedly!
Warning: The returned fit object will only read in results of successful chains. Please use read_cmdstan_csv() to read the results of the failed chains separately. Use the $output(chain_id) method for more output of the failed chains.
Warning: 1999 of 2000 (100.0%) transitions ended with a divergence.
See https://mc-stan.org/misc/warnings for details.
Warning: 1 of 2000 (0.0%) transitions hit the maximum treedepth limit of 10.
See https://mc-stan.org/misc/warnings for details.
Warning: 1 of 2 chains had an E-BFMI less than 0.2.
See https://mc-stan.org/misc/warnings for details.
I would appreciate it if anyone could help me identify the reason behind this discrepancy.