I have a very basic logistic regression model:
y_n \sim \text{Bernoulli}(\text{logit}^{-1}(\alpha+\beta x_n)).
\alpha \sim \mathcal{N}(0,2.5)
\beta \sim \mathcal{N}(0,2.5)
As a check I did simulation based callibration (SBC) so generated and fitted 1000 datasets. The SBC seemed fine (not shown) but then, after plotting the true values against the mean posterior values of \alpha, I noticed something strange: When plotting the posterior_mean of \alpha as a function of the true \alpha, the inferred \alpha does not go beyond -5 or 5 and some values suspiciously agglomerate around -5 and 5 (red dots in first graph), and those same simulated datasets also have a posteriorSD that is larger and separate from the rest (same red dots in second graph below).
On the graphs below, each dot corresponds to one simulated dataset, and the posterior mean (y-axis) is plotted as a function of the true mean (x-axis). The colored dots are the ones I find “suspicious”:
And you can see the strange behavior when we plot the \alpha posterior SD as a function of the true mean, in color are the very same dots from the previous graph:
Does anybody understand why this is happening?
Below is a reproducible example:
Data Generating program (generate_data.stan)
data {
int<lower=1> N;
vector[N] x;
}
generated quantities {
real alpha;
real beta;
vector[N] p;
vector<lower=0, upper=1>[N] y;
alpha = normal_rng(0,2.5);
beta = normal_rng(0,2.5);
for (n in 1:N) {
p[n] = inv_logit(alpha + x[n]*beta);
y[n] = bernoulli_rng(p[n]);
}
}
Stan model (model.stan)
data {
int<lower=0> N;
vector[N] x;
array[N] int<lower=0, upper=1> y;
}
parameters {
real alpha;
real beta;
}
model {
alpha ~ normal(0, 2.5);
beta ~ normal(0, 2.5);
for (n in 1:N) {
y[n] ~ bernoulli_logit(alpha + x[n]*beta);
}
}
Code:
data <- c(0.8005,0.2906,0.2749,0.6954,0.0994,0.096,0.7415,0.5268,0.8975,0.5828,0.0761,0.9294,0.238,0.1003,0.0836,0.1037,0.9322,0.1746,0.9061,0.0949,0.0643,0.1103,0.1252,0.9093,0.217,0.7554,0.9464,0.6578,0.5713,0.1081,0.9908,0.268,0.4619,0.233,0.9943,0.0751,0.1154,0.3231,0.9973,0.9485,0.8924,0.9991,0.094,0.1447,0.1288,0.9686,0.2301,0.6639,0.0735,0.0939,0.0781,0.1177,0.3757,0.9973,0.3544,0.353,0.9988,0.2945,0.952,0.0704,0.0714,0.936,0.0729,0.7596,0.0852,0.2991,0.1807,0.8161,0.245,0.2215,0.9728,0.0781,0.1569,0.933,0.0803,0.9972,0.2066,0.9944,0.1668,0.2539,0.0568,0.0661,0.1301,0.1354,0.0708,0.0667,0.974,0.9971,0.9258,0.9861,0.3701,0.1389,0.8527,0.2023,0.9455,0.7888,0.1293,0.9964,0.2298,0.208)
input_data <- list("N" = length(data),
"x" = data)
model <- cmdstan_model(stan_file="generate_data.stan",quiet=FALSE)
fit <- suppressMessages(model$sample(data=input_data,
chains=1,
iter_warmup = 0,
iter_sampling = 1000,
fixed_param = TRUE,
seed = 1,
refresh = 0))
posterior <- fit$draws()
options(mc.cores = parallel::detectCores())
cl <- makeCluster(24)
registerDoParallel(cl)
model <- cmdstan_model(stan_file="model.stan",quiet=FALSE)
tmp<-dimnames(posterior)$variable
yNames <- tmp[grepl("y",tmp)]
vars <- c("alpha","beta")
ensembleOutput <- foreach(i=1:nrow(posterior),
.combine='rbind') %dopar% {
simulatedY <- sapply(1:length(yNames), function(j) {
posterior[i,1,variable=yNames[j]]
})
input_data <- list("N" = length(simulatedY),
"x" = data,
"y"=simulatedY)
fit <- model$sample(data=input_data,
chains=4,
parallel_chains=4,
refresh=0,
show_messages = FALSE,
seed=i)
draws <- fit$draws()
# Compute rank of prior draw with respect to thinned posterior draws
true_mean <- as.numeric(posterior[i,1,variable=vars])
names(true_mean) <- vars
post_mean <- fit$summary(vars)$mean
post_sd <- fit$summary(vars)$sd
sbc_rank <- sapply(vars,function(var) { sum(true_mean[var] < draws[,,variable=var][seq(1, 4000 - 8, 8)])})
r <- data.frame(t(c(sbc_rank,post_sd,true_mean,post_mean)))
colnames(r) <- c(paste0(vars,"_SBC"),paste0(vars,"_PosteriorSD"),paste0(vars,"_TrueMean"),paste0(vars,"_PosteriorMean"))
r$iteration <- i
r
}
ggplot(ensembleOutput,aes(x=alpha_TrueMean,y=alpha_PosteriorMean,colour=Status,fill=Status))+
geom_point()+
theme(legend.position = "none")
ggplot(ensembleOutput,aes(x=alpha_TrueMean,y=alpha_PosteriorSD,colour=Status,fill=Status))+
geom_point()+
theme(legend.position = "none")

