To follow up on my response, here is the simulation and the Stan model that recover the parameters and generate retrodictive checks (aka posterior predictive checks) for the hurdle-Poisson data-generating process I described, with a truncated Poisson and inflation at 5 dimes. (Note: I am using an older version of rstan, so you may want to use the newer array syntax.)
library(rstan)
#function to simulate truncated Poisson data between 1 and 10 via the inverse-CDF
#method: draw uniforms between F(0) and F(10), then invert with qpois
rtpois <- function(N, lambda)
  qpois(runif(N, ppois(0, lambda), ppois(10, lambda)), lambda)
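As a quick optional sanity check on the sampler, the draws should never fall outside 1-10 and the mean should sit near the zero-truncated Poisson mean, since the upper truncation barely matters at lambda = 2.5:
#optional sanity check on rtpois
range(rtpois(1e4, 2.5)) #never below 1 or above 10
mean(rtpois(1e4, 2.5)) #roughly 2.5 / (1 - dpois(0, 2.5)) = 2.72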
#simulation of hurdle-Poisson process with inflation at 5 and truncation at 10
theta <- 0.55 #probability of a nonzero count, so ~45% zeroes in the simulation below
pi <- 0.3 #probability that a nonzero count is inflated to 5
lambda <- 2.5 #rate of the truncated Poisson component
y <- rbinom(500, 1, theta) #hurdle step: 1 = nonzero count, 0 stays zero
for (n in 1:length(y)) {
  if (y[n] == 1) {
    y[n] <- rbinom(1, 1, pi) #is this nonzero count inflated to 5?
    y[n] <- ifelse(y[n] == 1, 5, rtpois(1, lambda))
  }
}
hist(y)
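A quick optional tabulation should show roughly 45% zeroes plus an extra spike at 5 on top of the truncated-Poisson counts:
#optional check that the simulated shares match the generating values
mean(y == 0) #should be close to 1 - theta = 0.45
mean(y == 5) #inflation component (0.55 * 0.3) plus truncated-Poisson draws that land on 5
table(y)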
#Stan data
stan_data <- list(y = y, N = length(y))
#Stan model
stan_model <- "
data {
  int<lower=0> N;
  int<lower=0> y[N];
}
parameters {
  real<lower=0, upper=1> theta; //probability of a zero count
  simplex[2] pi; //pi[1]: point mass at 5, pi[2]: truncated Poisson
  real<lower=0> lambda; //Poisson rate before truncation
}
model {
  target += beta_lpdf(theta | 1, 1);
  target += dirichlet_lpdf(pi | rep_vector(1, 2));
  target += normal_lpdf(lambda | 3, 1);
  for (n in 1:N) {
    //log probability of y[n] under the Poisson truncated to 1-10,
    //weighted by the mixture component pi[2]
    real poisson_t_lp = log(pi[2]) + poisson_lpmf(y[n] | lambda)
      - log_diff_exp(poisson_lcdf(10 | lambda), poisson_lcdf(0 | lambda));
    if (y[n] == 0)
      target += log(theta);
    else if (y[n] == 5)
      //a 5 can come from the point mass or from the truncated Poisson
      target += log_sum_exp(log(pi[1]), poisson_t_lp) + log1m(theta);
    else
      target += poisson_t_lp + log1m(theta);
  }
}
generated quantities {
  vector[N] y_pred;
  for (n in 1:N) {
    y_pred[n] = 11; //sentinel outside 1-10 so the rejection loop below runs
    if (bernoulli_rng(theta))
      y_pred[n] = 0;
    else if (bernoulli_rng(pi[1]))
      y_pred[n] = 5;
    else
      //rejection sampling from the Poisson truncated to 1-10
      while (y_pred[n] < 1 || y_pred[n] > 10)
        y_pred[n] = poisson_rng(lambda);
  }
}
"
m1 <- stan(model_code = stan_model, data = stan_data,
chains = 4, iter = 2000, warmup = 1000,
thin = 1, cores = 4)
print(m1, pars=c("theta","pi","lambda")) #expect theta (the zero probability) near 0.45, pi[1] near 0.3, lambda near 2.5
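It is also worth a quick look at the sampler diagnostics before trusting the retrodictive checks (assuming your rstan version has the built-in helper):
#check for divergences, max treedepth hits, and low E-BFMI
check_hmc_diagnostics(m1)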
#extract predictions for retrodictive checks
y_preds <- extract(m1)$y_pred
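y_preds is a draws-by-observations matrix, so each row is one full replicated data set:
dim(y_preds) #4000 x 500: 4 chains x 1000 post-warmup draws, one column per observation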
#retrodictive check plot
library(scales)
c_light_m1 <- c("#edf8e9")
c_light_highlight_m1 <- c("#c7e9c0")
c_mid_m1 <- c("#a1d99b")
c_mid_highlight_m1 <- c("#74c476")
c_dark_m1 <- c("#31a354")
c_dark_highlight_m1 <- c("#006d2c")
B <- max(y_preds) + 1 #number of histogram bins (counts 0 to B - 1)
idx <- rep(1:B, each=2)
#duplicate each bin index so the ribbons can be drawn as step shapes
x <- sapply(1:length(idx), function(b) if(b %% 2 == 0) idx[b] + 0.5 else idx[b] - 0.5)
x <- (x - 1)
obs_counts <- hist(y, breaks=(0:B) - 0.5, plot=FALSE)$counts
pad_obs_counts <- sapply(idx, function(n) obs_counts[n])
#bin each posterior draw's predictions the same way (first 1000 of the 4000 draws)
pred_counts <- sapply(1:1000, function(n)
  hist(y_preds[n,], breaks=(0:B) - 0.5, plot=FALSE)$counts)
probs <- c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
#quantiles of the predicted counts in each bin, for the ribbon bands
cred <- sapply(1:B, function(b) quantile(pred_counts[b,], probs=probs))
pad_cred <- do.call(cbind, lapply(idx, function(n) cred[1:9,n]))
plot(1, type="n", main="",
xlim=c(-0.5, B + 0.5), xlab="number of dimes",
ylim=c(0, 500), ylab="", cex.lab=2 , cex.axis=1.9)
title("Posterior predictions for number of dimes", line = 0.5, font.main=1, cex.main=2.2)
polygon(c(x, rev(x)), c(pad_cred[1,], rev(pad_cred[9,])),
col = alpha(c_light_m1, 0.7), border = NA)
polygon(c(x, rev(x)), c(pad_cred[2,], rev(pad_cred[8,])),
col = alpha(c_light_highlight_m1, 0.7), border = NA)
polygon(c(x, rev(x)), c(pad_cred[3,], rev(pad_cred[7,])),
col = alpha(c_mid_m1, 0.7), border = NA)
polygon(c(x, rev(x)), c(pad_cred[4,], rev(pad_cred[6,])),
col = alpha(c_mid_highlight_m1, 0.7), border = NA)
lines(x, pad_cred[5,], col=alpha(c_dark_m1, 0.7), lwd=2)
lines(x, pad_obs_counts, col="white", lty=1, lwd=2.5)
lines(x, pad_obs_counts, col="black", lty=1, lwd=2)
This gives much better retrodictive checks for data like yours. You could also write the model so that theta, pi, and lambda are predicted by your covariates of interest.
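For example, here is a minimal sketch of that extension under some assumptions: the design matrix X, its column count K, the beta_* coefficient names, and the weakly informative priors on them are all placeholders, and pi is left constant to keep it short. theta goes on the logit scale and lambda on the log scale:
stan_model_covariates <- "
data {
  int<lower=0> N;
  int<lower=0> K;
  matrix[N, K] X; //design matrix of covariates (include an intercept column)
  int<lower=0> y[N];
}
parameters {
  vector[K] beta_theta; //effects on the zero probability, logit scale
  vector[K] beta_lambda; //effects on the Poisson rate, log scale
  simplex[2] pi;
}
model {
  vector[N] theta = inv_logit(X * beta_theta); //per-observation P(y = 0)
  vector[N] lambda = exp(X * beta_lambda); //per-observation Poisson rate
  target += normal_lpdf(beta_theta | 0, 1.5);
  target += normal_lpdf(beta_lambda | 0, 1);
  target += dirichlet_lpdf(pi | rep_vector(1, 2));
  for (n in 1:N) {
    real poisson_t_lp = log(pi[2]) + poisson_lpmf(y[n] | lambda[n])
      - log_diff_exp(poisson_lcdf(10 | lambda[n]), poisson_lcdf(0 | lambda[n]));
    if (y[n] == 0)
      target += log(theta[n]);
    else if (y[n] == 5)
      target += log_sum_exp(log(pi[1]), poisson_t_lp) + log1m(theta[n]);
    else
      target += poisson_t_lp + log1m(theta[n]);
  }
}
"
You would then add X and K to stan_data, and the generated quantities block carries over in the same way, indexing theta[n] and lambda[n]. pi could be given covariate dependence as well (e.g. through a logit on pi[1]).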