# Poisson Binomial comparisons
# In this file I compare the coverage accuracy of three
# versions of the poisson binomial model for data with
# observed individual predictors but outcomes only
# visible in aggregate within groups.
library(tidyverse)

# load stan and plotting libs
library(cmdstanr)
library(bayesplot)
library(posterior)
library(tidybayes)

# compile models
# compile stan model for poisson binomial with probit link
mod_pb_probit <- cmdstan_model("code/poisson_binomial_probit.stan")
# compile stan model for poisson binomial with logit link
mod_pb_logit <- cmdstan_model("code/poisson_binomial_logit.stan")
# compile stan model for averaging probit probs within groups
mod_pb_ma <- cmdstan_model("code/poisson_binomial_mean_approx.stan")

## Try several models for this partially observed ecological regression problem

# make fake data
make_simulated_data <- function(N, K, G = 32, delta = rnorm(K), obs_frac = .33) {
  X <- matrix(rnorm(N * K), ncol = K)
  eps <- rnorm(N)
  grps <- sample(G, N, replace = TRUE)
  select_ind <- as.numeric(X %*% delta >= -eps)
  df <- tibble(select_ind, grps)
  # a fraction of the non-selected individuals is observed individually
  obs <- sample(which(df$select_ind == 0), round(obs_frac * N), replace = FALSE)
  df_obs <- df[obs, ] %>%
    mutate(y = select_ind,
           gsize = 1,
           grps = (G + 1):(G + length(obs))) %>%
    select(grps, y, gsize)
  df_unob <- df[-obs, ] %>%
    group_by(grps) %>%
    summarise(y = sum(select_ind), gsize = n())
  df <- bind_rows(df_obs, df_unob)
  # re-order X to match the order of df!
  X_ord <- rbind(X[obs, ], X[-obs, ])
  ## now build the stan data object
  standat <- list(N = N, K = K, G = nrow(df),
                  y = df$y,
                  X = X_ord, # put in order
                  gsize = df$gsize,
                  delta_true = delta,
                  z_true = c(select_ind[obs], select_ind[-obs]),
                  p_true = pnorm(X_ord %*% delta))
  standat
}

# set seed
set.seed(2021)

# make data
standat <- make_simulated_data(N = 1000, K = 3, G = 32,
                               delta = rnorm(3, 0, 2), obs_frac = .33)

# now sample
# sample probit!
pb_draws <- mod_pb_probit$sample(data = standat, chains = 4, parallel_chains = 4,
                                 iter_sampling = 4000, adapt_delta = .9)

# check performance
mcmc_recover_hist(pb_draws$draws(variables = c("mu", "delta")),
                  true = c(0, standat$delta_true))
pb_draws
mcmc_recover_scatter(pb_draws$draws(variables = "p"), true = standat$p_true)

# average 95% interval coverage of the individual probabilities
pb_draws$summary(variables = "p", function(x) quantile(x, c(.025, .975))) %>%
  mutate(p_true = standat$p_true,
         in_interval = p_true >= `2.5%` & p_true <= `97.5%`) %>%
  summarise(ave_cov = mean(in_interval))

# do the 95% intervals cover the true coefficients?
pb_draws$summary(variables = "delta", function(x) quantile(x, c(.025, .975))) %>%
  mutate(delta_true = standat$delta_true,
         in_interval = delta_true >= `2.5%` & delta_true <= `97.5%`)

# sample logit
pb_draws_logit <- mod_pb_logit$sample(data = standat, chains = 4, parallel_chains = 4,
                                      iter_sampling = 5000)

# check performance
mcmc_recover_hist(pb_draws_logit$draws(variables = c("mu", "delta")),
                  true = c(0, standat$delta_true))
pb_draws_logit
mcmc_recover_scatter(pb_draws_logit$draws(variables = "p"), true = standat$p_true)

# average 95% interval coverage of the individual probabilities
pb_draws_logit$summary(variables = "p", function(x) quantile(x, c(.025, .975))) %>%
  mutate(p_true = standat$p_true,
         in_interval = p_true >= `2.5%` & p_true <= `97.5%`) %>%
  summarise(ave_cov = mean(in_interval))

# do the 95% intervals cover the true coefficients?
pb_draws_logit$summary(variables = "delta", function(x) quantile(x, c(.025, .975))) %>%
  mutate(delta_true = standat$delta_true,
         in_interval = delta_true >= `2.5%` & delta_true <= `97.5%`)
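# note: the interval-coverage pipeline above is repeated verbatim for each fit,
# so a small helper like the one sketched here (not used for the results in
# this file) could replace the copy-pasted blocks; it assumes a cmdstanr fit
# whose parameter vector "p" has length N, as in all three models here.
ave_coverage_p <- function(fit, true_p) {
  fit$summary(variables = "p", function(x) quantile(x, c(.025, .975))) %>%
    mutate(p_true = true_p,
           in_interval = p_true >= `2.5%` & p_true <= `97.5%`) %>%
    summarise(ave_cov = mean(in_interval))
}
# e.g. ave_coverage_p(pb_draws, standat$p_true) matches the probit summary above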
# sample averaging probits!
pb_ma_draws <- mod_pb_ma$sample(data = standat, chains = 4, parallel_chains = 4,
                                iter_sampling = 5000)

# check performance
mcmc_recover_hist(pb_ma_draws$draws(variables = c("mu", "delta")),
                  true = c(0, standat$delta_true))
pb_ma_draws
mcmc_recover_scatter(pb_ma_draws$draws(variables = "p"), true = standat$p_true)

## compare classification accuracy
p_poi_bin <- pb_draws$summary(variables = "p", "median")$median
p_poi_bin_logit <- pb_draws_logit$summary(variables = "p", "median")$median
p_ma_approx <- pb_ma_draws$summary(variables = "p", "median")$median

# sum of squared errors for probit
crossprod(p_poi_bin - standat$p_true)
# sum of squared errors for logit
crossprod(p_poi_bin_logit - standat$p_true)
# sum of squared errors for the mean-probability approximation
crossprod(p_ma_approx - standat$p_true)

# evaluating coverage: fit one simulated data set with the supplied model and
# record whether each 95% interval covers the true parameter
f_coverage_rep <- function(N, K, G, obs_frac, f_) {
  df <- make_simulated_data(N, K, G, delta = rnorm(K, 0, 5), obs_frac)
  est <- f_$sample(data = df, chains = 4, parallel_chains = 4,
                   iter_sampling = 4000, refresh = 0)
  # record which model object was passed in
  res <- tibble(model = rep(deparse(substitute(f_)), times = N + K + 1),
                G = G, obs_frac = obs_frac)
  prob_cov <- est$summary(variables = c("mu", "delta", "p"),
                          function(x) quantile(x, c(.025, .975))) %>%
    mutate(param_true = c(0, df$delta_true, df$p_true),
           in_95_ci = param_true >= `2.5%` & param_true <= `97.5%`)
  bind_cols(res, prob_cov)
}

f_coverage_rep(1000, 3, 32, .33, f_ = mod_pb_logit)

# Calibration. I ran this for the logit model with 20 reps (takes about 20
# minutes on my unspectacular macbook; probit takes longer, the mean
# approximation is fast).
reps <- 20
pb <- progress_estimated(reps)
sims_logit <- map_dfr(rep(1000, reps), .f = function(n) {
  pb$tick()$print()
  f_coverage_rep(N = n, K = 3, G = 32, obs_frac = .33, f_ = mod_pb_logit)
})

sims_logit %>%
  filter(variable %in% c("delta[1]", "delta[2]", "delta[3]", "mu")) %>%
  group_by(variable) %>%
  summarise(mean_cov = mean(in_95_ci))

sims_logit
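# Sketch for extending the calibration to all three models (not run here, since
# the probit model is much slower); the object names below are illustrative.
# The model label is overwritten because f_coverage_rep only sees the loop
# variable, not the original model name.
models <- list(probit = mod_pb_probit, logit = mod_pb_logit, mean_approx = mod_pb_ma)
sims_all <- imap_dfr(models, function(m, nm) {
  map_dfr(rep(1000, reps), function(n) {
    f_coverage_rep(N = n, K = 3, G = 32, obs_frac = .33, f_ = m) %>%
      mutate(model = nm)
  })
})
# average 95% coverage by model, separately for the p's and for mu/delta
sims_all %>%
  mutate(block = if_else(str_detect(variable, "^p\\["), "p", "mu/delta")) %>%
  group_by(model, block) %>%
  summarise(mean_cov = mean(in_95_ci), .groups = "drop")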