I was trying to speed up a hierarchical binomial model that I’m using to run some pre-experiment simulations to plan my sample sizes, and I noticed different posteriors between my regular version and my optimized version. I reduced things down to a minimal reproducible example that I’ve included below (including the R code for data generation). In `binom_mod_long.stan`

I use `bernoulli_logit()`

while in `binom_mod_matrix.stan`

I try to use the sufficient statistics trick from the manual. The latter runs much faster but yields a lower n_eff, and even when I boost the requested number of samples to make n_eff match, the posteriors for the `pop_mean`

parameter are discernibly different. Presumably I’ve misunderstood something or made an error somewhere, but I haven’t been able to figure it out myself yet. Help?

`binom_mod_matrix.stan`

```
data {
  // nSubj: number of subjects
  int nSubj ;
  // nK: number of observations per participant
  int nK ;
  // Y: matrix of outcomes (0/1), one row per subject
  int Y[nSubj,nK] ;
  // prior_mean_for_pop_mean: prior location for pop_mean (logit scale)
  real prior_mean_for_pop_mean ;
}
transformed data{
  // Y_sum: per-subject success count — the sufficient statistic for a
  // binomial likelihood with a common per-subject probability
  int Y_sum[nSubj] ;
  for(s in 1:nSubj){
    Y_sum[s] = sum(Y[s,]);
  }
}
parameters {
  // pop_mean: population mean (logit scale)
  real pop_mean ;
  // pop_sd: population sd (logit scale)
  real<lower=0> pop_sd ;
  // subj_coefs: standardized per-subject deviations (unit normal prior;
  // scaled by pop_sd in the likelihood — non-centered parameterization)
  vector[nSubj] subj_coefs ;
}
model {
  // BUG FIX: the original line ended in a stray "a" token
  // ("...,1) a;"), which is a syntax error; removed.
  pop_mean ~ normal(prior_mean_for_pop_mean,1) ;
  pop_sd ~ normal(0,1) ;
  subj_coefs ~ normal(0,1) ; //non-centered parameterization
  // binomial_logit(n, alpha) == binomial(n, inv_logit(alpha)) but is
  // numerically stable on the logit scale.
  Y_sum ~ binomial_logit(nK, pop_mean + subj_coefs*pop_sd) ;
}
```

`binom_mod_long.stan`

```
data {
  // Number of subjects.
  int nSubj ;
  // Total number of observations across all subjects.
  int nY ;
  // Outcomes (0/1), one entry per observation.
  int Y[nY] ;
  // Subject label (1..nSubj) associated with each entry of Y.
  int S[nY] ;
  // Prior location for pop_mean (logit scale).
  real prior_mean_for_pop_mean ;
}
parameters {
  // Population mean on the logit scale.
  real pop_mean ;
  // Population sd on the logit scale.
  real<lower=0> pop_sd ;
  // Standardized per-subject deviations; scaled by pop_sd in the
  // likelihood (non-centered parameterization).
  vector[nSubj] subj_coefs ;
}
model {
  pop_mean ~ normal(prior_mean_for_pop_mean, 1) ;
  pop_sd ~ normal(0, 1) ;
  subj_coefs ~ normal(0, 1) ;
  Y ~ bernoulli_logit(pop_mean + subj_coefs[S] * pop_sd) ;
}
```

`binom_sim.r`

```
# Load packages ----
library(tidyverse)
library(rstan)

# Compile both Stan programs up front so the sampling calls below
# only pay the sampling cost.
binom_mod_matrix <- rstan::stan_model(file = 'binom_mod_matrix.stan')
binom_mod_long <- rstan::stan_model(file = 'binom_mod_long.stan')
# Simulation parameters ----
N <- 100          # number of subjects
K <- 100          # observations per subject
SD <- 1           # population sd on the logit scale
prop_mean <- .8   # population mean, expressed as a proportion

# Set the random seed for reproducibility.
set.seed(1)

# Draw each subject's true coefficient on the logit scale.
subj_true_coefs <- data.frame(
  subj = 1:N
  , value = rnorm(N, qlogis(prop_mean), SD)
)

# Generate the observed 0/1 outcomes: one row per subject, one
# column per observation.
a_mat <- matrix(NA, nrow = N, ncol = K)
for (s in seq_len(nrow(subj_true_coefs))) {
  a_mat[s, ] <- rbinom(K, 1, plogis(subj_true_coefs$value[s]))
}
# Fit the sufficient-statistics (matrix) model ----
data_for_matrix <- list(
  nSubj = N
  , nK = K
  , Y = a_mat
  , prior_mean_for_pop_mean = qlogis(prop_mean)
)
fit_matrix <- rstan::sampling(
  object = binom_mod_matrix
  , data = data_for_matrix
  , chains = 4
  , iter = 2e3
  , refresh = 0
)
# Reshape to long format for the Bernoulli model ----
#
# BUG FIX: the original used `tidyr::gather(as.data.frame(a_mat))`,
# which stacks the *columns* of a_mat. Rows of a_mat are subjects and
# columns are trials, so the resulting `key` labelled the trial index,
# not the subject. Because K == N == 100 here, the bogus subject index
# still passed Stan's bounds and the model was silently fit grouped by
# trial — which is why the two posteriors disagreed. (The old
# `as.numeric(as.factor(key))` also sorted "V1","V10","V100",...
# alphabetically; harmless for exchangeable labels, but confusing.)
# Build the subject index explicitly instead:
a_long <- data.frame(
  subj = rep(seq_len(N), each = K)   # row i of a_mat, repeated K times
  , value = as.vector(t(a_mat))      # t() puts values in subject order
)
# Sample the long-formatted data.
fit_long <- rstan::sampling(
  object = binom_mod_long
  , data = list(
    nSubj = N
    , nY = N * K
    , Y = a_long$value
    , S = a_long$subj
    , prior_mean_for_pop_mean = qlogis(prop_mean)
  )
  , chains = 4
  , iter = 2e3
  , refresh = 0
)
#How long did each take
# (mean per-chain wall time: warmup + sampling, averaged over chains)
mean(rowSums(get_elapsed_time(fit_matrix)))
# <1s
mean(rowSums(get_elapsed_time(fit_long)))
# ~25s
#check whether the samples for the pop_mean parameter roughly match
# NOTE(review): the pasted outputs below are from the run described in
# the post; the two pop_mean posteriors visibly disagree. The long fit
# was produced from the gather()-reshaped data above — verify that the
# subject index S actually labels subjects (rows of a_mat), not trials
# (columns), before trusting the comparison.
summary(fit_matrix,par=c('pop_mean'))$summary
# mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
#pop_mean 1.465358 0.003453002 0.08865914 1.28569 1.408115 1.465362 1.52566 1.638436 659.2549 1.002375
summary(fit_long,par=c('pop_mean'))$summary
# mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
#pop_mean 1.294175 0.000279221 0.02465094 1.246923 1.277222 1.293975 1.31059 1.342536 7794.187 0.9995637
```