I am trying to emulate an analysis from a vignette in the GLMMadaptive package in R. It is a relatively simple panel design: a binary response measured in two groups over time.
Here is simulated data for a binary longitudinal outcome:
library(GLMMadaptive)
library(dplyr)
library(ggplot2)  # needed for the plots below
library(rstan)    # needed for the Bayesian model below
set.seed(1234)
n <- 100 # number of subjects
K <- 8 # number of measurements per subject
t_max <- 15 # maximum follow-up time
# we construct a data frame with the design:
# everyone has a baseline measurement, and then measurements at random follow-up times
DF <- data.frame(id = rep(seq_len(n), each = K),
                 time = c(replicate(n, c(0, sort(runif(K - 1, 0, t_max))))),
                 sex = rep(gl(2, n/2, labels = c("male", "female")), each = K))
# design matrices for the fixed and random effects
X <- model.matrix(~ sex * time, data = DF)
Z <- model.matrix(~ time, data = DF)
betas <- c(-2.13, -0.25, 0.24, -0.05) # fixed effects coefficients
D11 <- 0.48 # variance of random intercepts
D22 <- 0.1 # variance of random slopes
# we simulate random effects
b <- cbind(rnorm(n, sd = sqrt(D11)), rnorm(n, sd = sqrt(D22)))
# linear predictor
eta_y <- as.vector(X %*% betas + rowSums(Z * b[DF$id, ]))
# we simulate binary longitudinal data
DF$y <- rbinom(n * K, 1, plogis(eta_y))
Let’s plot these data to get some idea of what they look like. To plot them well we need to bin the time variable.
# bin time into unit intervals: [0,1) -> 1, [1,2) -> 2, ..., [14,15] -> 15
DF$timeCat <- cut(DF$time, breaks = 0:15, labels = FALSE,
                  right = FALSE, include.lowest = TRUE)
# make a summary table of proportions of events by group and time point
margDF <- DF %>%
  group_by(sex, timeCat) %>%
  summarise(mean = mean(y), sd = sd(y), count = n())
# now plot the binned data
ggplot(margDF, aes(x = timeCat, y = mean, colour = sex, shape = sex)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(breaks = seq(1, 15, 1))
It looks like the outcome gets steadily more likely over time, and that the two sexes start out at different levels but increase at about the same rate.
Now we fit a mixed-effects logistic regression in the GLMMadaptive package.
# We continue by fitting the mixed effects logistic regression for y, assuming random intercepts for the random-effects part.
fm <- mixed_model(fixed = y ~ sex * time,
                  random = ~ 1 | id,
                  data = DF,
                  family = binomial())
# obtain marginal (not subject-specific) coefficients (WARNING: takes about 20 seconds to run)
margMod <- marginal_coefs(fm, std_errors = TRUE, cores = 3)
margMod
# output
#                Estimate Std.Err z-value p-value
# (Intercept)     -1.5919  0.2609 -6.1007 < 1e-04
# sexfemale       -0.8907  0.4340 -2.0521 0.04016
# time             0.1732  0.0236  7.3545 < 1e-04
# sexfemale:time   0.0331  0.0354  0.9340 0.35033
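For reference, I also pulled out the subject-specific (conditional) coefficients from the same fit, so I have both scales on hand when comparing against the Bayesian output:
# subject-specific (conditional) coefficients from the same fit
summary(fm)   # full coefficient table with standard errors
fixef(fm)     # fixed-effects point estimates only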
Now for the Bayesian version
# data list for Stan
dList <- list(N = nrow(DF),
              nID = nlevels(as.factor(DF$id)),
              id = DF$id,
              time = DF$time,
              sex = as.numeric(DF$sex) - 1,
              outcome = DF$y)
# model string for longitudinal logistic regression with reference-level parameterisation
mString <- "
data{
  int<lower=1> N;
  int<lower=1> nID;
  int<lower=1> id[N];
  real time[N];
  int<lower=0,upper=1> sex[N];
  int<lower=0,upper=1> outcome[N];
}
parameters{
  real a;
  real bSex;
  real bTime;
  real bSxT;
  vector[nID] bSubj;
  real<lower=0> sigma_subj;
}
model{
  // hyper-prior on the random-intercept SD
  sigma_subj ~ normal(0, 2);
  // priors
  bSex ~ normal(0, 2);
  bTime ~ normal(0, 2);
  bSxT ~ normal(0, 2);
  // subject-level random intercepts, with SD given by the hyper-parameter
  bSubj ~ normal(0, sigma_subj);
  // likelihood
  for (i in 1:N) {
    outcome[i] ~ bernoulli_logit(a + bSex*sex[i] + bTime*time[i] + bSxT*sex[i]*time[i] + bSubj[id[i]]);
  }
}
"
# run the model
longLogitCont <- stan(model_code = mString,
                      data = dList,
                      iter = 2e3,
                      warmup = 1e3,
                      cores = 3,
                      chains = 3,
                      seed = 123)
Now for the model summary
print(longLogitCont, pars = c("a", "bSex", "bTime", "bSxT"), probs = c(0.025, 0.975))
# output
#        mean se_mean   sd  2.5% 97.5% n_eff Rhat
# a     -2.64    0.01 0.40 -3.46 -1.86  1589    1
# bSex  -1.21    0.01 0.60 -2.41 -0.06  1746    1
# bTime  0.29    0.00 0.04  0.22  0.36  2845    1
# bSxT   0.03    0.00 0.05 -0.07  0.14  3060    1
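Before reading too much into the estimates I also ran rstan's built-in diagnostics; this is just the standard check, nothing model-specific:
# quick convergence checks with rstan's built-in helpers
check_hmc_diagnostics(longLogitCont)   # divergences, tree depth, E-BFMI
traceplot(longLogitCont, pars = c("a", "bSex", "bTime", "bSxT"))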
So the model converges quite well, but why are the parameter estimates so different from the marginal coefficients from the GLMMadaptive package?
Can anyone see anything wrong with the way my model is parameterised? I tried a reference-level parameterisation, as used in the NHST model from the GLMMadaptive package.
My understanding was that hierarchical Bayesian models of this sort naturally deliver marginal coefficients anyway, so I expected the results from the NHST analysis with marginal coefficients and the Bayesian analysis to be more similar. They are in the same direction, but the Bayesian regression coefficients for the intercepts and the slopes are a fair bit larger.
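As a rough check of that understanding, I tried shrinking the Stan posterior draws by the usual attenuation factor for a logistic model with a normal random intercept, 1 / sqrt(1 + 0.346 * sigma^2) (Zeger et al. 1988), to see whether that brings them closer to the GLMMadaptive marginal coefficients. This approximation is my own assumption, not something from the vignette, so it may well be the wrong comparison:
# approximate marginal coefficients from the conditional posterior draws,
# using the attenuation factor 1 / sqrt(1 + 0.346 * sigma_subj^2)
post <- as.data.frame(longLogitCont)
shrink <- 1 / sqrt(1 + 0.346 * post$sigma_subj^2)
colMeans(post[, c("a", "bSex", "bTime", "bSxT")] * shrink)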
Alternatively, is it something to do with my priors, or with my not specifying an error-covariance matrix, as mentioned in the final answer in this previous post: https://discourse.mc-stan.org/t/bayesian-repeated-measures-logistic-regression/13275?