I’m using the brms package to test the overall change in a hurdle model. I have two time periods for two treatment groups of individual observations and I want to see if there’s a difference in overall change in the average between the groups. I’m not sure what to do if the both the hurdle (0’s) and non-zero’s average increase. Also, I have concerns I’m not including the random intercepts in the testing appropriately, especially because they are important and correlated between the two parts of the model.
Attached is some R code to simulate and analyze similar data. In the analysis, the interaction effect for time period and treatment is different from 0 for both parts of the hurdle model, but the changes roughly cancel out each other. This can be seen in the marginal effects plots.
I have two hypothesis set up for testing. One uses fitted values and the other uses model parameters. While they agree in this case, is one of them better to use and why?
library(MASS)
library(brms)
library(ggplot2)
set.seed(08102018)
## Simulate data
# Subject IDs, Covariates
n_obs <- 2000
outcomes_sim <- data.frame(subject = rep(seq(1, n_obs), each = 2),
time_period = rep(c(0, 1), times = n_obs/2),
treatment = rep(c(0, 1), each = n_obs/2))
# Correlated Random Effects
random_effects <- mvrnorm(n = n_obs,
mu = c(0, 0),
Sigma = matrix(c(1, sqrt(1)*sqrt(1.5)*-.6, sqrt(1)*sqrt(1.5)*-.6, 1.5),
nrow = 2))
outcomes_sim$gamma_random_effect <- rep(random_effects[, 1], each = 2)
outcomes_sim$logistic_random_effect <- rep(random_effects[, 2], each = 2)
# Interaction variable
outcomes_sim$interaction <- outcomes_sim$time_period * outcomes_sim$treatment
## Gamma
# Baseline, average = 500, log link
mu <- log(300)
# Time Period Effect
tp <- 0
# Treatment Effect
trt <- 0
# Interaction
tp_trt <- log(345) - mu
# Shape
shape <- 1.7
# Rate, log link
rate <- shape / exp( mu +
outcomes_sim$gamma_random_effect +
outcomes_sim$time_period * tp +
outcomes_sim$treatment * trt +
outcomes_sim$interaction * tp_trt)
# Gamma result
outcomes_sim$gamma <- rgamma(n = length(unique(outcomes_sim$subject)),
shape = shape,
rate = rate)
## Logistic
# Baseline = .1
mu <- log(.06 / (1-.06))
# Time Period Effect
tp <- 0
# Treatment Effect
trt <- 0
# Interaction
tp_trt <- log(.18 / (1-.18)) - mu
# Probability
prob <- plogis(mu +
outcomes_sim$logistic_random_effect +
outcomes_sim$time_period * tp +
outcomes_sim$treatment * trt +
outcomes_sim$interaction * tp_trt)
# Logistic result
outcomes_sim$logistic <- rbinom(n = length(unique(outcomes_sim$subject)),
size = 1,
prob = 1 - prob)
## Hurdle
outcomes_sim$y <- outcomes_sim$gamma * outcomes_sim$logistic
ggplot(data = outcomes_sim, aes(y)) +
geom_histogram(binwidth = 100) +
facet_grid(treatment ~ time_period, labeller = label_both)
# Gamma part average increase
aggregate(outcomes_sim[outcomes_sim$y > 0, ]$y,
by = list(outcomes_sim[outcomes_sim$y > 0, ]$time_period,
outcomes_sim[outcomes_sim$y > 0, ]$treatment),
FUN = mean )
# Logistic part = 0 increase
aggregate(outcomes_sim[outcomes_sim$y == 0, ]$y,
by = list(outcomes_sim[outcomes_sim$y == 0, ]$time_period,
outcomes_sim[outcomes_sim$y == 0, ]$treatment),
FUN = length )
# Average change
aggregate(outcomes_sim$y,
by = list(outcomes_sim$time_period,
outcomes_sim$treatment),
FUN = mean )
# brms model
outcomes_sim$time_period <- as.factor(outcomes_sim$time_period)
outcomes_sim$treatment <- as.factor(outcomes_sim$treatment)
fit_model <- brm(bf(formula = y ~ time_period + treatment + time_period*treatment + (1|subject|subject),
hu ~ time_period + treatment + time_period*treatment + (1|subject|subject),
nl = FALSE),
data = outcomes_sim,
family = hurdle_gamma(link = "log"))
summary(fit_model) # Should look good
marginal_effects(fit_model) # No random effects
marginal_effects(fit_model, re_formula = NULL) # Include random effects
newdata = data.frame(subject = c(0, 0, 0, 0),
time_period = c(0, 0, 1, 1),
treatment = c(0, 1, 0, 1),
label = c("tp_0_trt_0", "tp_0_trt_1", "tp_1_trt_0", "tp_1_trt_1"))
fi <- data.frame(fitted(fit_model, newdata, summary = FALSE, allow_new_levels = TRUE))
names(fi) <- c("tp_0_trt_0", "tp_0_trt_1", "tp_1_trt_0", "tp_1_trt_1")
hypothesis(fi, c("(tp_1_trt_0 - tp_0_trt_0) - (tp_1_trt_1 - tp_0_trt_1) = 0"))
print_formula <- function(values){
paste0("(exp(", paste0(values, collapse = " + "), ") * (1 - (1/(1+exp(-(", paste0(paste0("hu_", values), collapse = " + "), "))))) )")
}
hypo <- paste0("( ", print_formula(c("Intercept", "time_period1")), " - ", print_formula(c("Intercept")), " ) - ( ",
print_formula(c("Intercept", "time_period1", "treatment1", "time_period1:treatment1")), " - ", print_formula(c("Intercept", "treatment1")), ") = 0")
hypothesis(fit_model, c(hypo))
I can add priors, switch to better data handling packages, etc. if needed.