Overall change in hurdle model with random effects

I’m using the brms package to test the overall change in a hurdle model. I have two time periods for two treatment groups of individual observations and I want to see if there’s a difference in overall change in the average between the groups. I’m not sure what to do if the both the hurdle (0’s) and non-zero’s average increase. Also, I have concerns I’m not including the random intercepts in the testing appropriately, especially because they are important and correlated between the two parts of the model.

Attached is some R code to simulate and analyze similar data. In the analysis, the interaction effect for time period and treatment is different from 0 for both parts of the hurdle model, but the changes roughly cancel out each other. This can be seen in the marginal effects plots.

I have two hypothesis set up for testing. One uses fitted values and the other uses model parameters. While they agree in this case, is one of them better to use and why?

library(MASS)
library(brms)
library(ggplot2)

set.seed(08102018)
## Simulate data
# Subject IDs, Covariates
n_obs <- 2000
outcomes_sim <- data.frame(subject = rep(seq(1, n_obs), each = 2),
                           time_period = rep(c(0, 1), times = n_obs/2),
                           treatment = rep(c(0, 1), each = n_obs/2))

# Correlated Random Effects
random_effects <- mvrnorm(n = n_obs, 
                          mu = c(0, 0),
                          Sigma = matrix(c(1, sqrt(1)*sqrt(1.5)*-.6, sqrt(1)*sqrt(1.5)*-.6, 1.5),
                                         nrow = 2))
outcomes_sim$gamma_random_effect <- rep(random_effects[, 1], each = 2)
outcomes_sim$logistic_random_effect <- rep(random_effects[, 2], each = 2)

# Interaction variable
outcomes_sim$interaction <- outcomes_sim$time_period * outcomes_sim$treatment

## Gamma
# Baseline, average = 500, log link
mu <- log(300)

# Time Period Effect
tp <- 0

# Treatment Effect
trt <- 0

# Interaction
tp_trt <- log(345) - mu

# Shape
shape <- 1.7

# Rate, log link
rate <- shape / exp( mu + 
                      outcomes_sim$gamma_random_effect +
                      outcomes_sim$time_period * tp +
                      outcomes_sim$treatment * trt +
                      outcomes_sim$interaction * tp_trt)

# Gamma result
outcomes_sim$gamma <- rgamma(n = length(unique(outcomes_sim$subject)), 
                             shape = shape, 
                             rate = rate)

## Logistic
# Baseline = .1
mu <- log(.06 / (1-.06))

# Time Period Effect
tp <- 0

# Treatment Effect
trt <- 0

# Interaction
tp_trt <- log(.18 / (1-.18)) - mu

# Probability
prob <- plogis(mu +
  outcomes_sim$logistic_random_effect +
  outcomes_sim$time_period * tp +
  outcomes_sim$treatment * trt +
  outcomes_sim$interaction * tp_trt)

# Logistic result
outcomes_sim$logistic <- rbinom(n = length(unique(outcomes_sim$subject)), 
                                size = 1,
                                prob = 1 - prob)

## Hurdle 
outcomes_sim$y <- outcomes_sim$gamma * outcomes_sim$logistic

ggplot(data = outcomes_sim, aes(y)) +
  geom_histogram(binwidth = 100) +
  facet_grid(treatment ~ time_period, labeller = label_both)

# Gamma part average increase
aggregate(outcomes_sim[outcomes_sim$y > 0, ]$y, 
          by = list(outcomes_sim[outcomes_sim$y > 0, ]$time_period, 
                    outcomes_sim[outcomes_sim$y > 0, ]$treatment), 
          FUN = mean )

# Logistic part = 0 increase
aggregate(outcomes_sim[outcomes_sim$y == 0, ]$y, 
          by = list(outcomes_sim[outcomes_sim$y == 0, ]$time_period, 
                    outcomes_sim[outcomes_sim$y == 0, ]$treatment), 
          FUN = length )

# Average change
aggregate(outcomes_sim$y, 
          by = list(outcomes_sim$time_period, 
                    outcomes_sim$treatment), 
          FUN = mean )

# brms model
outcomes_sim$time_period <- as.factor(outcomes_sim$time_period)
outcomes_sim$treatment <- as.factor(outcomes_sim$treatment)

fit_model <- brm(bf(formula = y ~ time_period + treatment + time_period*treatment + (1|subject|subject), 
                    hu ~ time_period + treatment + time_period*treatment + (1|subject|subject), 
                    nl = FALSE), 
                 data = outcomes_sim, 
                 family = hurdle_gamma(link = "log"))


summary(fit_model) # Should look good
marginal_effects(fit_model) # No random effects

marginal_effects(fit_model, re_formula = NULL) # Include random effects

newdata = data.frame(subject = c(0, 0, 0, 0),
                     time_period  = c(0, 0, 1, 1),
                     treatment = c(0, 1, 0, 1),
                     label = c("tp_0_trt_0", "tp_0_trt_1", "tp_1_trt_0", "tp_1_trt_1"))

fi <- data.frame(fitted(fit_model, newdata, summary = FALSE, allow_new_levels = TRUE))
names(fi) <- c("tp_0_trt_0", "tp_0_trt_1", "tp_1_trt_0", "tp_1_trt_1")
hypothesis(fi, c("(tp_1_trt_0 - tp_0_trt_0) - (tp_1_trt_1 - tp_0_trt_1) = 0"))


print_formula <- function(values){
  paste0("(exp(", paste0(values, collapse = " + "), ") * (1 - (1/(1+exp(-(", paste0(paste0("hu_", values), collapse = " + "), "))))) )")
}
hypo <- paste0("( ", print_formula(c("Intercept", "time_period1")), " - ", print_formula(c("Intercept")), " ) - ( ",
               print_formula(c("Intercept", "time_period1", "treatment1", "time_period1:treatment1")), " - ", print_formula(c("Intercept", "treatment1")), ") = 0")
hypothesis(fit_model, c(hypo))

I can add priors, switch to better data handling packages, etc. if needed.

Please add the “brms” tag to brms related questions (I have done it for you this time).

To combine estimates from both model parts (i.e. estimate the mean), I recommend using the fitted method (as it returns results of means) with newdata equal to the conditions you want to compare.
And this seems to be exactly what you did :-)

Awesome, thanks!