Got it, thanks!
I have a couple of more questions. I re-wrote the code chunks showed before to better explain my rationale.
First, I will replicate the dataset in which all rows contain N = 50 and fit the model.
# Install/load packages
pacman::p_load(magrittr,
dplyr,
brms)
pacman::p_load_gh("stan/cmdstanr")
# fake data
## Version 1 (N = 50) ----
## long format
set.seed(123)
dat_long <- data.frame(
subj = rep(1:10, each = 100),
item = rep(1:10, each = 10),
bin = rep(1:10, times = 10),
cond = c(-.5,.5),
pTarget = rbinom(1000, 1, .6)
)
## aggregated over bins/items
dat_aggregated1 <- dat_long %>%
dplyr::group_by(subj, cond) %>%
dplyr::summarise(sum = sum(pTarget), N = length(pTarget))
# model using addition-terms
m_aggregated1 <- brm(formula = sum | trials(N) ~ cond,
family = binomial(),
iter = 5000,
backend = "cmdstanr",
data = dat_aggregated1)
And here is the PPC code.
I generated figures using both brms::pp_check() and your approach to compare the shape of Y in each output. I noticed you used 1 for MARGIN in sweep(), which is a rowwise operation. Honestly I do not fully understand how MARGIN = 1 worked here, since length(dat_aggregated1$N) = 20 and dim(pred1) = 10000 20. I expected that we would need a vector with 10000 values in STATS when using MARGIN = 1 in this case. Nevertheless, it worked and both Y look identical.
# posterior predictive check
set.seed(123)
pred1 <- brms::posterior_predict(m_aggregated1, cores = 1)
# Divide the samples for each row by the corresponding number of trials
pred_divided1_row <- sweep(pred1,
MARGIN = 1, #rowwise
STATS = dat_aggregated1$N, # length = 20
FUN = "/")
y_divided1 <- dat_aggregated1$sum / dat_aggregated1$N
samples_to_show1 <- sample(1:(dim(pred_divided1_row)[1]), size = 30)
# Same Y
brms::pp_check(m_aggregated1, ndraws = 30)
bayesplot::ppc_dens_overlay(y_divided1, pred_divided1_row[samples_to_show1, ])
I then generated grouped PPCs, and again both look perfect.
# Same Y
brms::pp_check(m_aggregated1,
type = "dens_overlay_grouped",
group = "cond",
ndraws = 30)
bayesplot::ppc_dens_overlay_grouped(y_divided1,
pred_divided1_row[samples_to_show1, ],
group = dat_aggregated1$cond)
I now tried to generate the same figure with MARGIN = 2 (column wise operation) instead, which in this case looked identical to MARGIN = 1 output
# Above you ran sweep() with MARGIN = 1, thus rowwise, using
# values from dat_aggregated1$N, which has length = 20, for STATS
# pred1, however, has 20 columns, not 20 rows
dim(pred1)
# Now let's use MARGIN = 2, thus columnwise
pred_divided1_col <- sweep(pred1,
MARGIN = 2, #columnwise
STATS = dat_aggregated1$N, # length = 20
FUN = "/")
# Same Y
bayesplot::ppc_dens_overlay(y_divided1,
# MARGIN = 1
pred_divided1_row[samples_to_show1, ])
bayesplot::ppc_dens_overlay(y_divided1,
# MARGIN = 2
pred_divided1_col[samples_to_show1, ])
# Same Y
bayesplot::ppc_dens_overlay_grouped(y_divided1,
# MARGIN = 1
pred_divided1_row[samples_to_show1, ],
group = dat_aggregated1$cond)
bayesplot::ppc_dens_overlay_grouped(y_divided1,
# MARGIN = 2
pred_divided1_col[samples_to_show1, ],
group = dat_aggregated1$cond)
Later, I tried to replicate this workflow with a personal dataset and it looked very odd. I figured that it wasn’t working because my dataset does not contain identical Ns in each row, as it was the case above (N = 50).
Thus I now generated a fake dataset with different N and fit a corresponding model.
# Version 2 (varying N)
set.seed(123)
dat_aggregated2 <- dat_long %>%
dplyr::group_by(subj, cond) %>%
dplyr::summarise(sum = sum(pTarget), N = length(pTarget)) %>%
# Add numbers to randomly change N rowwise
dplyr::mutate(N = (N + round(runif(1, 0, 30))))
# model using addition-terms
m_aggregated2 <- brm(formula = sum | trials(N) ~ cond,
family = binomial(),
iter = 5000,
backend = "cmdstanr",
data = dat_aggregated2)
And here is the PPC code.
# posterior predictive check
set.seed(123)
pred2 <- brms::posterior_predict(m_aggregated2, cores = 1)
# Divide the samples for each row by the corresponding number of trials
pred_divided2_row <- sweep(pred2,
MARGIN = 1, #rowwise
STATS = dat_aggregated2$N,
FUN = "/")
y_divided2 <- dat_aggregated2$sum / dat_aggregated2$N
samples_to_show2 <- sample(1:(dim(pred_divided2_row)[1]), size = 30)
# Different Y!
brms::pp_check(m_aggregated2, ndraws = 30)
bayesplot::ppc_dens_overlay(y_divided2, pred_divided2_row[samples_to_show2, ])
# Different Y!
brms::pp_check(m_aggregated2,
type = "dens_overlay_grouped",
group = "cond",
ndraws = 30)
bayesplot::ppc_dens_overlay_grouped(y_divided2,
pred_divided2_row[samples_to_show2, ],
group = dat_aggregated2$cond)
# Now with MARGIN = 2
pred_divided2_col <- sweep(pred2,
MARGIN = 2, #columnwise
STATS = dat_aggregated2$N, # length = 20
FUN = "/")
# Different Y
brms::pp_check(m_aggregated2, ndraws = 30)
bayesplot::ppc_dens_overlay(y_divided2, pred_divided2_col[samples_to_show2, ])
# Same Y
brms::pp_check(m_aggregated2,
type = "dens_overlay_grouped",
group = "cond",
ndraws = 30)
bayesplot::ppc_dens_overlay_grouped(y_divided2,
pred_divided2_col[samples_to_show2, ],
group = dat_aggregated2$cond)
# MARGINs 1 vs. 2
# Different Y!
bayesplot::ppc_dens_overlay(y_divided2,
# MARGIN = 1
pred_divided2_row[samples_to_show2, ])
bayesplot::ppc_dens_overlay(y_divided2,
# MARGIN = 2
pred_divided2_col[samples_to_show2, ])
# Different Y!
bayesplot::ppc_dens_overlay_grouped(y_divided2,
# MARGIN = 1
pred_divided2_row[samples_to_show2, ],
group = dat_aggregated2$cond)
bayesplot::ppc_dens_overlay_grouped(y_divided2,
# MARGIN = 2
pred_divided2_col[samples_to_show2, ],
group = dat_aggregated2$cond)
Oddly, the shape of Y is different in each plot, ie with brms::pp_check(), MARGIN = 1, MARGIN = 2, grouped or not…
I do not understand why this is happening. Do you?
Lastly, shouldn’t MARGIN = 2 be the default here, given that brms::posterior_predict() generates a matrix where each column corresponds to each row in the original dataframe?
Thanks!
Full code below:
# Install/load packages
pacman::p_load(magrittr,
dplyr,
brms)
pacman::p_load_gh("stan/cmdstanr")
# fake data
## Version 1 (N = 50) ----
## long format
set.seed(123)
dat_long <- data.frame(
subj = rep(1:10, each = 100),
item = rep(1:10, each = 10),
bin = rep(1:10, times = 10),
cond = c(-.5,.5),
pTarget = rbinom(1000, 1, .6)
)
## aggregated over bins/items
dat_aggregated1 <- dat_long %>%
dplyr::group_by(subj, cond) %>%
dplyr::summarise(sum = sum(pTarget), N = length(pTarget))
# model using addition-terms
m_aggregated1 <- brm(formula = sum | trials(N) ~ cond,
family = binomial(),
iter = 5000,
backend = "cmdstanr",
data = dat_aggregated1)
# posterior predictive check
set.seed(123)
pred1 <- brms::posterior_predict(m_aggregated1, cores = 1)
# Divide the samples for each row by the corresponding number of trials
pred_divided1_row <- sweep(pred1,
MARGIN = 1, #rowwise
STATS = dat_aggregated1$N, # length = 20
FUN = "/")
y_divided1 <- dat_aggregated1$sum / dat_aggregated1$N
samples_to_show1 <- sample(1:(dim(pred_divided1_row)[1]), size = 30)
# Same Y
brms::pp_check(m_aggregated1, ndraws = 30)
bayesplot::ppc_dens_overlay(y_divided1, pred_divided1_row[samples_to_show1, ])
# Same Y
brms::pp_check(m_aggregated1,
type = "dens_overlay_grouped",
group = "cond",
ndraws = 30)
bayesplot::ppc_dens_overlay_grouped(y_divided1,
pred_divided1_row[samples_to_show1, ],
group = dat_aggregated1$cond)
# Above you ran sweep() with MARGIN = 1, thus rowwise, using
# values from dat_aggregated1$N, which has length = 20, for STATS
# pred1, however, has 20 columns, not 20 rows
dim(pred1)
# Now let's use MARGIN = 2, thus columnwise
pred_divided1_col <- sweep(pred1,
MARGIN = 2, #columnwise
STATS = dat_aggregated1$N, # length = 20
FUN = "/")
# Same Y
bayesplot::ppc_dens_overlay(y_divided1,
# MARGIN = 1
pred_divided1_row[samples_to_show1, ])
bayesplot::ppc_dens_overlay(y_divided1,
# MARGIN = 2
pred_divided1_col[samples_to_show1, ])
# Same Y
bayesplot::ppc_dens_overlay_grouped(y_divided1,
# MARGIN = 1
pred_divided1_row[samples_to_show1, ],
group = dat_aggregated1$cond)
bayesplot::ppc_dens_overlay_grouped(y_divided1,
# MARGIN = 2
pred_divided1_col[samples_to_show1, ],
group = dat_aggregated1$cond)
# Version 2 (varying N)
set.seed(123)
dat_aggregated2 <- dat_long %>%
dplyr::group_by(subj, cond) %>%
dplyr::summarise(sum = sum(pTarget), N = length(pTarget)) %>%
# Add numbers to randomly change N rowwise
dplyr::mutate(N = (N + round(runif(1, 0, 30))))
# model using addition-terms
m_aggregated2 <- brm(formula = sum | trials(N) ~ cond,
family = binomial(),
iter = 5000,
backend = "cmdstanr",
data = dat_aggregated2)
# posterior predictive check
set.seed(123)
pred2 <- brms::posterior_predict(m_aggregated2, cores = 1)
# Divide the samples for each row by the corresponding number of trials
pred_divided2_row <- sweep(pred2,
MARGIN = 1, #rowwise
STATS = dat_aggregated2$N,
FUN = "/")
y_divided2 <- dat_aggregated2$sum / dat_aggregated2$N
samples_to_show2 <- sample(1:(dim(pred_divided2_row)[1]), size = 30)
# Different Y!
brms::pp_check(m_aggregated2, ndraws = 30)
bayesplot::ppc_dens_overlay(y_divided2, pred_divided2_row[samples_to_show2, ])
# Different Y!
brms::pp_check(m_aggregated2,
type = "dens_overlay_grouped",
group = "cond",
ndraws = 30)
bayesplot::ppc_dens_overlay_grouped(y_divided2,
pred_divided2_row[samples_to_show2, ],
group = dat_aggregated2$cond)
# Now with MARGIN = 2
pred_divided2_col <- sweep(pred2,
MARGIN = 2, #columnwise
STATS = dat_aggregated2$N, # length = 20
FUN = "/")
# Different Y
brms::pp_check(m_aggregated2, ndraws = 30)
bayesplot::ppc_dens_overlay(y_divided2, pred_divided2_col[samples_to_show2, ])
# Same Y
brms::pp_check(m_aggregated2,
type = "dens_overlay_grouped",
group = "cond",
ndraws = 30)
bayesplot::ppc_dens_overlay_grouped(y_divided2,
pred_divided2_col[samples_to_show2, ],
group = dat_aggregated2$cond)
# MARGINs 1 vs. 2
# Different Y!
bayesplot::ppc_dens_overlay(y_divided2,
# MARGIN = 1
pred_divided2_row[samples_to_show2, ])
bayesplot::ppc_dens_overlay(y_divided2,
# MARGIN = 2
pred_divided2_col[samples_to_show2, ])
# Different Y!
bayesplot::ppc_dens_overlay_grouped(y_divided2,
# MARGIN = 1
pred_divided2_row[samples_to_show2, ],
group = dat_aggregated2$cond)
bayesplot::ppc_dens_overlay_grouped(y_divided2,
# MARGIN = 2
pred_divided2_col[samples_to_show2, ],
group = dat_aggregated2$cond)