Hi everybody!
First time posting so please let me know if I’m breaking any rules, or missing something.
I’m running a hierarchical cumulative regression model with brms as a meta analysis of 15 papers about the ability to detect fake-news. The model formula is this:
formula <- bf(value ~ time*target + (time*target | study/id),
disc ~ 0 + Intercept + time*target + (0 + dummy(time, "post") * dummy(target, "fake") | study/id))
The predictors “time” and “target” are categorical and represent the measurement (“pretest” vs. “posttest”) and veracity of item (“true”, “fake”).
The ugly parameterization of disc is due to the fact that we fixed it’s intercept to 0, and therefore don’t want to model the covariances of it with other parameters.
This is the part of the prior I’ve defined, the rest of the prior is default brms:
prior <- set_prior("normal(0, 0.2)", coef = "timepost") + # change in criterion in the posttest
set_prior("normal(1, 0.2)", coef = "targetfake") + # initial sensitivity (in the pretest)
set_prior("normal(0.5, 0.2)", coef = "timepost:targetfake") + # change in sensitivity in the posttest
set_prior("normal(0, 0.2)", dpar = "disc") +
set_prior("constant(0)", dpar = "disc", coef = "Intercept")
This is the model specification:
brm(formula = formula,
data = data,
prior = prior,
family = cumulative(link = "probit"),
cores = 10,
chains = 10,
iter = 2000,
init = 0,
control = list(adapt_delta = 0.95),
backend = "cmdstanr",
refresh = 10,
seed = 14)
This is the stan code generated with brms::make_stancode()
:
// generated with brms 2.21.0
functions {
/* compute correlated group-level effects
* Args:
* z: matrix of unscaled group-level effects
* SD: vector of standard deviation parameters
* L: cholesky factor correlation matrix
* Returns:
* matrix of scaled group-level effects
*/
matrix scale_r_cor(matrix z, vector SD, matrix L) {
// r is stored in another dimension order than z
return transpose(diag_pre_multiply(SD, L) * z);
}
/* cumulative-probit log-PDF for a single response
* Args:
* y: response category
* mu: latent mean parameter
* disc: discrimination parameter
* thres: ordinal thresholds
* Returns:
* a scalar to be added to the log posterior
*/
real cumulative_probit_lpmf(int y, real mu, real disc, vector thres) {
int nthres = num_elements(thres);
real p;
if (y == 1) {
p = Phi(disc * (thres[1] - mu));
} else if (y == nthres + 1) {
p = 1 - Phi(disc * (thres[nthres] - mu));
} else {
p = Phi(disc * (thres[y] - mu)) -
Phi(disc * (thres[y - 1] - mu));
}
return log(p);
}
/* cumulative-probit log-PDF for a single response and merged thresholds
* Args:
* y: response category
* mu: latent mean parameter
* disc: discrimination parameter
* thres: vector of merged ordinal thresholds
* j: start and end index for the applid threshold within 'thres'
* Returns:
* a scalar to be added to the log posterior
*/
real cumulative_probit_merged_lpmf(int y, real mu, real disc, vector thres, array[] int j) {
return cumulative_probit_lpmf(y | mu, disc, thres[j[1]:j[2]]);
}
/* ordered-probit log-PDF for a single response and merged thresholds
* Args:
* y: response category
* mu: latent mean parameter
* thres: vector of merged ordinal thresholds
* j: start and end index for the applid threshold within 'thres'
* Returns:
* a scalar to be added to the log posterior
*/
real ordered_probit_merged_lpmf(int y, real mu, vector thres, array[] int j) {
return ordered_probit_lpmf(y | mu, thres[j[1]:j[2]]);
}
}
data {
int<lower=1> N; // total number of observations
array[N] int Y; // response variable
int<lower=2> nthres; // number of thresholds
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
int<lower=1> Kc; // number of population-level effects after centering
int<lower=1> K_disc; // number of population-level effects
matrix[N, K_disc] X_disc; // population-level design matrix
// data for group-level effects of ID 1
int<lower=1> N_1; // number of grouping levels
int<lower=1> M_1; // number of coefficients per level
array[N] int<lower=1> J_1; // grouping indicator per observation
// group-level predictor values
vector[N] Z_1_1;
vector[N] Z_1_2;
vector[N] Z_1_3;
vector[N] Z_1_4;
int<lower=1> NC_1; // number of group-level correlations
// data for group-level effects of ID 2
int<lower=1> N_2; // number of grouping levels
int<lower=1> M_2; // number of coefficients per level
array[N] int<lower=1> J_2; // grouping indicator per observation
// group-level predictor values
vector[N] Z_2_1;
vector[N] Z_2_2;
vector[N] Z_2_3;
vector[N] Z_2_4;
int<lower=1> NC_2; // number of group-level correlations
// data for group-level effects of ID 3
int<lower=1> N_3; // number of grouping levels
int<lower=1> M_3; // number of coefficients per level
array[N] int<lower=1> J_3; // grouping indicator per observation
// group-level predictor values
vector[N] Z_3_disc_1;
vector[N] Z_3_disc_2;
vector[N] Z_3_disc_3;
int<lower=1> NC_3; // number of group-level correlations
// data for group-level effects of ID 4
int<lower=1> N_4; // number of grouping levels
int<lower=1> M_4; // number of coefficients per level
array[N] int<lower=1> J_4; // grouping indicator per observation
// group-level predictor values
vector[N] Z_4_disc_1;
vector[N] Z_4_disc_2;
vector[N] Z_4_disc_3;
int<lower=1> NC_4; // number of group-level correlations
int prior_only; // should the likelihood be ignored?
}
transformed data {
matrix[N, Kc] Xc; // centered version of X
vector[Kc] means_X; // column means of X before centering
for (i in 1:K) {
means_X[i] = mean(X[, i]);
Xc[, i] = X[, i] - means_X[i];
}
}
parameters {
vector[Kc] b; // regression coefficients
ordered[nthres] Intercept; // temporary thresholds for centered predictors
real par_b_disc_2;
real par_b_disc_3;
real par_b_disc_4;
vector<lower=0>[M_1] sd_1; // group-level standard deviations
matrix[M_1, N_1] z_1; // standardized group-level effects
cholesky_factor_corr[M_1] L_1; // cholesky factor of correlation matrix
vector<lower=0>[M_2] sd_2; // group-level standard deviations
matrix[M_2, N_2] z_2; // standardized group-level effects
cholesky_factor_corr[M_2] L_2; // cholesky factor of correlation matrix
vector<lower=0>[M_3] sd_3; // group-level standard deviations
matrix[M_3, N_3] z_3; // standardized group-level effects
cholesky_factor_corr[M_3] L_3; // cholesky factor of correlation matrix
vector<lower=0>[M_4] sd_4; // group-level standard deviations
matrix[M_4, N_4] z_4; // standardized group-level effects
cholesky_factor_corr[M_4] L_4; // cholesky factor of correlation matrix
}
transformed parameters {
vector[K_disc] b_disc; // regression coefficients
matrix[N_1, M_1] r_1; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_1] r_1_1;
vector[N_1] r_1_2;
vector[N_1] r_1_3;
vector[N_1] r_1_4;
matrix[N_2, M_2] r_2; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_2] r_2_1;
vector[N_2] r_2_2;
vector[N_2] r_2_3;
vector[N_2] r_2_4;
matrix[N_3, M_3] r_3; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_3] r_3_disc_1;
vector[N_3] r_3_disc_2;
vector[N_3] r_3_disc_3;
matrix[N_4, M_4] r_4; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_4] r_4_disc_1;
vector[N_4] r_4_disc_2;
vector[N_4] r_4_disc_3;
real lprior = 0; // prior contributions to the log posterior
b_disc[1] = 0;
b_disc[2] = par_b_disc_2;
b_disc[3] = par_b_disc_3;
b_disc[4] = par_b_disc_4;
// compute actual group-level effects
r_1 = scale_r_cor(z_1, sd_1, L_1);
r_1_1 = r_1[, 1];
r_1_2 = r_1[, 2];
r_1_3 = r_1[, 3];
r_1_4 = r_1[, 4];
// compute actual group-level effects
r_2 = scale_r_cor(z_2, sd_2, L_2);
r_2_1 = r_2[, 1];
r_2_2 = r_2[, 2];
r_2_3 = r_2[, 3];
r_2_4 = r_2[, 4];
// compute actual group-level effects
r_3 = scale_r_cor(z_3, sd_3, L_3);
r_3_disc_1 = r_3[, 1];
r_3_disc_2 = r_3[, 2];
r_3_disc_3 = r_3[, 3];
// compute actual group-level effects
r_4 = scale_r_cor(z_4, sd_4, L_4);
r_4_disc_1 = r_4[, 1];
r_4_disc_2 = r_4[, 2];
r_4_disc_3 = r_4[, 3];
lprior += normal_lpdf(b[1] | 0, 0.2);
lprior += normal_lpdf(b[2] | 1, 0.2);
lprior += normal_lpdf(b[3] | 0.5, 0.2);
lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
b_disc[1] = 0;
lprior += normal_lpdf(b_disc[2] | 0, 0.2);
lprior += normal_lpdf(b_disc[3] | 0, 0.2);
lprior += normal_lpdf(b_disc[4] | 0, 0.2);
lprior += student_t_lpdf(sd_1 | 3, 0, 2.5)
- 4 * student_t_lccdf(0 | 3, 0, 2.5);
lprior += lkj_corr_cholesky_lpdf(L_1 | 1);
lprior += student_t_lpdf(sd_2 | 3, 0, 2.5)
- 4 * student_t_lccdf(0 | 3, 0, 2.5);
lprior += lkj_corr_cholesky_lpdf(L_2 | 1);
lprior += student_t_lpdf(sd_3 | 3, 0, 2.5)
- 3 * student_t_lccdf(0 | 3, 0, 2.5);
lprior += lkj_corr_cholesky_lpdf(L_3 | 1);
lprior += student_t_lpdf(sd_4 | 3, 0, 2.5)
- 3 * student_t_lccdf(0 | 3, 0, 2.5);
lprior += lkj_corr_cholesky_lpdf(L_4 | 1);
}
model {
// likelihood including constants
if (!prior_only) {
// initialize linear predictor term
vector[N] mu = rep_vector(0.0, N);
// initialize linear predictor term
vector[N] disc = rep_vector(0.0, N);
mu += Xc * b;
disc += X_disc * b_disc;
for (n in 1:N) {
// add more terms to the linear predictor
mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_1_2[J_1[n]] * Z_1_2[n] + r_1_3[J_1[n]] * Z_1_3[n] + r_1_4[J_1[n]] * Z_1_4[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_2_2[J_2[n]] * Z_2_2[n] + r_2_3[J_2[n]] * Z_2_3[n] + r_2_4[J_2[n]] * Z_2_4[n];
}
for (n in 1:N) {
// add more terms to the linear predictor
disc[n] += r_3_disc_1[J_3[n]] * Z_3_disc_1[n] + r_3_disc_2[J_3[n]] * Z_3_disc_2[n] + r_3_disc_3[J_3[n]] * Z_3_disc_3[n] + r_4_disc_1[J_4[n]] * Z_4_disc_1[n] + r_4_disc_2[J_4[n]] * Z_4_disc_2[n] + r_4_disc_3[J_4[n]] * Z_4_disc_3[n];
}
disc = exp(disc);
for (n in 1:N) {
target += cumulative_probit_lpmf(Y[n] | mu[n], disc[n], Intercept);
}
}
// priors including constants
target += lprior;
target += std_normal_lpdf(to_vector(z_1));
target += std_normal_lpdf(to_vector(z_2));
target += std_normal_lpdf(to_vector(z_3));
target += std_normal_lpdf(to_vector(z_4));
}
generated quantities {
// compute actual thresholds
vector[nthres] b_Intercept = Intercept + dot_product(means_X, b);
// compute group-level correlations
corr_matrix[M_1] Cor_1 = multiply_lower_tri_self_transpose(L_1);
vector<lower=-1,upper=1>[NC_1] cor_1;
// compute group-level correlations
corr_matrix[M_2] Cor_2 = multiply_lower_tri_self_transpose(L_2);
vector<lower=-1,upper=1>[NC_2] cor_2;
// compute group-level correlations
corr_matrix[M_3] Cor_3 = multiply_lower_tri_self_transpose(L_3);
vector<lower=-1,upper=1>[NC_3] cor_3;
// compute group-level correlations
corr_matrix[M_4] Cor_4 = multiply_lower_tri_self_transpose(L_4);
vector<lower=-1,upper=1>[NC_4] cor_4;
// extract upper diagonal of correlation matrix
for (k in 1:M_1) {
for (j in 1:(k - 1)) {
cor_1[choose(k - 1, 2) + j] = Cor_1[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_2) {
for (j in 1:(k - 1)) {
cor_2[choose(k - 1, 2) + j] = Cor_2[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_3) {
for (j in 1:(k - 1)) {
cor_3[choose(k - 1, 2) + j] = Cor_3[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_4) {
for (j in 1:(k - 1)) {
cor_4[choose(k - 1, 2) + j] = Cor_4[j, k];
}
}
}
For some reason, I get horrible convergence espicially with sd parameters involving the timepost cofficient within the id level. Here is an output:
One thing I’ve already ruled out is missing values, but every id in the data had observations both pretes and posttest, as well as true items and fake items.
Any ideas as to what can cause that?
Thank you very much in advance!