I’ve got an hierarchical ordered logistic regression and I want to use the R2D2M2 prior on it because I have 23 predictors and 5 levels. I’ve been using slightly modified brms generated stan code that I’m running with Rstan. The code is modified to incorporate the induced dirichlet prior as described in the solution to this post provided by @StaffanBetner. Long story short, the model seems to run fine at first glance (no divergent transitions) but some of the chains keep randomly collapsing because the cut-points in the ordered_logistic_lpmf() function are equal or infinite. For instance :
[1] "Chain 1: Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:"
[2] "Chain 1: Exception: ordered_logistic: Final cut-point is inf, but must be finite! (in 'anon_model', line 263, column 6 to column 63)"
[3] "Chain 1: If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,"
[4] "Chain 1: but if this warning occurs often then your model may be either severely ill-conditioned or misspecified."
[5] "Chain 1: "
[6] "Error : Exception: modelfbb557d5e7db__namespace::write_array: Cor_1 is not positive definite. (in 'anon_model', line 276, column 2 to column 66)"
I know the R2D2M2 prior is super recent (proposed by @javier.aguilar and @paul.buerkner 2 years ago, here’s a link) so I’m not sure if brms supports using it for hierarchical ordered logistic regression yet. It is also a very new concept to me so I may have made a few mistakes modifying the stan code generated by brms. Any help would be greatly appreciated! Here’s the raw stan code I’m using :
// generated with brms 2.21.0
functions {
/* Efficient computation of the horseshoe scale parameters
* see Appendix C.1 in https://projecteuclid.org/euclid.ejs/1513306866
* Args:
* lambda: local shrinkage parameters
* tau: global shrinkage parameter
* c2: slap regularization parameter
* Returns:
* scale parameter vector of the horseshoe prior
*/
vector scales_horseshoe(vector lambda, real tau, real c2) {
int K = rows(lambda);
vector[K] lambda2 = square(lambda);
vector[K] lambda_tilde = sqrt(c2 * lambda2 ./ (c2 + tau^2 * lambda2));
return lambda_tilde * tau;
}
/* compute scale parameters of the R2D2 prior
* Args:
* phi: local weight parameters
* tau2: global scale parameter
* Returns:
* scale parameter vector of the R2D2 prior
*/
vector scales_R2D2(vector phi, real tau2) {
return sqrt(phi * tau2);
}
/* compute correlated group-level effects
* Args:
* z: matrix of unscaled group-level effects
* SD: vector of standard deviation parameters
* L: cholesky factor correlation matrix
* Returns:
* matrix of scaled group-level effects
*/
matrix scale_r_cor(matrix z, vector SD, matrix L) {
// r is stored in another dimension order than z
return transpose(diag_pre_multiply(SD, L) * z);
}
/* cumulative-logit log-PDF for a single response
* Args:
* y: response category
* mu: latent mean parameter
* disc: discrimination parameter
* thres: ordinal thresholds
* Returns:
* a scalar to be added to the log posterior
*/
real cumulative_logit_lpmf(int y, real mu, real disc, vector thres) {
int nthres = num_elements(thres);
if (y == 1) {
return log_inv_logit(disc * (thres[1] - mu));
} else if (y == nthres + 1) {
return log1m_inv_logit(disc * (thres[nthres] - mu));
} else {
return log_inv_logit_diff(disc * (thres[y] - mu), disc * (thres[y - 1] - mu));
}
}
/* cumulative-logit log-PDF for a single response and merged thresholds
* Args:
* y: response category
* mu: latent mean parameter
* disc: discrimination parameter
* thres: vector of merged ordinal thresholds
* j: start and end index for the applid threshold within 'thres'
* Returns:
* a scalar to be added to the log posterior
*/
real cumulative_logit_merged_lpmf(int y, real mu, real disc, vector thres, array[] int j) {
return cumulative_logit_lpmf(y | mu, disc, thres[j[1]:j[2]]);
}
/* ordered-logistic log-PDF for a single response and merged thresholds
* Args:
* y: response category
* mu: latent mean parameter
* thres: vector of merged ordinal thresholds
* j: start and end index for the applid threshold within 'thres'
* Returns:
* a scalar to be added to the log posterior
*/
real ordered_logistic_merged_lpmf(int y, real mu, vector thres, array[] int j) {
return ordered_logistic_lpmf(y | mu, thres[j[1]:j[2]]);
}
real induced_dirichlet_lpdf(vector c, vector alpha, real phi) {
int K = num_elements(c) + 1;
vector[K - 1] sigma = inv_logit(phi - c);
vector[K] p;
matrix[K, K] J = rep_matrix(0, K, K);
// Induced ordinal probabilities
p[1] = 1 - sigma[1];
for (k in 2:(K - 1))
p[k] = sigma[k - 1] - sigma[k];
p[K] = sigma[K - 1];
// Baseline column of Jacobian
for (k in 1:K) J[k, 1] = 1;
// Diagonal entries of Jacobian
for (k in 2:K) {
real rho = sigma[k - 1] * (1 - sigma[k - 1]);
J[k, k] = - rho;
J[k - 1, k] = rho;
}
return dirichlet_lpdf(p | alpha)
+ log_determinant(J);
}
vector induced_dirichlet_rng(vector alpha, real phi) {
int K = num_elements(alpha);
vector[K - 1] c;
vector[K] p = dirichlet_rng(alpha);
c[1] = phi - logit(1 - p[1]);
for (k in 2:(K - 1))
c[k] = phi - logit(inv_logit(phi - c[k - 1]) - p[k]);
return c;
}
}
data {
int<lower=1> N; // total number of observations
array[N] int Y; // response variable
int<lower=2> nthres; // number of thresholds
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
int<lower=1> Kc; // number of population-level effects after centering
int<lower=1> Kscales; // number of local scale parameters
// data for the R2D2 prior
real<lower=0> R2D2_mean_R2; // mean of the R2 prior
real<lower=0> R2D2_prec_R2; // precision of the R2 prior
// concentration vector of the D2 prior
vector<lower=0>[Kscales] R2D2_cons_D2;
// data for group-level effects of ID 1
int<lower=1> N_1; // number of grouping levels
int<lower=1> M_1; // number of coefficients per level
array[N] int<lower=1> J_1; // grouping indicator per observation
// group-level predictor values
vector[N] Z_1_1;
vector[N] Z_1_2;
vector[N] Z_1_3;
vector[N] Z_1_4;
vector[N] Z_1_5;
vector[N] Z_1_6;
vector[N] Z_1_7;
vector[N] Z_1_8;
vector[N] Z_1_9;
vector[N] Z_1_10;
vector[N] Z_1_11;
vector[N] Z_1_12;
vector[N] Z_1_13;
vector[N] Z_1_14;
vector[N] Z_1_15;
vector[N] Z_1_16;
vector[N] Z_1_17;
vector[N] Z_1_18;
vector[N] Z_1_19;
vector[N] Z_1_20;
vector[N] Z_1_21;
vector[N] Z_1_22;
vector[N] Z_1_23;
int<lower=1> NC_1; // number of group-level correlations
int prior_only; // should the likelihood be ignored?
}
transformed data {
matrix[N, Kc] Xc; // centered version of X
vector[Kc] means_X; // column means of X before centering
for (i in 1:K) {
means_X[i] = mean(X[, i]);
Xc[, i] = X[, i] - means_X[i];
}
}
parameters {
vector[Kc] zb; // unscaled coefficients
ordered[nthres] Intercept; // temporary thresholds for centered predictors
// parameters of the R2D2 prior
real<lower=0,upper=1> R2D2_R2;
simplex[Kscales] R2D2_phi;
matrix[M_1, N_1] z_1; // standardized group-level effects
cholesky_factor_corr[M_1] L_1; // cholesky factor of correlation matrix
}
transformed parameters {
vector[Kc] b; // scaled coefficients
vector<lower=0>[Kc] sdb; // SDs of the coefficients
real R2D2_tau2; // global R2D2 scale parameter
vector<lower=0>[Kscales] scales; // local R2D2 scale parameters
real disc = 1; // discrimination parameters
vector<lower=0>[M_1] sd_1; // group-level standard deviations
matrix[N_1, M_1] r_1; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_1] r_1_1;
vector[N_1] r_1_2;
vector[N_1] r_1_3;
vector[N_1] r_1_4;
vector[N_1] r_1_5;
vector[N_1] r_1_6;
vector[N_1] r_1_7;
vector[N_1] r_1_8;
vector[N_1] r_1_9;
vector[N_1] r_1_10;
vector[N_1] r_1_11;
vector[N_1] r_1_12;
vector[N_1] r_1_13;
vector[N_1] r_1_14;
vector[N_1] r_1_15;
vector[N_1] r_1_16;
vector[N_1] r_1_17;
vector[N_1] r_1_18;
vector[N_1] r_1_19;
vector[N_1] r_1_20;
vector[N_1] r_1_21;
vector[N_1] r_1_22;
vector[N_1] r_1_23;
real lprior = 0; // prior contributions to the log posterior
// compute R2D2 scale parameters
R2D2_tau2 = R2D2_R2 / (1 - R2D2_R2);
scales = scales_R2D2(R2D2_phi, R2D2_tau2);
sdb = scales[(1):(Kc)];
sd_1 = scales[(1+Kc):(Kc+M_1)];
b = zb .* sdb; // scale coefficients
// compute actual group-level effects
r_1 = scale_r_cor(z_1, sd_1, L_1);
r_1_1 = r_1[, 1];
r_1_2 = r_1[, 2];
r_1_3 = r_1[, 3];
r_1_4 = r_1[, 4];
r_1_5 = r_1[, 5];
r_1_6 = r_1[, 6];
r_1_7 = r_1[, 7];
r_1_8 = r_1[, 8];
r_1_9 = r_1[, 9];
r_1_10 = r_1[, 10];
r_1_11 = r_1[, 11];
r_1_12 = r_1[, 12];
r_1_13 = r_1[, 13];
r_1_14 = r_1[, 14];
r_1_15 = r_1[, 15];
r_1_16 = r_1[, 16];
r_1_17 = r_1[, 17];
r_1_18 = r_1[, 18];
r_1_19 = r_1[, 19];
r_1_20 = r_1[, 20];
r_1_21 = r_1[, 21];
r_1_22 = r_1[, 22];
r_1_23 = r_1[, 23];
lprior += induced_dirichlet_lpdf(Intercept | rep_vector(2, nthres+1), 0);
lprior += beta_lpdf(R2D2_R2 | R2D2_mean_R2 * R2D2_prec_R2, (1 - R2D2_mean_R2) * R2D2_prec_R2);
lprior += lkj_corr_cholesky_lpdf(L_1 | 1);
}
model {
// likelihood including constants
if (!prior_only) {
// initialize linear predictor term
vector[N] mu = rep_vector(0.0, N);
mu += Xc * b;
for (n in 1:N) {
// add more terms to the linear predictor
mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_1_2[J_1[n]] * Z_1_2[n] + r_1_3[J_1[n]] * Z_1_3[n] + r_1_4[J_1[n]] * Z_1_4[n] + r_1_5[J_1[n]] * Z_1_5[n] + r_1_6[J_1[n]] * Z_1_6[n] + r_1_7[J_1[n]] * Z_1_7[n] + r_1_8[J_1[n]] * Z_1_8[n] + r_1_9[J_1[n]] * Z_1_9[n] + r_1_10[J_1[n]] * Z_1_10[n] + r_1_11[J_1[n]] * Z_1_11[n] + r_1_12[J_1[n]] * Z_1_12[n] + r_1_13[J_1[n]] * Z_1_13[n] + r_1_14[J_1[n]] * Z_1_14[n] + r_1_15[J_1[n]] * Z_1_15[n] + r_1_16[J_1[n]] * Z_1_16[n] + r_1_17[J_1[n]] * Z_1_17[n] + r_1_18[J_1[n]] * Z_1_18[n] + r_1_19[J_1[n]] * Z_1_19[n] + r_1_20[J_1[n]] * Z_1_20[n] + r_1_21[J_1[n]] * Z_1_21[n] + r_1_22[J_1[n]] * Z_1_22[n] + r_1_23[J_1[n]] * Z_1_23[n];
}
for (n in 1:N) {
target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
}
}
// priors including constants
target += lprior;
target += std_normal_lpdf(zb);
target += dirichlet_lpdf(R2D2_phi | R2D2_cons_D2);
target += std_normal_lpdf(to_vector(z_1));
}
generated quantities {
// compute actual thresholds
vector[nthres] b_Intercept = Intercept + dot_product(means_X, b);
// compute group-level correlations
corr_matrix[M_1] Cor_1 = multiply_lower_tri_self_transpose(L_1);
vector<lower=-1,upper=1>[NC_1] cor_1;
// additionally sample draws from priors
ordered[nthres] prior_Intercept = induced_dirichlet_rng(rep_vector(1,nthres+1), 0);
real prior_R2D2_R2 = beta_rng(R2D2_mean_R2*R2D2_prec_R2,(1-R2D2_mean_R2)*R2D2_prec_R2);
real prior_cor_1 = lkj_corr_rng(M_1,1)[1, 2];
// extract upper diagonal of correlation matrix
for (k in 1:M_1) {
for (j in 1:(k - 1)) {
cor_1[choose(k - 1, 2) + j] = Cor_1[j, k];
}
}
}