I’ve got a hierarchical ordered logistic regression and I want to use the R2D2M2 prior on it because I have 23 predictors and 5 levels. I’ve been using slightly modified brms-generated Stan code that I’m running with RStan. The code is modified to incorporate the induced Dirichlet prior as described in the solution to this post provided by @StaffanBetner. Long story short, the model seems to run fine at first glance (no divergent transitions), but some of the chains keep randomly collapsing because the cut-points in the ordered_logistic_lpmf() function are equal or infinite. For instance:
[1] "Chain 1: Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:"
[2] "Chain 1: Exception: ordered_logistic: Final cut-point is inf, but must be finite! (in 'anon_model', line 263, column 6 to column 63)"
[3] "Chain 1: If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,"
[4] "Chain 1: but if this warning occurs often then your model may be either severely ill-conditioned or misspecified."
[5] "Chain 1: "
[6] "Error : Exception: modelfbb557d5e7db__namespace::write_array: Cor_1 is not positive definite. (in 'anon_model', line 276, column 2 to column 66)"
I know the R2D2M2 prior is quite recent (proposed by @javier.aguilar and @paul.buerkner 2 years ago; here’s a link), so I’m not sure whether brms supports using it for hierarchical ordered logistic regression yet. It is also a very new concept to me, so I may have made a few mistakes when modifying the Stan code generated by brms. Any help would be greatly appreciated! Here’s the raw Stan code I’m using:
// generated with brms 2.21.0
functions {
/* Efficient computation of the horseshoe scale parameters
* see Appendix C.1 in https://projecteuclid.org/euclid.ejs/1513306866
* Args:
* lambda: local shrinkage parameters
* tau: global shrinkage parameter
* c2: slap regularization parameter
* Returns:
* scale parameter vector of the horseshoe prior
*/
vector scales_horseshoe(vector lambda, real tau, real c2) {
int K = rows(lambda);
vector[K] lambda2 = square(lambda);
vector[K] lambda_tilde = sqrt(c2 * lambda2 ./ (c2 + tau^2 * lambda2));
return lambda_tilde * tau;
}
/* compute scale parameters of the R2D2 prior
* Args:
* phi: local weight parameters
* tau2: global scale parameter
* Returns:
* scale parameter vector of the R2D2 prior
*/
vector scales_R2D2(vector phi, real tau2) {
return sqrt(phi * tau2);
}
/* compute correlated group-level effects
* Args:
* z: matrix of unscaled group-level effects
* SD: vector of standard deviation parameters
* L: cholesky factor correlation matrix
* Returns:
* matrix of scaled group-level effects
*/
matrix scale_r_cor(matrix z, vector SD, matrix L) {
// r is stored in another dimension order than z
return transpose(diag_pre_multiply(SD, L) * z);
}
/* cumulative-logit log-PDF for a single response
* Args:
* y: response category
* mu: latent mean parameter
* disc: discrimination parameter
* thres: ordinal thresholds
* Returns:
* a scalar to be added to the log posterior
*/
real cumulative_logit_lpmf(int y, real mu, real disc, vector thres) {
int nthres = num_elements(thres);
if (y == 1) {
return log_inv_logit(disc * (thres[1] - mu));
} else if (y == nthres + 1) {
return log1m_inv_logit(disc * (thres[nthres] - mu));
} else {
return log_inv_logit_diff(disc * (thres[y] - mu), disc * (thres[y - 1] - mu));
}
}
/* cumulative-logit log-PDF for a single response and merged thresholds
* Args:
* y: response category
* mu: latent mean parameter
* disc: discrimination parameter
* thres: vector of merged ordinal thresholds
* j: start and end index for the applid threshold within 'thres'
* Returns:
* a scalar to be added to the log posterior
*/
real cumulative_logit_merged_lpmf(int y, real mu, real disc, vector thres, array[] int j) {
return cumulative_logit_lpmf(y | mu, disc, thres[j[1]:j[2]]);
}
/* ordered-logistic log-PDF for a single response and merged thresholds
* Args:
* y: response category
* mu: latent mean parameter
* thres: vector of merged ordinal thresholds
* j: start and end index for the applid threshold within 'thres'
* Returns:
* a scalar to be added to the log posterior
*/
real ordered_logistic_merged_lpmf(int y, real mu, vector thres, array[] int j) {
return ordered_logistic_lpmf(y | mu, thres[j[1]:j[2]]);
}
  /* induced Dirichlet prior on ordinal cut-points
   * Places a Dirichlet(alpha) prior on the K category probabilities
   * implied by the K-1 cut-points 'c' and anchor 'phi', adding the
   * log-Jacobian of the cut-point -> probability map for the change
   * of variables (Betancourt's ordinal regression case study).
   * Args:
   *   c: ordered cut-points (length K - 1)
   *   alpha: Dirichlet concentration vector (length K)
   *   phi: anchor point on the latent scale (typically 0)
   * Returns:
   *   log density of c under the induced Dirichlet prior
   */
  real induced_dirichlet_lpdf(vector c, vector alpha, real phi) {
    int K = num_elements(c) + 1;
    // sigma[k] = P(latent > c[k]) given location phi
    vector[K - 1] sigma = inv_logit(phi - c);
    vector[K] p;
    matrix[K, K] J = rep_matrix(0, K, K);
    // Induced ordinal probabilities (differences of survivor probs)
    p[1] = 1 - sigma[1];
    for (k in 2:(K - 1))
      p[k] = sigma[k - 1] - sigma[k];
    p[K] = sigma[K - 1];
    // Baseline column of Jacobian
    for (k in 1:K) J[k, 1] = 1;
    // Diagonal and superdiagonal entries of Jacobian
    // rho = d sigma[k-1] / d c[k-1] (logistic density at the cut-point)
    for (k in 2:K) {
      real rho = sigma[k - 1] * (1 - sigma[k - 1]);
      J[k, k] = - rho;
      J[k - 1, k] = rho;
    }
    return dirichlet_lpdf(p | alpha)
           + log_determinant(J);
  }
  /* draw ordinal cut-points from the induced Dirichlet prior
   * Inverts the cut-point -> probability map used in
   * induced_dirichlet_lpdf: draws category probabilities from
   * Dirichlet(alpha), then converts them back to ordered cut-points.
   * Args:
   *   alpha: Dirichlet concentration vector (length = #categories K)
   *   phi: anchor point on the latent scale (typically 0)
   * Returns:
   *   vector of K - 1 ordered cut-points
   */
  vector induced_dirichlet_rng(vector alpha, real phi) {
    int K = num_elements(alpha);
    vector[K - 1] c;
    vector[K] p = dirichlet_rng(alpha);
    // invert p[1] = 1 - inv_logit(phi - c[1])
    c[1] = phi - logit(1 - p[1]);
    // invert p[k] = inv_logit(phi - c[k-1]) - inv_logit(phi - c[k])
    for (k in 2:(K - 1))
      c[k] = phi - logit(inv_logit(phi - c[k - 1]) - p[k]);
    return c;
  }
}
data {
  int<lower=1> N; // total number of observations
  array[N] int Y; // response variable (ordinal categories)
  int<lower=2> nthres; // number of thresholds (categories - 1)
  int<lower=1> K; // number of population-level effects
  matrix[N, K] X; // population-level design matrix
  int<lower=1> Kc; // number of population-level effects after centering
  // NOTE(review): the slicing in transformed parameters assumes
  // Kscales == Kc + M_1 — TODO confirm in the data passed from R
  int<lower=1> Kscales; // number of local scale parameters
  // data for the R2D2 prior
  real<lower=0> R2D2_mean_R2; // mean of the R2 prior
  real<lower=0> R2D2_prec_R2; // precision of the R2 prior
  // concentration vector of the D2 prior
  vector<lower=0>[Kscales] R2D2_cons_D2;
  // data for group-level effects of ID 1
  int<lower=1> N_1; // number of grouping levels
  int<lower=1> M_1; // number of coefficients per level
  array[N] int<lower=1> J_1; // grouping indicator per observation
  // group-level predictor values (one vector per varying coefficient)
  vector[N] Z_1_1;
  vector[N] Z_1_2;
  vector[N] Z_1_3;
  vector[N] Z_1_4;
  vector[N] Z_1_5;
  vector[N] Z_1_6;
  vector[N] Z_1_7;
  vector[N] Z_1_8;
  vector[N] Z_1_9;
  vector[N] Z_1_10;
  vector[N] Z_1_11;
  vector[N] Z_1_12;
  vector[N] Z_1_13;
  vector[N] Z_1_14;
  vector[N] Z_1_15;
  vector[N] Z_1_16;
  vector[N] Z_1_17;
  vector[N] Z_1_18;
  vector[N] Z_1_19;
  vector[N] Z_1_20;
  vector[N] Z_1_21;
  vector[N] Z_1_22;
  vector[N] Z_1_23;
  int<lower=1> NC_1; // number of group-level correlations
  int prior_only; // should the likelihood be ignored?
}
transformed data {
  matrix[N, Kc] Xc; // centered version of X
  vector[Kc] means_X; // column means of X before centering
  // FIX: loop over Kc (the number of centered columns) instead of K.
  // means_X and Xc have only Kc columns, so indexing with i in 1:K is
  // out of bounds whenever Kc < K. For ordinal models brms sets
  // Kc == K, in which case this change does not alter behavior.
  for (i in 1:Kc) {
    means_X[i] = mean(X[, i]);
    Xc[, i] = X[, i] - means_X[i];
  }
}
parameters {
  vector[Kc] zb; // unscaled (standardized) coefficients
  ordered[nthres] Intercept; // temporary thresholds for centered predictors
  // parameters of the R2D2 prior
  real<lower=0,upper=1> R2D2_R2; // proportion of explained variance R^2
  simplex[Kscales] R2D2_phi; // allocation of R^2 across the local scales
  matrix[M_1, N_1] z_1; // standardized group-level effects
  cholesky_factor_corr[M_1] L_1; // cholesky factor of correlation matrix
}
transformed parameters {
  vector[Kc] b; // scaled coefficients
  vector<lower=0>[Kc] sdb; // SDs of the coefficients
  real R2D2_tau2; // global R2D2 scale parameter
  vector<lower=0>[Kscales] scales; // local R2D2 scale parameters
  real disc = 1; // discrimination parameter (fixed at 1)
  vector<lower=0>[M_1] sd_1; // group-level standard deviations
  matrix[N_1, M_1] r_1; // actual group-level effects
  // using vectors speeds up indexing in loops
  vector[N_1] r_1_1;
  vector[N_1] r_1_2;
  vector[N_1] r_1_3;
  vector[N_1] r_1_4;
  vector[N_1] r_1_5;
  vector[N_1] r_1_6;
  vector[N_1] r_1_7;
  vector[N_1] r_1_8;
  vector[N_1] r_1_9;
  vector[N_1] r_1_10;
  vector[N_1] r_1_11;
  vector[N_1] r_1_12;
  vector[N_1] r_1_13;
  vector[N_1] r_1_14;
  vector[N_1] r_1_15;
  vector[N_1] r_1_16;
  vector[N_1] r_1_17;
  vector[N_1] r_1_18;
  vector[N_1] r_1_19;
  vector[N_1] r_1_20;
  vector[N_1] r_1_21;
  vector[N_1] r_1_22;
  vector[N_1] r_1_23;
  real lprior = 0; // prior contributions to the log posterior
  // compute R2D2 scale parameters
  // NOTE(review): R2/(1 - R2) overflows to inf as R2D2_R2 -> 1, which
  // propagates infinite scales into sdb/sd_1 and can trigger the
  // non-finite rejections seen at runtime — consider a more
  // informative beta prior on R2D2_R2 if this persists; TODO confirm
  R2D2_tau2 = R2D2_R2 / (1 - R2D2_R2);
  scales = scales_R2D2(R2D2_phi, R2D2_tau2);
  // partition the scales: first Kc for coefficients, next M_1 for
  // group-level SDs (assumes Kscales == Kc + M_1)
  sdb = scales[(1):(Kc)];
  sd_1 = scales[(1+Kc):(Kc+M_1)];
  b = zb .* sdb; // scale coefficients
  // compute actual group-level effects
  r_1 = scale_r_cor(z_1, sd_1, L_1);
  r_1_1 = r_1[, 1];
  r_1_2 = r_1[, 2];
  r_1_3 = r_1[, 3];
  r_1_4 = r_1[, 4];
  r_1_5 = r_1[, 5];
  r_1_6 = r_1[, 6];
  r_1_7 = r_1[, 7];
  r_1_8 = r_1[, 8];
  r_1_9 = r_1[, 9];
  r_1_10 = r_1[, 10];
  r_1_11 = r_1[, 11];
  r_1_12 = r_1[, 12];
  r_1_13 = r_1[, 13];
  r_1_14 = r_1[, 14];
  r_1_15 = r_1[, 15];
  r_1_16 = r_1[, 16];
  r_1_17 = r_1[, 17];
  r_1_18 = r_1[, 18];
  r_1_19 = r_1[, 19];
  r_1_20 = r_1[, 20];
  r_1_21 = r_1[, 21];
  r_1_22 = r_1[, 22];
  r_1_23 = r_1[, 23];
  // induced Dirichlet prior (concentration 2, anchor 0) on the thresholds
  lprior += induced_dirichlet_lpdf(Intercept | rep_vector(2, nthres+1), 0);
  // beta prior on R^2 parameterized by mean and precision
  lprior += beta_lpdf(R2D2_R2 | R2D2_mean_R2 * R2D2_prec_R2, (1 - R2D2_mean_R2) * R2D2_prec_R2);
  // LKJ(1) prior (uniform over correlation matrices)
  lprior += lkj_corr_cholesky_lpdf(L_1 | 1);
}
model {
  // likelihood including constants
  if (!prior_only) {
    // initialize linear predictor term
    vector[N] mu = rep_vector(0.0, N);
    // population-level (fixed) effects on the centered design matrix
    mu += Xc * b;
    for (n in 1:N) {
      // add group-level (varying) effects to the linear predictor
      mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_1_2[J_1[n]] * Z_1_2[n] + r_1_3[J_1[n]] * Z_1_3[n] + r_1_4[J_1[n]] * Z_1_4[n] + r_1_5[J_1[n]] * Z_1_5[n] + r_1_6[J_1[n]] * Z_1_6[n] + r_1_7[J_1[n]] * Z_1_7[n] + r_1_8[J_1[n]] * Z_1_8[n] + r_1_9[J_1[n]] * Z_1_9[n] + r_1_10[J_1[n]] * Z_1_10[n] + r_1_11[J_1[n]] * Z_1_11[n] + r_1_12[J_1[n]] * Z_1_12[n] + r_1_13[J_1[n]] * Z_1_13[n] + r_1_14[J_1[n]] * Z_1_14[n] + r_1_15[J_1[n]] * Z_1_15[n] + r_1_16[J_1[n]] * Z_1_16[n] + r_1_17[J_1[n]] * Z_1_17[n] + r_1_18[J_1[n]] * Z_1_18[n] + r_1_19[J_1[n]] * Z_1_19[n] + r_1_20[J_1[n]] * Z_1_20[n] + r_1_21[J_1[n]] * Z_1_21[n] + r_1_22[J_1[n]] * Z_1_22[n] + r_1_23[J_1[n]] * Z_1_23[n];
    }
    // ordinal likelihood; disc is fixed at 1, so the thresholds
    // (Intercept) are used directly as the cut-points
    for (n in 1:N) {
      target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
    }
  }
  // priors including constants (accumulated in transformed parameters)
  target += lprior;
  target += std_normal_lpdf(zb);
  // D2 component: Dirichlet prior on the variance allocation simplex
  target += dirichlet_lpdf(R2D2_phi | R2D2_cons_D2);
  target += std_normal_lpdf(to_vector(z_1));
}
generated quantities {
  // compute actual thresholds on the original (uncentered) predictor scale
  vector[nthres] b_Intercept = Intercept + dot_product(means_X, b);
  // recover the group-level correlation matrix from its Cholesky factor
  corr_matrix[M_1] Cor_1 = multiply_lower_tri_self_transpose(L_1);
  vector<lower=-1,upper=1>[NC_1] cor_1;
  // additionally sample draws from priors
  // FIX: use concentration 2, matching the induced_dirichlet_lpdf prior
  // applied to Intercept in the transformed parameters block; the
  // original drew from rep_vector(1, ...), i.e. a different prior than
  // the one actually imposed, making prior/posterior comparisons invalid
  ordered[nthres] prior_Intercept = induced_dirichlet_rng(rep_vector(2, nthres + 1), 0);
  real prior_R2D2_R2 = beta_rng(R2D2_mean_R2 * R2D2_prec_R2, (1 - R2D2_mean_R2) * R2D2_prec_R2);
  real prior_cor_1 = lkj_corr_rng(M_1, 1)[1, 2];
  // extract upper diagonal of correlation matrix (column-major order)
  for (k in 1:M_1) {
    for (j in 1:(k - 1)) {
      cor_1[choose(k - 1, 2) + j] = Cor_1[j, k];
    }
  }
}