I have a relatively large data set (~9000 observations) and am fitting a hierarchical Student model with several correlations. The model initially took a little over a week for 4 chains of 6500 iterations. I’m attempting to speed things up with some efficiency if possible. I used brms to generate some initial stan code and have done some vectorization.
brms syntax:
Mod_fmla <- bf(RECD ~ 0 + Intercept +
AvRECD.c +
Age_m.c +
Pressure.c +
Imped.c +
Volume.c +
factor(Frequency) +
factor(Frequency):Pressure.c +
factor(Frequency):Imped.c +
factor(Frequency):Volume.c +
(1 + factor(Frequency)|ME/Ear/SubjID) +
(1|test_year),
center = TRUE) +
lf(sigma ~ 0 + (1|ME), cmc = FALSE) +
lf(nu ~ 0 + (1|ME), cmc = FALSE)
Stan model:
"functions {
/* compute correlated group-level effects
* Args:
* z: matrix of unscaled group-level effects
* SD: vector of standard deviation parameters
* L: cholesky factor correlation matrix
* Returns:
* matrix of scaled group-level effects
*/
matrix scale_r_cor(matrix z, vector SD, matrix L) {
// r is stored in another dimension order than z
return transpose(diag_pre_multiply(SD, L) * z);
}
/* compute the logm1 link
* Args:
* p: a positive scalar
* Returns:
* a scalar in (-Inf, Inf)
*/
real logm1(real y) {
return log(y - 1);
}
/* compute the inverse of the logm1 link
* Args:
* y: a scalar in (-Inf, Inf)
* Returns:
* a positive scalar
*/
real expp1(real y) {
return exp(y) + 1;
}
}
data {
int<lower=1> N; // total number of observations
vector[N] Y; // response variable
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
// data for group-level effects of ID 1
int<lower=1> N_1; // number of grouping levels
int<lower=1> M_1; // number of coefficients per level
int<lower=1> J_1[N]; // grouping indicator per observation
// group-level predictor values
vector[N] Z_1_1;
vector[N] Z_1_2;
vector[N] Z_1_3;
vector[N] Z_1_4;
vector[N] Z_1_5;
vector[N] Z_1_6;
vector[N] Z_1_7;
vector[N] Z_1_8;
vector[N] Z_1_9;
int<lower=1> NC_1; // number of group-level correlations
// data for group-level effects of ID 2
int<lower=1> N_2; // number of grouping levels
int<lower=1> M_2; // number of coefficients per level
int<lower=1> J_2[N]; // grouping indicator per observation
// group-level predictor values
vector[N] Z_2_1;
vector[N] Z_2_2;
vector[N] Z_2_3;
vector[N] Z_2_4;
vector[N] Z_2_5;
vector[N] Z_2_6;
vector[N] Z_2_7;
vector[N] Z_2_8;
vector[N] Z_2_9;
int<lower=1> NC_2; // number of group-level correlations
// data for group-level effects of ID 3
int<lower=1> N_3; // number of grouping levels
int<lower=1> M_3; // number of coefficients per level
int<lower=1> J_3[N]; // grouping indicator per observation
// group-level predictor values
vector[N] Z_3_1;
vector[N] Z_3_2;
vector[N] Z_3_3;
vector[N] Z_3_4;
vector[N] Z_3_5;
vector[N] Z_3_6;
vector[N] Z_3_7;
vector[N] Z_3_8;
vector[N] Z_3_9;
int<lower=1> NC_3; // number of group-level correlations
// data for group-level effects of ID 4
int<lower=1> N_4; // number of grouping levels
int<lower=1> M_4; // number of coefficients per level
int<lower=1> J_4[N]; // grouping indicator per observation
// group-level predictor values
vector[N] Z_4_1;
// data for group-level effects of ID 5
int<lower=1> N_5; // number of grouping levels
int<lower=1> M_5; // number of coefficients per level
int<lower=1> J_5[N]; // grouping indicator per observation
// group-level predictor values
vector[N] Z_5_sigma_1;
// data for group-level effects of ID 6
int<lower=1> N_6; // number of grouping levels
int<lower=1> M_6; // number of coefficients per level
int<lower=1> J_6[N]; // grouping indicator per observation
// group-level predictor values
vector[N] Z_6_nu_1;
int prior_only; // should the likelihood be ignored?
}
transformed data {
real norm18 = 18 * normal_lccdf(0 | 1,1);
real norm10 = 10 * normal_lccdf(0 | 0,0.5);
real norm0_1 = 1 * normal_lccdf(0 | 0,0.1);
real norm1_5 = 1 * normal_lccdf(0 | 0,1.5);
}
parameters {
vector[K] b; // population-level effects
vector<lower=0>[M_1] sd_1; // group-level standard deviations
matrix[M_1, N_1] z_1; // standardized group-level effects
cholesky_factor_corr[M_1] L_1; // cholesky factor of correlation matrix
vector<lower=0>[M_2] sd_2; // group-level standard deviations
matrix[M_2, N_2] z_2; // standardized group-level effects
cholesky_factor_corr[M_2] L_2; // cholesky factor of correlation matrix
vector<lower=0>[M_3] sd_3; // group-level standard deviations
matrix[M_3, N_3] z_3; // standardized group-level effects
cholesky_factor_corr[M_3] L_3; // cholesky factor of correlation matrix
vector<lower=0>[M_4] sd_4; // group-level standard deviations
vector[N_4] z_4[M_4]; // standardized group-level effects
vector<lower=0>[M_5] sd_5; // group-level standard deviations
vector[N_5] z_5[M_5]; // standardized group-level effects
vector<lower=0>[M_6] sd_6; // group-level standard deviations
vector[N_6] z_6[M_6]; // standardized group-level effects
}
transformed parameters {
matrix[N_1, M_1] r_1; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_1] r_1_1;
vector[N_1] r_1_2;
vector[N_1] r_1_3;
vector[N_1] r_1_4;
vector[N_1] r_1_5;
vector[N_1] r_1_6;
vector[N_1] r_1_7;
vector[N_1] r_1_8;
vector[N_1] r_1_9;
matrix[N_2, M_2] r_2; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_2] r_2_1;
vector[N_2] r_2_2;
vector[N_2] r_2_3;
vector[N_2] r_2_4;
vector[N_2] r_2_5;
vector[N_2] r_2_6;
vector[N_2] r_2_7;
vector[N_2] r_2_8;
vector[N_2] r_2_9;
matrix[N_3, M_3] r_3; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_3] r_3_1;
vector[N_3] r_3_2;
vector[N_3] r_3_3;
vector[N_3] r_3_4;
vector[N_3] r_3_5;
vector[N_3] r_3_6;
vector[N_3] r_3_7;
vector[N_3] r_3_8;
vector[N_3] r_3_9;
vector[N_4] r_4_1; // actual group-level effects
vector[N_5] r_5_sigma_1; // actual group-level effects
vector[N_6] r_6_nu_1; // actual group-level effects
// compute actual group-level effects
r_1 = scale_r_cor(z_1, sd_1, L_1);
r_1_1 = r_1[, 1];
r_1_2 = r_1[, 2];
r_1_3 = r_1[, 3];
r_1_4 = r_1[, 4];
r_1_5 = r_1[, 5];
r_1_6 = r_1[, 6];
r_1_7 = r_1[, 7];
r_1_8 = r_1[, 8];
r_1_9 = r_1[, 9];
// compute actual group-level effects
r_2 = scale_r_cor(z_2, sd_2, L_2);
r_2_1 = r_2[, 1];
r_2_2 = r_2[, 2];
r_2_3 = r_2[, 3];
r_2_4 = r_2[, 4];
r_2_5 = r_2[, 5];
r_2_6 = r_2[, 6];
r_2_7 = r_2[, 7];
r_2_8 = r_2[, 8];
r_2_9 = r_2[, 9];
// compute actual group-level effects
r_3 = scale_r_cor(z_3, sd_3, L_3);
r_3_1 = r_3[, 1];
r_3_2 = r_3[, 2];
r_3_3 = r_3[, 3];
r_3_4 = r_3[, 4];
r_3_5 = r_3[, 5];
r_3_6 = r_3[, 6];
r_3_7 = r_3[, 7];
r_3_8 = r_3[, 8];
r_3_9 = r_3[, 9];
r_4_1 = (sd_4[1] * (z_4[1]));
r_5_sigma_1 = (sd_5[1] * (z_5[1]));
r_6_nu_1 = (sd_6[1] * (z_6[1]));
}
model {
// likelihood including constants
if (!prior_only) {
// initialize linear predictor term
vector[N] mu = X * b;
// initialize linear predictor term
vector[N] sigma = rep_vector(0.0, N);
// initialize linear predictor term
vector[N] nu = rep_vector(0.0, N);
//for (n in 1:N) {
// add more terms to the linear predictor
mu += r_1_1[J_1] .* Z_1_1 + r_1_2[J_1] .* Z_1_2 + r_1_3[J_1] .* Z_1_3 + r_1_4[J_1] .* Z_1_4 + r_1_5[J_1] .* Z_1_5
+ r_1_6[J_1] .* Z_1_6 + r_1_7[J_1] .* Z_1_7 + r_1_8[J_1] .* Z_1_8 + r_1_9[J_1] .* Z_1_9
+ r_2_1[J_2] .* Z_2_1 + r_2_2[J_2] .* Z_2_2 + r_2_3[J_2] .* Z_2_3 + r_2_4[J_2] .* Z_2_4
+ r_2_5[J_2] .* Z_2_5 + r_2_6[J_2] .* Z_2_6 + r_2_7[J_2] .* Z_2_7 + r_2_8[J_2] .* Z_2_8
+ r_2_9[J_2] .* Z_2_9 + r_3_1[J_3] .* Z_3_1 + r_3_2[J_3] .* Z_3_2 + r_3_3[J_3] .* Z_3_3
+ r_3_4[J_3] .* Z_3_4 + r_3_5[J_3] .* Z_3_5 + r_3_6[J_3] .* Z_3_6 + r_3_7[J_3] .* Z_3_7
+ r_3_8[J_3] .* Z_3_8 + r_3_9[J_3] .* Z_3_9 + r_4_1[J_4] .* Z_4_1;
// }
//for (n in 1:N) {
// add more terms to the linear predictor
sigma += r_5_sigma_1[J_5] .* Z_5_sigma_1;
//}
//for (n in 1:N) {
// add more terms to the linear predictor
nu += r_6_nu_1[J_6] .* Z_6_nu_1;
//}
//for (n in 1:N) {
// apply the inverse link function
sigma = exp(sigma);
// }
for (n in 1:N) {
// apply the inverse link function
nu[n] = expp1(nu[n]);
}
target += -norm18;
target += -norm10;
target += -norm0_1;
target += -norm1_5;
target += student_t_lpdf(Y | nu, mu, sigma);
}
Can nu be vectorized here or can I create a vector when calling the distribution?
// priors including constants
b[1] ~ normal(5,5);
b[2] ~ normal(0,3);
b[3] ~ normal(0,3);
b[4] ~ normal(0,3);
b[5] ~ normal(0,3);
b[6] ~ normal(0,3);
b[7] ~ normal(0,3);
b[8] ~ normal(0,3);
b[9] ~ normal(0,3);
b[10] ~ normal(0,3);
b[11] ~ normal(0,3);
b[12] ~ normal(0,3);
b[13] ~ normal(0,3);
b[14] ~ normal(0,3);
b[15] ~ normal(0,3);
b[16] ~ normal(0,3);
b[17] ~ normal(0,3);
b[18] ~ normal(0,3);
b[19] ~ normal(0,3);
b[20] ~ normal(0,3);
b[21] ~ normal(0,3);
b[22] ~ normal(0,3);
b[23] ~ normal(0,3);
b[24] ~ normal(0,3);
b[25] ~ normal(0,3);
b[26] ~ normal(0,3);
b[27] ~ normal(0,3);
b[28] ~ normal(0,3);
b[29] ~ normal(0,3);
b[30] ~ normal(0,3);
b[31] ~ normal(0,3);
b[32] ~ normal(0,3);
b[33] ~ normal(0,3);
b[34] ~ normal(0,3);
b[35] ~ normal(0,3);
b[36] ~ normal(0,3);
b[37] ~ normal(0,3);
b[38] ~ normal(0,3);
//target += normal_lpdf(sd_1 | 1,1)
//- 9 * normal_lccdf(0 | 1,1);
sd_1 ~ normal(1,1);
z_1[1] ~ std_normal();
z_1[2] ~ std_normal();
z_1[3] ~ std_normal();
z_1[4] ~ std_normal();
z_1[5] ~ std_normal();
z_1[6] ~ std_normal();
z_1[7] ~ std_normal();
z_1[8] ~ std_normal();
z_1[9] ~ std_normal();
L_1 ~ lkj_corr_cholesky(1);
sd_2 ~ normal(1,1);
z_2[1] ~ std_normal();
z_2[2] ~ std_normal();
z_2[3] ~ std_normal();
z_2[4] ~ std_normal();
z_2[5] ~ std_normal();
z_2[6] ~ std_normal();
z_2[7] ~ std_normal();
z_2[8] ~ std_normal();
z_2[9] ~ std_normal();
L_2 ~ lkj_corr_cholesky(1);
sd_3 ~ normal(0,0.5);
z_3[1] ~ std_normal();
z_3[2] ~ std_normal();
z_3[3] ~ std_normal();
z_3[4] ~ std_normal();
z_3[5] ~ std_normal();
z_3[6] ~ std_normal();
z_3[7] ~ std_normal();
z_3[8] ~ std_normal();
z_3[9] ~ std_normal();
L_3 ~ lkj_corr_cholesky(1);
sd_4[1] ~ normal(0,0.5);
z_4[1] ~ std_normal();
sd_5[1] ~ normal(0,0.1);
z_5[1] ~ std_normal();
sd_5[1] ~ normal(0,1.5);
z_6[1] ~ std_normal();
}
I think the ~ syntax for priors helps.
generated quantities {
// compute group-level correlations
corr_matrix[M_1] Cor_1 = multiply_lower_tri_self_transpose(L_1);
vector<lower=-1,upper=1>[NC_1] cor_1;
// compute group-level correlations
corr_matrix[M_2] Cor_2 = multiply_lower_tri_self_transpose(L_2);
vector<lower=-1,upper=1>[NC_2] cor_2;
// compute group-level correlations
corr_matrix[M_3] Cor_3 = multiply_lower_tri_self_transpose(L_3);
vector<lower=-1,upper=1>[NC_3] cor_3;
// extract upper diagonal of correlation matrix
for (k in 1:M_1) {
for (j in 1:(k - 1)) {
cor_1[choose(k - 1, 2) + j] = Cor_1[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_2) {
for (j in 1:(k - 1)) {
cor_2[choose(k - 1, 2) + j] = Cor_2[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_3) {
for (j in 1:(k - 1)) {
cor_3[choose(k - 1, 2) + j] = Cor_3[j, k];
}
}
}
Are there any additional places some vectorization might help? Can the correlation matrices in the generated quantities be vectorized?