Hi all,
I am currently running into problems specifying multiple multilevel terms for a very heterogeneous dataset. My main data points are the durations of around 900,000 segments from 51 different languages, with data from multiple speakers per language. To do things properly, I would need to control for the following grouping variables using multilevel terms:
- Languages (and Speakers, between 1 and 20 per language, nested within languages)
- Sound Class (around 80 sound classes, with between 10 and more than 10,000 observations each)
In addition, I need to control for the phylogeny of the languages involved (using a group-level term with a known phylogenetic covariance matrix) and for contact between the languages in question (using a Gaussian process over longitude and latitude, grouped by macroarea, i.e. roughly by continent). On top of that, there are three more covariates with a causal influence on duration (z_num_phones, z_word_freq, z_speech_rate), as well as my main point of interest: the position of the segment in the word.
I can barely get my head around all the varying terms involved, and the fitted model seems to struggle just as much. Is it even possible and reasonable to successfully run a model with so many different multilevel terms?
All this results in the following brms formula:
Duration ~ 1 + utt_initial + word_initial +
  gp(Longitude, Latitude, by = Macroarea, gr = TRUE) +
  (1 + utt_initial + word_initial | Language/Speaker) +
  (1 + utt_initial + word_initial | CLTS) +
  (1 + utt_initial + word_initial | gr(phylo, cov = phylo)) +
  z_num_phones + z_word_freq + z_speech_rate
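For context, this formula sits inside a brm() call. The generated Stan code below implies a Gamma likelihood with a log link (gamma_lpdf combined with mu = exp(mu)), so the full call looks roughly like the following sketch; d and phylo_cov are placeholder names for my data frame and the phylogenetic covariance matrix, the custom priors visible in the Stan code are omitted, and the sampler settings are illustrative:

library(brms)

fit <- brm(
  Duration ~ 1 + utt_initial + word_initial +
    gp(Longitude, Latitude, by = Macroarea, gr = TRUE) +
    (1 + utt_initial + word_initial | Language/Speaker) +
    (1 + utt_initial + word_initial | CLTS) +
    (1 + utt_initial + word_initial | gr(phylo, cov = phylo)) +
    z_num_phones + z_word_freq + z_speech_rate,
  data = d,                        # placeholder: segment-level data frame
  data2 = list(phylo = phylo_cov), # placeholder: phylogenetic covariance matrix
  family = Gamma(link = "log"),    # matches the gamma_lpdf / exp(mu) below
  chains = 4, cores = 4            # illustrative sampler settings
)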
brms translates this into the following Stan model:
// generated with brms 2.20.15
functions {
/* compute correlated group-level effects
* Args:
* z: matrix of unscaled group-level effects
* SD: vector of standard deviation parameters
* L: cholesky factor correlation matrix
* Returns:
* matrix of scaled group-level effects
*/
matrix scale_r_cor(matrix z, vector SD, matrix L) {
// r is stored in another dimension order than z
return transpose(diag_pre_multiply(SD, L) * z);
}
/* compute correlated group-level effects
* in the presence of a within-group covariance matrix
* Args:
* z: matrix of unscaled group-level effects
* SD: vector of standard deviation parameters
* L: cholesky factor correlation matrix
* Lcov: cholesky factor of within-group correlation matrix
* Returns:
* matrix of scaled group-level effects
*/
matrix scale_r_cor_cov(matrix z, vector SD, matrix L, matrix Lcov) {
vector[num_elements(z)] z_flat = to_vector(z);
vector[num_elements(z)] r = rep_vector(0, num_elements(z));
matrix[rows(L), cols(L)] LC = diag_pre_multiply(SD, L);
int rows_z = rows(z);
int rows_L = rows(L);
// kronecker product of cholesky factors times a vector
for (icov in 1:rows(Lcov)) {
for (jcov in 1:icov) {
if (Lcov[icov, jcov] > 1e-10) {
// avoid calculating products between unrelated individuals
for (i in 1:rows_L) {
for (j in 1:i) {
// incremented element of the output vector
int k = (rows_L * (icov - 1)) + i;
// applied element of the input vector
int l = (rows_L * (jcov - 1)) + j;
r[k] = r[k] + Lcov[icov, jcov] * LC[i, j] * z_flat[l];
}
}
}
}
}
// r is returned in another dimension order than z
return to_matrix(r, cols(z), rows(z), 0);
}
/* compute a latent Gaussian process
* Args:
* x: array of continuous predictor values
* sdgp: marginal SD parameter
* lscale: length-scale parameter
* zgp: vector of independent standard normal variables
* Returns:
* a vector to be added to the linear predictor
*/
vector gp(data array[] vector x, real sdgp, vector lscale, vector zgp) {
int Dls = rows(lscale);
int N = size(x);
matrix[N, N] cov;
if (Dls == 1) {
// one dimensional or isotropic GP
cov = gp_exp_quad_cov(x, sdgp, lscale[1]);
} else {
// multi-dimensional non-isotropic GP
cov = gp_exp_quad_cov(x[, 1], sdgp, lscale[1]);
for (d in 2:Dls) {
cov = cov .* gp_exp_quad_cov(x[, d], 1, lscale[d]);
}
}
for (n in 1:N) {
// deal with numerical non-positive-definiteness
cov[n, n] += 1e-12;
}
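// note that this exact (non-approximate) GP requires an O(N^3) Cholesky
// factorization on every log-density evaluation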
return cholesky_decompose(cov) * zgp;
}
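// NB: the helper functions below (spd_cov_exp_quad, size_range,
// which_range, start_at_one) are emitted by brms but not called
// anywhere in this model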
/* Spectral density function of a Gaussian process
* with squared exponential covariance kernel
* Args:
* x: array of numeric values of dimension NB x D
* sdgp: marginal SD parameter
* lscale: vector of length-scale parameters
* Returns:
* numeric values of the function evaluated at 'x'
*/
vector spd_cov_exp_quad(data array[] vector x, real sdgp, vector lscale) {
int NB = dims(x)[1];
int D = dims(x)[2];
int Dls = rows(lscale);
vector[NB] out;
if (Dls == 1) {
// one dimensional or isotropic GP
real constant = square(sdgp) * (sqrt(2 * pi()) * lscale[1])^D;
real neg_half_lscale2 = -0.5 * square(lscale[1]);
for (m in 1:NB) {
out[m] = constant * exp(neg_half_lscale2 * dot_self(x[m]));
}
} else {
// multi-dimensional non-isotropic GP
real constant = square(sdgp) * sqrt(2 * pi())^D * prod(lscale);
vector[Dls] neg_half_lscale2 = -0.5 * square(lscale);
for (m in 1:NB) {
out[m] = constant * exp(dot_product(neg_half_lscale2, square(x[m])));
}
}
return out;
}
/* how many elements are in a range of integers?
* Args:
* x: an integer array
* start: start of the range (inclusive)
* end: end of the range (inclusive)
* Returns:
* a scalar integer
*/
int size_range(array[] int x, int start, int end) {
int out = 0;
for (i in 1:size(x)) {
out += (x[i] >= start && x[i] <= end);
}
return out;
}
/* which elements are in a range of integers?
* Args:
* x: an integer array
* start: start of the range (inclusive)
* end: end of the range (inclusive)
* Returns:
* an integer array
*/
array[] int which_range(array[] int x, int start, int end) {
array[size_range(x, start, end)] int out;
int j = 1;
for (i in 1:size(x)) {
if (x[i] >= start && x[i] <= end) {
out[j] = i;
j += 1;
}
}
return out;
}
/* adjust array values to x - start + 1
* Args:
* x: an integer array
* start: start of the range of values in x (inclusive)
* Returns:
* an integer array
*/
array[] int start_at_one(array[] int x, int start) {
array[size(x)] int out;
for (i in 1:size(x)) {
out[i] = x[i] - start + 1;
}
return out;
}
}
data {
int<lower=1> N; // total number of observations
vector[N] Y; // response variable
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
int<lower=1> Kc; // number of population-level effects after centering
// data related to GPs
int<lower=1> Kgp_1; // number of sub-GPs (equal to 1 unless 'by' was used)
int<lower=1> Dgp_1; // GP dimension
// number of observations relevant for a certain sub-GP
array[Kgp_1] int<lower=1> Ngp_1;
// indices and contrasts of sub-GPs per observation
array[Ngp_1[1]] int<lower=1> Igp_1_1;
vector[Ngp_1[1]] Cgp_1_1;
array[Ngp_1[2]] int<lower=1> Igp_1_2;
vector[Ngp_1[2]] Cgp_1_2;
array[Ngp_1[3]] int<lower=1> Igp_1_3;
vector[Ngp_1[3]] Cgp_1_3;
array[Ngp_1[4]] int<lower=1> Igp_1_4;
vector[Ngp_1[4]] Cgp_1_4;
array[Ngp_1[5]] int<lower=1> Igp_1_5;
vector[Ngp_1[5]] Cgp_1_5;
array[Ngp_1[6]] int<lower=1> Igp_1_6;
vector[Ngp_1[6]] Cgp_1_6;
// number of latent GP groups
array[Kgp_1] int<lower=1> Nsubgp_1;
// indices of latent GP groups per observation
array[Ngp_1[1]] int<lower=1> Jgp_1_1;
// indices of latent GP groups per observation
array[Ngp_1[2]] int<lower=1> Jgp_1_2;
// indices of latent GP groups per observation
array[Ngp_1[3]] int<lower=1> Jgp_1_3;
// indices of latent GP groups per observation
array[Ngp_1[4]] int<lower=1> Jgp_1_4;
// indices of latent GP groups per observation
array[Ngp_1[5]] int<lower=1> Jgp_1_5;
// indices of latent GP groups per observation
array[Ngp_1[6]] int<lower=1> Jgp_1_6;
// covariates of the GP
array[Nsubgp_1[1]] vector[Dgp_1] Xgp_1_1;
array[Nsubgp_1[2]] vector[Dgp_1] Xgp_1_2;
array[Nsubgp_1[3]] vector[Dgp_1] Xgp_1_3;
array[Nsubgp_1[4]] vector[Dgp_1] Xgp_1_4;
array[Nsubgp_1[5]] vector[Dgp_1] Xgp_1_5;
array[Nsubgp_1[6]] vector[Dgp_1] Xgp_1_6;
// data for group-level effects of ID 1
int<lower=1> N_1; // number of grouping levels
int<lower=1> M_1; // number of coefficients per level
array[N] int<lower=1> J_1; // grouping indicator per observation
// group-level predictor values
vector[N] Z_1_1;
vector[N] Z_1_2;
vector[N] Z_1_3;
int<lower=1> NC_1; // number of group-level correlations
// data for group-level effects of ID 2
int<lower=1> N_2; // number of grouping levels
int<lower=1> M_2; // number of coefficients per level
array[N] int<lower=1> J_2; // grouping indicator per observation
// group-level predictor values
vector[N] Z_2_1;
vector[N] Z_2_2;
vector[N] Z_2_3;
int<lower=1> NC_2; // number of group-level correlations
// data for group-level effects of ID 3
int<lower=1> N_3; // number of grouping levels
int<lower=1> M_3; // number of coefficients per level
array[N] int<lower=1> J_3; // grouping indicator per observation
// group-level predictor values
vector[N] Z_3_1;
vector[N] Z_3_2;
vector[N] Z_3_3;
int<lower=1> NC_3; // number of group-level correlations
// data for group-level effects of ID 4
int<lower=1> N_4; // number of grouping levels
int<lower=1> M_4; // number of coefficients per level
array[N] int<lower=1> J_4; // grouping indicator per observation
matrix[N_4, N_4] Lcov_4; // cholesky factor of known covariance matrix
// group-level predictor values
vector[N] Z_4_1;
vector[N] Z_4_2;
vector[N] Z_4_3;
int<lower=1> NC_4; // number of group-level correlations
int prior_only; // should the likelihood be ignored?
}
transformed data {
matrix[N, Kc] Xc; // centered version of X without an intercept
vector[Kc] means_X; // column means of X before centering
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}
parameters {
vector[Kc] b; // regression coefficients
real Intercept; // temporary intercept for centered predictors
vector<lower=0>[Kgp_1] sdgp_1; // GP standard deviation parameters
array[Kgp_1] vector<lower=0>[1] lscale_1; // GP length-scale parameters
// latent variables of the GP
vector[Nsubgp_1[1]] zgp_1_1;
vector[Nsubgp_1[2]] zgp_1_2;
vector[Nsubgp_1[3]] zgp_1_3;
vector[Nsubgp_1[4]] zgp_1_4;
vector[Nsubgp_1[5]] zgp_1_5;
vector[Nsubgp_1[6]] zgp_1_6;
real<lower=0> shape; // shape parameter
vector<lower=0>[M_1] sd_1; // group-level standard deviations
matrix[M_1, N_1] z_1; // standardized group-level effects
cholesky_factor_corr[M_1] L_1; // cholesky factor of correlation matrix
vector<lower=0>[M_2] sd_2; // group-level standard deviations
matrix[M_2, N_2] z_2; // standardized group-level effects
cholesky_factor_corr[M_2] L_2; // cholesky factor of correlation matrix
vector<lower=0>[M_3] sd_3; // group-level standard deviations
matrix[M_3, N_3] z_3; // standardized group-level effects
cholesky_factor_corr[M_3] L_3; // cholesky factor of correlation matrix
vector<lower=0>[M_4] sd_4; // group-level standard deviations
matrix[M_4, N_4] z_4; // standardized group-level effects
cholesky_factor_corr[M_4] L_4; // cholesky factor of correlation matrix
}
transformed parameters {
matrix[N_1, M_1] r_1; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_1] r_1_1;
vector[N_1] r_1_2;
vector[N_1] r_1_3;
matrix[N_2, M_2] r_2; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_2] r_2_1;
vector[N_2] r_2_2;
vector[N_2] r_2_3;
matrix[N_3, M_3] r_3; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_3] r_3_1;
vector[N_3] r_3_2;
vector[N_3] r_3_3;
matrix[N_4, M_4] r_4; // actual group-level effects
// using vectors speeds up indexing in loops
vector[N_4] r_4_1;
vector[N_4] r_4_2;
vector[N_4] r_4_3;
real lprior = 0; // prior contributions to the log posterior
// compute actual group-level effects
r_1 = scale_r_cor(z_1, sd_1, L_1);
r_1_1 = r_1[, 1];
r_1_2 = r_1[, 2];
r_1_3 = r_1[, 3];
// compute actual group-level effects
r_2 = scale_r_cor(z_2, sd_2, L_2);
r_2_1 = r_2[, 1];
r_2_2 = r_2[, 2];
r_2_3 = r_2[, 3];
// compute actual group-level effects
r_3 = scale_r_cor(z_3, sd_3, L_3);
r_3_1 = r_3[, 1];
r_3_2 = r_3[, 2];
r_3_3 = r_3[, 3];
// compute actual group-level effects
r_4 = scale_r_cor_cov(z_4, sd_4, L_4, Lcov_4);
r_4_1 = r_4[, 1];
r_4_2 = r_4[, 2];
r_4_3 = r_4[, 3];
lprior += normal_lpdf(b | 0, 0.3);
lprior += normal_lpdf(Intercept | 4.5, 0.1);
lprior += exponential_lpdf(sdgp_1 | 5);
lprior += inv_gamma_lpdf(lscale_1[1][1] | 1.772337, 0.101952);
lprior += inv_gamma_lpdf(lscale_1[2][1] | 13.210624, 6.242006);
lprior += inv_gamma_lpdf(lscale_1[3][1] | 1.494197, 0.056607);
lprior += inv_gamma_lpdf(lscale_1[4][1] | 6.161284, 1.870791);
lprior += inv_gamma_lpdf(lscale_1[5][1] | 1.494197, 0.056607);
lprior += inv_gamma_lpdf(lscale_1[6][1] | 3.340648, 0.558666);
lprior += normal_lpdf(shape | 6, 0.5)
- 1 * normal_lccdf(0 | 6, 0.5);
lprior += exponential_lpdf(sd_1 | 12);
lprior += lkj_corr_cholesky_lpdf(L_1 | 5);
lprior += exponential_lpdf(sd_2 | 12);
lprior += lkj_corr_cholesky_lpdf(L_2 | 5);
lprior += exponential_lpdf(sd_3 | 12);
lprior += lkj_corr_cholesky_lpdf(L_3 | 5);
lprior += exponential_lpdf(sd_4 | 12);
lprior += lkj_corr_cholesky_lpdf(L_4 | 5);
}
model {
// likelihood including constants
if (!prior_only) {
vector[Nsubgp_1[1]] gp_pred_1_1 = gp(Xgp_1_1, sdgp_1[1], lscale_1[1], zgp_1_1);
vector[Nsubgp_1[2]] gp_pred_1_2 = gp(Xgp_1_2, sdgp_1[2], lscale_1[2], zgp_1_2);
vector[Nsubgp_1[3]] gp_pred_1_3 = gp(Xgp_1_3, sdgp_1[3], lscale_1[3], zgp_1_3);
vector[Nsubgp_1[4]] gp_pred_1_4 = gp(Xgp_1_4, sdgp_1[4], lscale_1[4], zgp_1_4);
vector[Nsubgp_1[5]] gp_pred_1_5 = gp(Xgp_1_5, sdgp_1[5], lscale_1[5], zgp_1_5);
vector[Nsubgp_1[6]] gp_pred_1_6 = gp(Xgp_1_6, sdgp_1[6], lscale_1[6], zgp_1_6);
// initialize linear predictor term
vector[N] mu = rep_vector(0.0, N);
mu[Igp_1_1] += Cgp_1_1 .* gp_pred_1_1[Jgp_1_1];
mu[Igp_1_2] += Cgp_1_2 .* gp_pred_1_2[Jgp_1_2];
mu[Igp_1_3] += Cgp_1_3 .* gp_pred_1_3[Jgp_1_3];
mu[Igp_1_4] += Cgp_1_4 .* gp_pred_1_4[Jgp_1_4];
mu[Igp_1_5] += Cgp_1_5 .* gp_pred_1_5[Jgp_1_5];
mu[Igp_1_6] += Cgp_1_6 .* gp_pred_1_6[Jgp_1_6];
mu += Intercept + Xc * b;
for (n in 1:N) {
// add more terms to the linear predictor
mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_1_2[J_1[n]] * Z_1_2[n] + r_1_3[J_1[n]] * Z_1_3[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_2_2[J_2[n]] * Z_2_2[n] + r_2_3[J_2[n]] * Z_2_3[n] + r_3_1[J_3[n]] * Z_3_1[n] + r_3_2[J_3[n]] * Z_3_2[n] + r_3_3[J_3[n]] * Z_3_3[n] + r_4_1[J_4[n]] * Z_4_1[n] + r_4_2[J_4[n]] * Z_4_2[n] + r_4_3[J_4[n]] * Z_4_3[n];
}
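// apply the inverse (log) link; with rate = shape ./ mu, the Gamma
// likelihood below has mean mu and variance mu^2 / shape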
mu = exp(mu);
target += gamma_lpdf(Y | shape, shape ./ mu);
}
// priors including constants
target += lprior;
target += std_normal_lpdf(zgp_1_1);
target += std_normal_lpdf(zgp_1_2);
target += std_normal_lpdf(zgp_1_3);
target += std_normal_lpdf(zgp_1_4);
target += std_normal_lpdf(zgp_1_5);
target += std_normal_lpdf(zgp_1_6);
target += std_normal_lpdf(to_vector(z_1));
target += std_normal_lpdf(to_vector(z_2));
target += std_normal_lpdf(to_vector(z_3));
target += std_normal_lpdf(to_vector(z_4));
}
generated quantities {
// actual population-level intercept
real b_Intercept = Intercept - dot_product(means_X, b);
// compute group-level correlations
corr_matrix[M_1] Cor_1 = multiply_lower_tri_self_transpose(L_1);
vector<lower=-1,upper=1>[NC_1] cor_1;
// compute group-level correlations
corr_matrix[M_2] Cor_2 = multiply_lower_tri_self_transpose(L_2);
vector<lower=-1,upper=1>[NC_2] cor_2;
// compute group-level correlations
corr_matrix[M_3] Cor_3 = multiply_lower_tri_self_transpose(L_3);
vector<lower=-1,upper=1>[NC_3] cor_3;
// compute group-level correlations
corr_matrix[M_4] Cor_4 = multiply_lower_tri_self_transpose(L_4);
vector<lower=-1,upper=1>[NC_4] cor_4;
// extract upper diagonal of correlation matrix
for (k in 1:M_1) {
for (j in 1:(k - 1)) {
cor_1[choose(k - 1, 2) + j] = Cor_1[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_2) {
for (j in 1:(k - 1)) {
cor_2[choose(k - 1, 2) + j] = Cor_2[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_3) {
for (j in 1:(k - 1)) {
cor_3[choose(k - 1, 2) + j] = Cor_3[j, k];
}
}
// extract upper diagonal of correlation matrix
for (k in 1:M_4) {
for (j in 1:(k - 1)) {
cor_4[choose(k - 1, 2) + j] = Cor_4[j, k];
}
}
}
One thing I thought about was to at least remove the Gaussian process from the model and instead fit it to the resulting estimates afterwards, to see how much of the final estimate can be explained by contact, rather than making it part of the initial inference. Another option would be to drop the varying slopes on the phylogenetic term, as sketched below. But even then, the model takes months to finish, and the estimates are dominated by huge amounts of uncertainty.
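Concretely, the reduced formula would look something like this (a sketch with both simplifications applied: the GP term removed and the phylogenetic term reduced to a varying intercept):

Duration ~ 1 + utt_initial + word_initial +
  z_num_phones + z_word_freq + z_speech_rate +
  (1 + utt_initial + word_initial | Language/Speaker) +
  (1 + utt_initial + word_initial | CLTS) +
  (1 | gr(phylo, cov = phylo))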
Are there other ways to ease inference without cutting too much of the desired control? How many multilevel terms can I realistically expect to fit? And can I reduce the model's complexity by working with the Stan code directly while maintaining the controls?