Slow run time when fitting multi_normal_cholesky likelihood in bivariate model

Hi,

I am new to fitting multivariate models in Stan and have managed to set up a running bivariate model with a multi_normal_cholesky likelihood so that I can estimate the residual correlation between z_1 and z_2.

However, the model fits very slowly on some simulated data. I also tried fitting my full model, which has additional fixed and random effects, but cancelled that fit at 3000/5000 iterations after letting it run for over a week. The model is not vectorised because one of the variables (opponent) is arbitrarily assigned and needs to be mapped to the same cell of the matrix I.

I can’t really isolate any mistakes that would make the fit take this long, and any help with improving it is greatly appreciated! I pasted a simplified version of the Stan model below and attached the simulated dataset that I use to verify the model.

Stan_data_CdG.csv (373.7 KB)

The Stan model:

data {
  int<lower=0> n_obs;              // number of observations
  int<lower=0> n_ind;              // number of individuals

  int<lower=0> individual[n_obs];  // individual ID, repeated obs
  int<lower=0> opponent1[n_obs];   // individual ID of opponent 1, repeated obs
  int<lower=0> opponent2[n_obs];   // individual ID of opponent 2, repeated obs

  real z_1[n_obs];                 // phenotypic observations
  real z_2[n_obs];                 // phenotypic observations
}

parameters {
  // Parameters to estimate for equation 1
  real B_0;                     // intercept

  // Random effects
  matrix[4, n_ind] zI;          // intercepts and opponent effect for each individual
  vector<lower=0>[4] sigma_I;   // sd of intercepts
  cholesky_factor_corr[4] L;    // Cholesky factor to estimate the covariance of intercepts

  // Parameters to estimate for equation 2
  real B_0_2;                   // intercept

  // Residual correlation
  vector<lower=0>[2] sd_R;      // residual standard deviations
  cholesky_factor_corr[2] L_R;  // Cholesky correlation matrix for residuals
}

transformed parameters {
  matrix[4, n_ind] I;   // unscaled BLUPs: intercept and res_impact for each individual
  real e_z[n_obs];      // predicted values for phenotype 1
  real e_z_2[n_obs];    // predicted values for phenotype 2

  I = diag_pre_multiply(sigma_I, L) * zI;  // scale the standardized effects

  // Equation 1: primary producing events
  for (i in 1:n_obs) {
    e_z[i] = B_0 + I[1, individual[i]] + I[2, opponent1[i]] + I[2, opponent2[i]];
  }

  // Equation 2
  for (i in 1:n_obs) {
    e_z_2[i] = B_0_2 + I[3, individual[i]] + I[4, opponent1[i]] + I[4, opponent2[i]];
  }
}

model {
  // Fixed effects prior distributions
  B_0 ~ normal(0, 2);
  B_0_2 ~ normal(0, 2);

  // Random effects prior distributions
  to_vector(zI) ~ normal(0, 1);
  sigma_I ~ cauchy(0, 3);
  sd_R ~ exponential(3);
  L_R ~ lkj_corr_cholesky(3);
  L ~ lkj_corr_cholesky(3);

  // Residual correlation Cholesky factor
  matrix[2, 2] L_sigma = diag_pre_multiply(sd_R, L_R);

  // Likelihood: Cholesky-factored bivariate normal for the residual correlation
  for (i in 1:n_obs) {
    [z_1[i], z_2[i]]' ~ multi_normal_cholesky([e_z[i], e_z_2[i]]', L_sigma);
  }
}

generated quantities{

// residual correlations
  matrix[2,2] res_cor_mat = L_R * L_R';
  real<lower=-1,upper=1> res_cor;
  res_cor = res_cor_mat[1,2];

  real<lower=0> Sigma2_res;
  Sigma2_res = sd_R[1]^2;

  real<lower=0> Sigma2_res_2;
  Sigma2_res_2 = sd_R[2]^2;

// Correlation matrix
  matrix[4, 4]  Omega_I;
  Omega_I = L * L';

// Equation 1

  real<lower=0> Sigma2_intercept;
  real<lower=0> Sigma2_res_impact;
  real<lower=0> Sigma2_total;

  Sigma2_intercept = sigma_I[1]^2;
  Sigma2_res_impact= sigma_I[2]^2;
  Sigma2_total = Sigma2_intercept + Sigma2_res_impact + Sigma2_res;

  real var_comp_focal;
  real var_comp_opponent;
  real var_comp_res;

  var_comp_focal = Sigma2_intercept/Sigma2_total;
  var_comp_opponent = Sigma2_res_impact/Sigma2_total;
  var_comp_res = Sigma2_res/Sigma2_total;

// Equation 2

  real<lower=0> Sigma2_intercept_2;
  real<lower=0> Sigma2_res_impact_2;
  real<lower=0> Sigma2_total_2;

  Sigma2_intercept_2 = sigma_I[3]^2;
  Sigma2_res_impact_2 = sigma_I[4]^2;

  Sigma2_total_2 = Sigma2_intercept_2 + Sigma2_res_impact_2 + Sigma2_res_2;

// Estimate variance components of random effects

  real var_comp_focal_2;
  real var_comp_opponent_2;
  real var_comp_res_2;

  var_comp_focal_2 = Sigma2_intercept_2/Sigma2_total_2;
  var_comp_opponent_2 = Sigma2_res_impact_2/Sigma2_total_2;
  var_comp_res_2 = Sigma2_res_2/Sigma2_total_2;

// Covariances and correlations
  real cov_1; 
  real cor_1;
  real cov_2;
  real cor_2;
  real cov_3; 
  real cor_3;
  real cov_4;
  real cor_4;
  real cov_5; 
  real cor_5;
  real cov_6; 
  real cor_6;

  cov_1 = Omega_I[1,2]*sqrt(Sigma2_res_impact*Sigma2_intercept);
  cor_1 = Omega_I[1,2];

  cov_2 = Omega_I[1,3]*sqrt(Sigma2_intercept*Sigma2_intercept_2);
  cor_2 = Omega_I[1,3];
  
  cov_3 = Omega_I[1,4]*sqrt(Sigma2_res_impact_2*Sigma2_intercept);
  cor_3 = Omega_I[1,4];

  cov_4 = Omega_I[2,3]*sqrt(Sigma2_res_impact*Sigma2_intercept_2);
  cor_4 = Omega_I[2,3];

  cov_5 = Omega_I[2,4]*sqrt(Sigma2_res_impact*Sigma2_res_impact_2);
  cor_5 = Omega_I[2,4];

  cov_6 = Omega_I[3,4]*sqrt(Sigma2_res_impact_2*Sigma2_intercept_2);
  cor_6 = Omega_I[3,4];
}

Fitting the model, building the data list for it, and specifying the parameters to report in the model summary:

params_bivar_cholesky <- c("B_0",  
                           "Sigma2_intercept", "Sigma2_res_impact", "Sigma2_res", "Sigma2_total",
                           "var_comp_focal", "var_comp_opponent", "var_comp_res",
                           "B_0_2",  
                           "Sigma2_intercept_2", "Sigma2_res_impact_2", "Sigma2_res_2", "Sigma2_total_2",
                           "var_comp_focal_2", "var_comp_opponent_2",  "var_comp_res_2",
                           "cov_1", "cor_1", "cov_2", "cor_2", "cov_3", "cor_3", "cov_4", "cor_4", "cov_5","cor_5", "cov_6", "cor_6", "res_cor")

stan_bivar_data <- list(n_obs = nrow(df),
                        n_ind = length(unique(df$n_ind)),
                        individual = df$individual,
                        opponent1 = df$opponent1,
                        opponent2 = df$opponent2,
                        z_1 = df$z_1,
                        z_2 = df$z_2)

md <- stan("RR_cholesky_multi_normal_test.stan", data = stan_bivar_data,
           pars = params_bivar_cholesky,
           chains = 3, iter = 5000,  
           warmup = 1000, thin = 2, cores = 6)

round(summary(md)$summary[,c(1,4,6, 8, 9,10)],3)

Hi @CdeGroot, and welcome to the Stan forums. Sorry this took a while to answer; we seem to have gotten a lot of involved modeling questions recently.

The slowdown is from this call:
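
 [z_1[i], z_2[i]]' ~ multi_normal_cholesky([e_z[i], e_z_2[i]]', L_sigma);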

This requires the Cholesky factor L_sigma to be solved n_obs times. The thing to do is arrange all this data into an array of vectors and make the call once:

array[n_obs] vector[2] zs;
array[n_obs] vector[2] mus;
for (i in 1:n_obs) {
  zs[i] = [z_1[i], z_2[i]]';
  mus[i] = [e_z[i], e_z_2[i]]';
}
zs ~ multi_normal_cholesky(mus, L_sigma);

As an intermediate step, you could define zs in the transformed data block so you only have to build it once. Even better, just declare zs instead of z_1 and z_2 in the data block, or put the definition of mus in the transformed parameters block. Having said that, loops in Stan that don’t involve autodiff are pretty fast. Same for the parameters.
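For example, the data block could read zs directly, so the packing never happens inside the model (a minimal sketch; the other declarations stay as they are):

data {
  int<lower=0> n_obs;
  // ... other declarations unchanged ...
  array[n_obs] vector[2] zs;  // z_1 and z_2 packed together, one vector per observation
}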

You can also clean up a bunch of the code by using declare/define. For example,

matrix[4, n_ind] I = diag_pre_multiply(sigma_I, L) * zI; 

And then just define mus in the transformed parameters block instead of what you have. That can also be simplified and sped up by replacing

for (i in 1:n_obs) {
  e_z[i] = B_0 + I[1, individual[i]] + I[2, opponent1[i]] + I[2, opponent2[i]];
}

with

e_z = B_0 + I[1, individual] + I[2, opponent1] + I[2, opponent2];

That will probably require declaring some things you currently have as arrays as vectors instead. The advantage is that vectorization speeds up autodiff by simplifying the expression graph.
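For instance, if e_z and e_z_2 are declared as row vectors, each definition collapses to a one-liner (a sketch, assuming individual, opponent1, and opponent2 remain integer arrays so the multi-indexing picks out n_obs entries at once):

row_vector[n_obs] e_z = B_0 + I[1, individual] + I[2, opponent1] + I[2, opponent2];
row_vector[n_obs] e_z_2 = B_0_2 + I[3, individual] + I[4, opponent1] + I[4, opponent2];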

You might want to drop the generated quantities until you get everything else fitting, as there’s a lot going on there and it could slow things down. You can always compute them in a second pass using standalone generated quantities if there turns out to be a real speed/scaling problem with fitting.
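In rstan that second pass looks roughly like this (a minimal sketch, not tested against this model: it assumes a hypothetical file gq_only.stan that repeats the data and parameters blocks but keeps only the generated quantities, and it requires that the fit md retains every parameter the generated quantities use, i.e., no restrictive pars= when fitting):

library(rstan)

# Hypothetical program: same data and parameters blocks as the full model,
# with only the generated quantities block kept.
gq_model <- stan_model("gq_only.stan")

# Re-run the generated quantities over the posterior draws from `md`.
gq_fit <- gqs(gq_model, data = stan_bivar_data, draws = as.matrix(md))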

Hi @Bob_Carpenter,

Thanks a lot for your answer! I changed the code according to your suggestions (pasted below). It is indeed running a lot faster and is now manageable within a day: about 11 hours with warmup = 1000 and iter = 5000 on 2 chains. Fitting takes about equally long with the generated quantities included, so they do not seem to slow it down significantly. I still have a few questions about the model:

  1. I was wondering whether I have correctly vectorised the model formula now; perhaps it could still be fitted faster.
  2. The model has very low n_eff (< 2) even with 8000 samples, had low BFMI, and some iterations exceeded the maximum treedepth, so I assume there might be a misspecification somewhere. Perhaps this is also why the model is not running efficiently?

The updated Stan model:
data {
  //Phenotype dataset
  // Number of clusters (an integer)
  int<lower=0> n_obs; // number of observations for phenotypes or number of rows
  int<lower=0> n_ind; // number of individuals
  // Clusters identifiers (an integer)
  int<lower=0> individual[n_obs];  //  Individual ID repeated obs
  int<lower=0> opponent1[n_obs];  //  Individual ID opponent1 repeated obs
  int<lower=0> opponent2[n_obs];  //  Individual ID opponent2 repeated obs
  // Continuous variables
  array[n_obs] vector[2] zs;  // phenotypic observations
}
parameters {
  // Define parameters to estimate for equation 1
  real B_0; //intercept
  // Random effects
  matrix[4, n_ind] zI;          // intercepts and opponent effect for each individual
  vector<lower=0>[4] sigma_I;   // sd of intercepts and slopes
  cholesky_factor_corr[4] L;    // factor to estimate covariance of intercepts and slopes
  // Define parameters to estimate for equation 2
  // Parameters for model of phenotype
  real B_0_2; //intercept
  // Residual correlation 
  vector<lower=0>[2] sd_R; //residual standard deviations;
  cholesky_factor_corr[2] L_R; //Cholesky corr matrix for residuals
}
transformed parameters {
  matrix[4, n_ind] I = diag_pre_multiply(sigma_I, L) * zI;  // unscaled BLUPs: intercept and res_impact for each individual
  // Equation 1: primary producing events
  row_vector[n_obs] e_z = B_0 + I[1, individual] + I[2, opponent1] + I[2, opponent2];      // predicted values for phenotype 1
  // Equation 2
  row_vector[n_obs] e_z_2 = B_0_2 + I[3, individual] + I[4, opponent1] + I[4, opponent2];  // predicted values for phenotype 2
}
model {
  // Fixed effects prior distributions
  B_0 ~ normal(0, 2); 
  B_0_2 ~ normal(0, 2); 
  // Random effects prior distribution
  to_vector(zI) ~ normal(0, 1);
  sigma_I ~ cauchy(0, 3);
  sd_R ~ exponential(3);
  L_R ~ lkj_corr_cholesky(3);
  L ~ lkj_corr_cholesky(3);
  // Residual correlation Cholesky factor
  matrix[2, 2] L_sigma = diag_pre_multiply(sd_R, L_R);
  array[n_obs] vector[2] mus;
  for (i in 1:n_obs) {
    mus[i] = [e_z[i], e_z_2[i]]';
  }
  // Cholesky multinormal likelihood to estimate the residual correlations
  zs ~ multi_normal_cholesky(mus, L_sigma);
}
generated quantities{
  // residual correlations
  matrix[2,2] res_cor_mat = L_R * L_R';
  real<lower=-1,upper=1> res_cor = res_cor_mat[1,2];
  real<lower=0> Sigma2_res = sd_R[1]^2;
  real<lower=0> Sigma2_res_2 = sd_R[2]^2;
  // Correlation matrix
  matrix[4, 4]  Omega_I = L * L';
  // Equation 1
  real<lower=0> Sigma2_intercept = sigma_I[1]^2;
  real<lower=0> Sigma2_res_impact = sigma_I[2]^2;
  real<lower=0> Sigma2_total = Sigma2_intercept + Sigma2_res_impact + Sigma2_res;
  // Estimate variance components of random effects Eq1
  real var_comp_focal = Sigma2_intercept/Sigma2_total;
  real var_comp_opponent = Sigma2_res_impact/Sigma2_total;
  real var_comp_res = Sigma2_res/Sigma2_total;
  // Equation 2
  real<lower=0> Sigma2_intercept_2 = sigma_I[3]^2;
  real<lower=0> Sigma2_res_impact_2 = sigma_I[4]^2;
  real<lower=0> Sigma2_total_2 = Sigma2_intercept_2 + Sigma2_res_impact_2 + Sigma2_res_2;
  // Estimate variance components of random effects Eq2
  real var_comp_focal_2 = Sigma2_intercept_2/Sigma2_total_2;
  real var_comp_opponent_2 = Sigma2_res_impact_2/Sigma2_total_2;
  real var_comp_res_2 = Sigma2_res_2/Sigma2_total_2;
  // Covariances and correlations
  real cov_1 = Omega_I[1,2]*sqrt(Sigma2_res_impact*Sigma2_intercept); 
  real cor_1 = Omega_I[1,2];
  real cov_2 = Omega_I[1,3]*sqrt(Sigma2_intercept*Sigma2_intercept_2);
  real cor_2 = Omega_I[1,3];
  real cov_3 = Omega_I[1,4]*sqrt(Sigma2_res_impact_2*Sigma2_intercept);
  real cor_3 = Omega_I[1,4];
  real cov_4 = Omega_I[2,3]*sqrt(Sigma2_res_impact*Sigma2_intercept_2);
  real cor_4 = Omega_I[2,3];
  real cov_5 = Omega_I[2,4]*sqrt(Sigma2_res_impact*Sigma2_res_impact_2);
  real cor_5 = Omega_I[2,4];
  real cov_6 = Omega_I[3,4]*sqrt(Sigma2_res_impact_2*Sigma2_intercept_2);
  real cor_6 = Omega_I[3,4];
}

The edited data assignment:

stan_bivar_data <- list(n_obs = nrow(df),
                        n_ind = length(unique(df$n_ind)),
                        individual = df$individual,
                        opponent1 = df$opponent1,
                        opponent2 = df$opponent2,
                        zs = as.matrix(cbind(df$z_1, df$z_2)))