Issue with covariance matrix in Stan

Canaan_Breiss · June 23, 2023, 11:40pm

Hi, I’m extremely new to using covariance matrices in Stan, and I’m running into some errors. My code is as follows:

// Define the beta-binomial regression model
data {
  int<lower=0> N;                      // Number of adjectives
  int<lower=0> y_comparative[N];       // Observed morphological comparative
  int<lower=0> y_superlative[N];       // Observed morphological superlative

  real FreqNonCentered_lemma[N];       // Lemma frequency (summed log freq of each)
  real FreqNonCentered_comparative[N]; // Comparative frequency
  real FreqNonCentered_superlative[N]; // Superlative frequency

  int<lower=0> n_comparative[N];       // Total number of comparative forms
  int<lower=0> n_superlative[N];       // Total number of superlative forms
  int<lower=0> K;                      // Number of predictors
  matrix[N, K] X;                      // Design matrix
  matrix[2, N] mu;                     // Location parameter for bivariate_offsets
}

parameters {
  vector[K] shared_betas;              // Regression coefficients
  real beta_freq;
  real alpha;
  matrix[2, N] bivariate_offsets;
  real rho;
}

model {
  vector[N] eta_comparative;           // Probability of success for comparative
  vector[N] eta_superlative;           // Probability of success for superlative
  vector[N] sigma_comparative;         // Standard deviation for comparative
  vector[N] sigma_superlative;         // Standard deviation for superlative
  matrix[2, 2] Sigmas[N];              // Covariance matrix for each observation

  rho ~ uniform(-1, 1);

  // Priors
  shared_betas ~ normal(0, 10);        // Prior for regression coefficients
  alpha ~ normal(0, 10);
  beta_freq ~ normal(0, 10);

  // Calculate the standard deviations and covariance matrices
  for (i in 1:N) {
    sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
    sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);

    Sigmas[i][1, 1] = square(sigma_comparative[i]);
    Sigmas[i][2, 2] = square(sigma_superlative[i]);
    Sigmas[i][1, 2] = sigma_comparative[i] * rho * sigma_superlative[i];
    Sigmas[i][2, 1] = sigma_comparative[i] * rho * sigma_superlative[i];
    bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);

  }

  // Priors for bivariate_offsets

  // Calculate the probability of success for each observation
  eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
  eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

  // Likelihood
  y_comparative ~ binomial_logit(n_comparative, eta_comparative);
  y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

My data are
data.csv (26.8 KB)
.

My R code is here:


stan_model_file <- "./Models/bivariate_stan.stan"
# Read the data into R
N <- nrow(stan_data)  # Number of observations
y_comparative <- round(stan_data$Count_COMP*stan_data$PercentMorph_COMP)+1  # Observed successes
y_superlative <- round(stan_data$Count_SUP*stan_data$PercentMorph_SUP)+1  # Observed successes

n_comparative <- stan_data$Count_COMP +1 # Total number of trials
n_superlative <- stan_data$Count_SUP +1 # Total number of trials

X <- stan_data %>% 
  select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL) %>% 
  as.matrix()
K <- ncol(stan_data %>%   select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL))  # Number of predictors

FreqNonCentered_lemma <- log(stan_data$Count_COMP+stan_data$Count_SUP)
FreqNonCentered_superlative <- log(stan_data$Count_SUP)
FreqNonCentered_comparative <- log(stan_data$Count_COMP)
mu <- matrix(0, nrow = 2, ncol = N)
 # a_ulam$Freq_noncentered

# Create a list of data for STAN
stan_data_list <- list(N = N, 
                  y_comparative = y_comparative, 
                  y_superlative = y_superlative, 
                  n_comparative = n_comparative, 
                  n_superlative = n_superlative, 
                  X = X, 
                  K = K, 
                  FreqNonCentered_lemma = FreqNonCentered_lemma,
                  FreqNonCentered_comparative = FreqNonCentered_comparative,
                  FreqNonCentered_superlative = FreqNonCentered_superlative,
                  mu=mu

                  )


# Compile the STAN model
model <- cmdstan_model(stan_model_file)

# Fit the model to the data
fit <- model$sample(data = stan_data_list, cores = 4, chains = 4, threads = 2, init = .1)

I’m getting errors related to the covariance matrix not being positive definite and not symmetric, and I’m not quite sure how to proceed. I realize this is probably something fairly trivial for an experienced user, but I’m new to this aspect of stan and so not sure where to go from here. Any help or suggestions would be very much appreciated!

mike-lawrence · June 24, 2023, 6:20am

Whenever making a covariance matrix by hand, add a small amount of “jitter” to the diagonal, where small is proportional to the magnitude of the values already there. I usually add 1e-5 if the diagonal is 1 and that seems to cause things to be PSD without noticeably affecting inference.

Canaan_Breiss · June 24, 2023, 1:39pm

Hi @mike-lawrence , thanks a lot for your suggestion! I tried that, plus a few other small numerical-stability tricks, and the modified model block is below:


model {
  vector[N] eta_comparative;           // Probability of success for comparative
  vector[N] eta_superlative;           // Probability of success for superlative
  vector[N] sigma_comparative;         // Standard deviation for comparative
  vector[N] sigma_superlative;         // Standard deviation for superlative
  matrix[2, 2] Sigmas[N];              // Covariance matrix for each observation
  rho ~ uniform(-1, 1);

  // Priors
  shared_betas ~ normal(0, 10);        // Prior for regression coefficients
  alpha ~ normal(0, 10);
  beta_freq ~ normal(0, 10);

  // Calculate the standard deviations and covariance matrices
  for (i in 1:N) {
    sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
    sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);{
    real jitter;
    real temp;

    real off_diag;

    jitter = 1e5;
    temp = log_sum_exp(sigma_comparative[i],sigma_superlative[i]);

    off_diag = log_sum_exp(temp, rho);

    Sigmas[i][1, 1] = square(sigma_comparative[i])+jitter;
    Sigmas[i][2, 2] = square(sigma_superlative[i])+jitter;
    Sigmas[i][1, 2] = off_diag;
    Sigmas[i][2, 1] = off_diag;;
    bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);
}
  }

  // Priors for bivariate_offsets

  // Calculate the probability of success for each observation
  eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
  eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

  // Likelihood
  y_comparative ~ binomial_logit(n_comparative, eta_comparative);
  y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

And it compiles fine, but gives the following errors:

Chain 3 Rejecting initial value:
Chain 3 Log probability evaluates to log(0), i.e. negative infinity.
Chain 3 Stan can’t start sampling from this initial value.
Chain 3 Rejecting initial value:
Chain 3 Error evaluating the log probability at the initial value.
Chain 3 Exception: multi_normal_lpdf: Covariance matrix is not symmetric. Covariance matrix[1,2] = inf, but Covariance matrix[2,1] = inf (in ‘C:/Users/canaa/AppData/Local/Temp/Rtmp2dkYGM/model-c6083d154042.stan’, line 57, column 4 to column 60)

Line 57 implies that the issue is with the line bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]); - I’m not sure what to do here. Would you mind suggesting other possible solutions?

mike-lawrence · June 24, 2023, 1:42pm

You can always ignore the “rejecting intitial value” warnings if the chains then go on to continue warm up and sampling without failing any other diagnostics.

Canaan_Breiss · June 24, 2023, 1:54pm

Ah, sorry I meant that all four chains can’t start sampling at all, there doesn’t seem to be a good place to “catch on”. So it never starts sampling, unfortunately.

mike-lawrence · June 24, 2023, 4:31pm

Could you clarify the motivation for expressing the off-diag element of the covariance as log( exp(s_c) + exp(s_s) )? I’m not quite following that math. Why not a standard representation of the covariance as a function of two variances and a correlation?

Canaan_Breiss · June 24, 2023, 6:04pm

That’s a great question - I wrote the code trying to pattern-match off something I found online, so really there’s no justification. Should I try it as just the off-diagonals being s_c and s_s just to check? so something like:

for (i in 1:N) {
sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);{

Sigmas[i][1, 1] = square(sigma_comparative[i])+jitter;
Sigmas[i][2, 2] = square(sigma_superlative[i])+jitter;
Sigmas[i][1, 2] = sigma_superlative[i];
Sigmas[i][2, 1] = sigma_comparative[i];
bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);

}

mike-lawrence · June 24, 2023, 6:07pm

Oh! Then you should just use the parameterizarion of the multivariate normal used in SUG1.13, which constructs the covariance (implicitly) for you.

Canaan_Breiss · June 24, 2023, 6:36pm

Great! I’m having a bit of trouble constructing it though - does something like this look right? Any pointers would be appreciated - it’s giving me some syntax errors I’m not sure how to solve.

Current code:

// Define the beta-binomial regression model
data {
  int<lower=0> N;                      // Number of adjectives
  int<lower=0> y_comparative[N];       // Observed morphological comparative
  int<lower=0> y_superlative[N];       // Observed morphological superlative

  real FreqNonCentered_lemma[N];       // Lemma frequency (summed log freq of each)
  real FreqNonCentered_comparative[N]; // Comparative frequency
  real FreqNonCentered_superlative[N]; // Superlative frequency

  int<lower=0> n_comparative[N];       // Total number of comparative forms
  int<lower=0> n_superlative[N];       // Total number of superlative forms
  int<lower=0> K;                      // Number of predictors
  matrix[N, K] X;                      // Design matrix
  vector[2] mu;                     // Location parameter for bivariate_offsets
}

parameters {
  vector[K] shared_betas;              // Regression coefficients
  real beta_freq;
  real alpha;
  matrix[2, N] bivariate_offsets;
  cholesky_factor_corr[2] Lcorr;
  matrix[2,N] sigmas;
}

model {
  vector[N] eta_comparative;           // Probability of success for comparative
  vector[N] eta_superlative;           // Probability of success for superlative
  vector[N] sigma_comparative;         // Standard deviation for comparative
  vector[N] sigma_superlative;         // Standard deviation for superlative
  // Priors
  shared_betas ~ normal(0, 1);        // Prior for regression coefficients
  alpha ~ normal(0, 1);
  beta_freq ~ normal(0, 1);

  // Calculate the standard deviations and covariance matrices
  for (i in 1:N) {
    vector[2] sigma;

    sigma[1] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
    sigma[2] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);
    sigmas[i,] = sigma;

    bivariate_offsets[i,] ~ multi_normal(mu, diag_pre_multiply(sigma, Lcorr));


  }


  // Priors for bivariate_offsets

  // Calculate the probability of success for each observation
  eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
  eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

  // Likelihood
  y_comparative ~ binomial_logit(n_comparative, eta_comparative);
  y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

generated quantities {
  matrix[2,2] Omega;
  matrix[2,2] Sigma;
  Omega = multiply_lower_tri_self_transpose(Lcorr);
  Sigma = quad_form_diag(Omega, sigmas);
}'

Canaan_Breiss · June 26, 2023, 3:32am

edit: @mike-lawrence , any suggestions on how to proceed? Any specifics would be appreciated!

mike-lawrence · June 26, 2023, 11:42am

In your previous code you had separate correlations for each N but here you only have one; presumably unintentional?

Also, you’re still using multi_normal, but the final code in the SUG1.13 shows how to use just normal on a helper variable matrix that gets multiplied by the cholesky factor matrix (well, diag_ore_multiply’d by the sigmas, but I prefer to just multiply by the cholesky factor matrix and elt-multiply by rep_matrix(sigmas,N).

Finally, I think you want to assign the result to bivariate_offsets[,i] rather than bivariate_offsets[i,].

Canaan_Breiss · June 26, 2023, 2:09pm

Hi @mike-lawrence , thanks a lot for these pointers. I guess I’m still quite stuck on how to translate the final block of code in SUG1.13 into something that I can use. Particularly, I need to have the sigma’s for each observation differ parametrically (these are my sigma_comparative and sigma_superlative). Right now I define these in a loop, but then I don’t know where to hook these in to the SUG1.13 code. Sorry to be such a novice about this, but would you mind suggesting how I can do that? Thanks a lot in advance.

For reference, my current code is:

// Define the beta-binomial regression model
data {
int<lower=0> N; // Number of adjectives
int<lower=0> y_comparative[N]; // Observed morphological comparative
int<lower=0> y_superlative[N]; // Observed morphological superlative

real FreqNonCentered_lemma[N]; // Lemma frequency (summed log freq of each)
real FreqNonCentered_comparative[N]; // Comparative frequency
real FreqNonCentered_superlative[N]; // Superlative frequency

int<lower=0> n_comparative[N]; // Total number of comparative forms
int<lower=0> n_superlative[N]; // Total number of superlative forms
int<lower=0> K; // Number of predictors
matrix[N, K] X; // Design matrix
matrix[2, N] mu; // Location parameter for bivariate_offsets
}

parameters {
vector[K] shared_betas; // Regression coefficients
real beta_freq;
real alpha;
matrix[2, N] bivariate_offsets;
real rho;
}

model {
vector[N] eta_comparative; // Probability of success for comparative
vector[N] eta_superlative; // Probability of success for superlative
vector[N] sigma_comparative; // Standard deviation for comparative
vector[N] sigma_superlative; // Standard deviation for superlative
matrix[2, 2] Sigmas[N]; // Covariance matrix for each observation
real jitter;
jitter = 1e5;
rho ~ uniform(-1, 1);

// Priors
shared_betas ~ normal(0, 10); // Prior for regression coefficients
alpha ~ normal(0, 10);
beta_freq ~ normal(0, 10);

// Calculate the standard deviations and covariance matrices
for (i in 1:N) {
sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);

Sigmas[i][1, 1] = square(sigma_comparative[i])+jitter;
Sigmas[i][2, 2] = square(sigma_superlative[i])+jitter;
Sigmas[i][1, 2] = sigma_superlative[i];
Sigmas[i][2, 1] = sigma_comparative[i];
bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);

}
// Priors for bivariate_offsets

// Calculate the probability of success for each observation
eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

// Likelihood
y_comparative ~ binomial_logit(n_comparative, eta_comparative);
y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

And I get a ton of errors about the covariance matrix not being symmetric, to the extent that the chains don’t actually ever start sampling.

mike-lawrence · June 27, 2023, 3:54pm

How’s this look? This employs the sds-varying-by-levels-of-N that you indicated you want but has a common correlation parameter across levels of N:

data {
	int<lower=0> N; // Number of adjectives
	int<lower=0> y_comparative[N]; // Observed morphological comparative
	int<lower=0> y_superlative[N]; // Observed morphological superlative

	real FreqNonCentered_lemma[N]; // Lemma frequency (summed log freq of each)
	real FreqNonCentered_comparative[N]; // Comparative frequency
	real FreqNonCentered_superlative[N]; // Superlative frequency

	int<lower=0> n_comparative[N]; // Total number of comparative forms
	int<lower=0> n_superlative[N]; // Total number of superlative forms
	int<lower=0> K; // Number of predictors
	matrix[N, K] X; // Design matrix
	row_vector[2] mu; // Location parameter for bivariate_offsets
}

parameters {
	vector[K] shared_betas; // Regression coefficients
	real beta_freq;
	real alpha;
	matrix[N,2] bivariate_offsets_helper;
	cholesky_factor_corr[2] Lcorr;
}
transformed parameters{
	vector[N] eta_comparative; // Probability of success for comparative
	vector[N] eta_superlative; // Probability of success for superlative
	matrix[N,2] sigmas;
	matrix[N,2] bivariate_offsets;
	{ // local environment to avoid saving-to-file bivariate_offsets_helper2
		// Calculate the standard deviations and covariance matrices
		matrix[N,2] bivariate_offsets_helper2 = Lcorr * bivariate_offsets_helper ;
		for (i in 1:N) {
			sigmas[i,1] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
			sigmas[i,2] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);
			bivariate_offsets[i,] = (
				bivariate_offsets_helper2[i,]
				+ sigmas[i,]
				+ mu
			) ;
		}
	}
	// Calculate the probability of success for each observation
	eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
	eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);
}
model {

	// Priors
	shared_betas ~ normal(0, 1); // Prior for regression coefficients
	alpha ~ normal(0, 1);
	beta_freq ~ normal(0, 1);
	to_vector(bivariate_offsets_helper) ~ std_normal() ;
	Lcorr ~ lkj_corr_cholesky(1) ;

	// Likelihood
	y_comparative ~ binomial_logit(n_comparative, eta_comparative);
	y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

generated quantities {
	matrix[2,2] Omega;
	Omega = multiply_lower_tri_self_transpose(Lcorr);
}

Canaan_Breiss · June 27, 2023, 5:00pm

Hi @mike-lawrence , thanks a lot, this looks potentially really good! I’m not quite sure what you did with the block here:

{ // local environment to avoid saving-to-file bivariate_offsets_helper2 // Calculate the standard deviations and covariance matrices matrix[N,2] bivariate_offsets_helper2 = Lcorr * bivariate_offsets_helper ; for (i in 1:N) { sigmas[i,1] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq); sigmas[i,2] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq); bivariate_offsets[i,] = ( bivariate_offsets_helper2[i,] + sigmas[i,] + mu ) ; } }

I assume this is basically the “noncentered parameterization” version of what I was thinking about? If you could clarify a little bit more that would be super helpful for me in learning how to build these kinds of models on my own going forward.

I tried fitting the model, with the R code as follows:


stan_model_file <- "./Models/from_stan_forums.stan"
# Read the data into R
N <- nrow(stan_data) # Number of observations
y_comparative <- round(stan_data$Count_COMP*stan_data$PercentMorph_COMP)+1 # Observed successes
y_superlative <- round(stan_data$Count_SUP*stan_data$PercentMorph_SUP)+1 # Observed successes

n_comparative <- stan_data$Count_COMP +1 # Total number of trials
n_superlative <- stan_data$Count_SUP +1 # Total number of trials

X <- stan_data %>%
select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL) %>%
as.matrix()
K <- ncol(stan_data %>% select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL)) # Number of predictors

FreqNonCentered_lemma <- log(stan_data$Count_COMP+stan_data$Count_SUP)
FreqNonCentered_superlative <- log(stan_data$Count_SUP)
FreqNonCentered_comparative <- log(stan_data$Count_COMP)
mu <- c(0,0)
# a_ulam$Freq_noncentered

# Create a list of data for STAN
stan_data_list <- list(N = N,
y_comparative = y_comparative,
y_superlative = y_superlative,
n_comparative = n_comparative,
n_superlative = n_superlative,
X = X,
K = K,
FreqNonCentered_lemma = FreqNonCentered_lemma,
FreqNonCentered_comparative = FreqNonCentered_comparative,
FreqNonCentered_superlative = FreqNonCentered_superlative,
mu=mu

)

# Compile the STAN model
model <- cmdstan_model(stan_model_file)

# Fit the model to the data
fit <- model$sample(data = stan_data_list, cores = 4, chains = 4, seed = 1)

But I got an error suggesting there was something off in the part that I don’t quite understand:

Topic		Replies	Views
Multivariate Hierarchical Model Modeling techniques , fitting-issues	7	980	April 7, 2021
Exception: S_n (cov_matrix) not symmetric for large value elements Modeling fitting-issues	10	103	May 5, 2025
Sampling covariance matrix with additional constrain Modeling	5	648	December 5, 2018
Model with Kronecker cov matrix Modeling	7	1022	July 11, 2018
Prior for Variance Covariance Matrix and Error obtained in RStan regarding that Modeling rstan	1	1014	April 27, 2021

Issue with covariance matrix in Stan

Related topics