Issue with covariance matrix in Stan

Hi, I’m extremely new to using covariance matrices in Stan, and I’m running into some errors. My code is as follows:

// Define the beta-binomial regression model
data {
  int<lower=0> N;                      // Number of adjectives
  int<lower=0> y_comparative[N];       // Observed morphological comparative
  int<lower=0> y_superlative[N];       // Observed morphological superlative

  real FreqNonCentered_lemma[N];       // Lemma frequency (summed log freq of each)
  real FreqNonCentered_comparative[N]; // Comparative frequency
  real FreqNonCentered_superlative[N]; // Superlative frequency

  int<lower=0> n_comparative[N];       // Total number of comparative forms
  int<lower=0> n_superlative[N];       // Total number of superlative forms
  int<lower=0> K;                      // Number of predictors
  matrix[N, K] X;                      // Design matrix
  matrix[2, N] mu;                     // Location parameter for bivariate_offsets
}

parameters {
  vector[K] shared_betas;              // Regression coefficients
  real beta_freq;
  real alpha;
  matrix[2, N] bivariate_offsets;
  real rho;
}

model {
  vector[N] eta_comparative;           // Probability of success for comparative
  vector[N] eta_superlative;           // Probability of success for superlative
  vector[N] sigma_comparative;         // Standard deviation for comparative
  vector[N] sigma_superlative;         // Standard deviation for superlative
  matrix[2, 2] Sigmas[N];              // Covariance matrix for each observation

  rho ~ uniform(-1, 1);

  // Priors
  shared_betas ~ normal(0, 10);        // Prior for regression coefficients
  alpha ~ normal(0, 10);
  beta_freq ~ normal(0, 10);

  // Calculate the standard deviations and covariance matrices
  for (i in 1:N) {
    sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
    sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);

    Sigmas[i][1, 1] = square(sigma_comparative[i]);
    Sigmas[i][2, 2] = square(sigma_superlative[i]);
    Sigmas[i][1, 2] = sigma_comparative[i] * rho * sigma_superlative[i];
    Sigmas[i][2, 1] = sigma_comparative[i] * rho * sigma_superlative[i];
    bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);

  }

  // Priors for bivariate_offsets

  // Calculate the probability of success for each observation
  eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
  eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

  // Likelihood
  y_comparative ~ binomial_logit(n_comparative, eta_comparative);
  y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}


My data are
data.csv (26.8 KB)
.

My R code is here:


stan_model_file <- "./Models/bivariate_stan.stan"
# Read the data into R
N <- nrow(stan_data)  # Number of observations
y_comparative <- round(stan_data$Count_COMP*stan_data$PercentMorph_COMP)+1  # Observed successes
y_superlative <- round(stan_data$Count_SUP*stan_data$PercentMorph_SUP)+1  # Observed successes

n_comparative <- stan_data$Count_COMP +1 # Total number of trials
n_superlative <- stan_data$Count_SUP +1 # Total number of trials

X <- stan_data %>% 
  select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL) %>% 
  as.matrix()
K <- ncol(stan_data %>%   select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL))  # Number of predictors

FreqNonCentered_lemma <- log(stan_data$Count_COMP+stan_data$Count_SUP)
FreqNonCentered_superlative <- log(stan_data$Count_SUP)
FreqNonCentered_comparative <- log(stan_data$Count_COMP)
mu <- matrix(0, nrow = 2, ncol = N)
 # a_ulam$Freq_noncentered

# Create a list of data for STAN
stan_data_list <- list(N = N, 
                  y_comparative = y_comparative, 
                  y_superlative = y_superlative, 
                  n_comparative = n_comparative, 
                  n_superlative = n_superlative, 
                  X = X, 
                  K = K, 
                  FreqNonCentered_lemma = FreqNonCentered_lemma,
                  FreqNonCentered_comparative = FreqNonCentered_comparative,
                  FreqNonCentered_superlative = FreqNonCentered_superlative,
                  mu=mu

                  )


# Compile the STAN model
model <- cmdstan_model(stan_model_file)

# Fit the model to the data
fit <- model$sample(data = stan_data_list, cores = 4, chains = 4, threads = 2, init = .1)


I’m getting errors related to the covariance matrix not being positive definite and not symmetric, and I’m not quite sure how to proceed. I realize this is probably something fairly trivial for an experienced user, but I’m new to this aspect of stan and so not sure where to go from here. Any help or suggestions would be very much appreciated!

Whenever making a covariance matrix by hand, add a small amount of “jitter” to the diagonal, where small is proportional to the magnitude of the values already there. I usually add 1e-5 if the diagonal is 1 and that seems to cause things to be PSD without noticeably affecting inference.

Hi @mike-lawrence , thanks a lot for your suggestion! I tried that, plus a few other small numerical-stability tricks, and the modified model block is below:


model {
  vector[N] eta_comparative;           // Probability of success for comparative
  vector[N] eta_superlative;           // Probability of success for superlative
  vector[N] sigma_comparative;         // Standard deviation for comparative
  vector[N] sigma_superlative;         // Standard deviation for superlative
  matrix[2, 2] Sigmas[N];              // Covariance matrix for each observation
  rho ~ uniform(-1, 1);

  // Priors
  shared_betas ~ normal(0, 10);        // Prior for regression coefficients
  alpha ~ normal(0, 10);
  beta_freq ~ normal(0, 10);

  // Calculate the standard deviations and covariance matrices
  for (i in 1:N) {
    sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
    sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);{
    real jitter;
    real temp;

    real off_diag;

    jitter = 1e5;
    temp = log_sum_exp(sigma_comparative[i],sigma_superlative[i]);

    off_diag = log_sum_exp(temp, rho);

    Sigmas[i][1, 1] = square(sigma_comparative[i])+jitter;
    Sigmas[i][2, 2] = square(sigma_superlative[i])+jitter;
    Sigmas[i][1, 2] = off_diag;
    Sigmas[i][2, 1] = off_diag;;
    bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);
}
  }

  // Priors for bivariate_offsets

  // Calculate the probability of success for each observation
  eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
  eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

  // Likelihood
  y_comparative ~ binomial_logit(n_comparative, eta_comparative);
  y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

And it compiles fine, but gives the following errors:

Chain 3 Rejecting initial value:
Chain 3 Log probability evaluates to log(0), i.e. negative infinity.
Chain 3 Stan can’t start sampling from this initial value.
Chain 3 Rejecting initial value:
Chain 3 Error evaluating the log probability at the initial value.
Chain 3 Exception: multi_normal_lpdf: Covariance matrix is not symmetric. Covariance matrix[1,2] = inf, but Covariance matrix[2,1] = inf (in ‘C:/Users/canaa/AppData/Local/Temp/Rtmp2dkYGM/model-c6083d154042.stan’, line 57, column 4 to column 60)

Line 57 implies that the issue is with the line bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]); - I’m not sure what to do here. Would you mind suggesting other possible solutions?

You can always ignore the “rejecting intitial value” warnings if the chains then go on to continue warm up and sampling without failing any other diagnostics.

Ah, sorry I meant that all four chains can’t start sampling at all, there doesn’t seem to be a good place to “catch on”. So it never starts sampling, unfortunately.

Could you clarify the motivation for expressing the off-diag element of the covariance as log( exp(s_c) + exp(s_s) )? I’m not quite following that math. Why not a standard representation of the covariance as a function of two variances and a correlation?

That’s a great question - I wrote the code trying to pattern-match off something I found online, so really there’s no justification. Should I try it as just the off-diagonals being s_c and s_s just to check? so something like:

for (i in 1:N) {
sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);{

Sigmas[i][1, 1] = square(sigma_comparative[i])+jitter;
Sigmas[i][2, 2] = square(sigma_superlative[i])+jitter;
Sigmas[i][1, 2] = sigma_superlative[i];
Sigmas[i][2, 1] = sigma_comparative[i];
bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);

}

Oh! Then you should just use the parameterizarion of the multivariate normal used in SUG1.13, which constructs the covariance (implicitly) for you.

Great! I’m having a bit of trouble constructing it though - does something like this look right? Any pointers would be appreciated - it’s giving me some syntax errors I’m not sure how to solve.

Current code:

// Define the beta-binomial regression model
data {
  int<lower=0> N;                      // Number of adjectives
  int<lower=0> y_comparative[N];       // Observed morphological comparative
  int<lower=0> y_superlative[N];       // Observed morphological superlative

  real FreqNonCentered_lemma[N];       // Lemma frequency (summed log freq of each)
  real FreqNonCentered_comparative[N]; // Comparative frequency
  real FreqNonCentered_superlative[N]; // Superlative frequency

  int<lower=0> n_comparative[N];       // Total number of comparative forms
  int<lower=0> n_superlative[N];       // Total number of superlative forms
  int<lower=0> K;                      // Number of predictors
  matrix[N, K] X;                      // Design matrix
  vector[2] mu;                     // Location parameter for bivariate_offsets
}

parameters {
  vector[K] shared_betas;              // Regression coefficients
  real beta_freq;
  real alpha;
  matrix[2, N] bivariate_offsets;
  cholesky_factor_corr[2] Lcorr;
  matrix[2,N] sigmas;
}

model {
  vector[N] eta_comparative;           // Probability of success for comparative
  vector[N] eta_superlative;           // Probability of success for superlative
  vector[N] sigma_comparative;         // Standard deviation for comparative
  vector[N] sigma_superlative;         // Standard deviation for superlative
  // Priors
  shared_betas ~ normal(0, 1);        // Prior for regression coefficients
  alpha ~ normal(0, 1);
  beta_freq ~ normal(0, 1);

  // Calculate the standard deviations and covariance matrices
  for (i in 1:N) {
    vector[2] sigma;

    sigma[1] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
    sigma[2] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);
    sigmas[i,] = sigma;

    bivariate_offsets[i,] ~ multi_normal(mu, diag_pre_multiply(sigma, Lcorr));


  }


  // Priors for bivariate_offsets

  // Calculate the probability of success for each observation
  eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
  eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

  // Likelihood
  y_comparative ~ binomial_logit(n_comparative, eta_comparative);
  y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

generated quantities {
  matrix[2,2] Omega;
  matrix[2,2] Sigma;
  Omega = multiply_lower_tri_self_transpose(Lcorr);
  Sigma = quad_form_diag(Omega, sigmas);
}'


edit: @mike-lawrence , any suggestions on how to proceed? Any specifics would be appreciated!

In your previous code you had separate correlations for each N but here you only have one; presumably unintentional?

Also, you’re still using multi_normal, but the final code in the SUG1.13 shows how to use just normal on a helper variable matrix that gets multiplied by the cholesky factor matrix (well, diag_ore_multiply’d by the sigmas, but I prefer to just multiply by the cholesky factor matrix and elt-multiply by rep_matrix(sigmas,N).

Finally, I think you want to assign the result to bivariate_offsets[,i] rather than bivariate_offsets[i,].

Hi @mike-lawrence , thanks a lot for these pointers. I guess I’m still quite stuck on how to translate the final block of code in SUG1.13 into something that I can use. Particularly, I need to have the sigma’s for each observation differ parametrically (these are my sigma_comparative and sigma_superlative). Right now I define these in a loop, but then I don’t know where to hook these in to the SUG1.13 code. Sorry to be such a novice about this, but would you mind suggesting how I can do that? Thanks a lot in advance.

For reference, my current code is:

// Define the beta-binomial regression model
data {
int<lower=0> N; // Number of adjectives
int<lower=0> y_comparative[N]; // Observed morphological comparative
int<lower=0> y_superlative[N]; // Observed morphological superlative

real FreqNonCentered_lemma[N]; // Lemma frequency (summed log freq of each)
real FreqNonCentered_comparative[N]; // Comparative frequency
real FreqNonCentered_superlative[N]; // Superlative frequency

int<lower=0> n_comparative[N]; // Total number of comparative forms
int<lower=0> n_superlative[N]; // Total number of superlative forms
int<lower=0> K; // Number of predictors
matrix[N, K] X; // Design matrix
matrix[2, N] mu; // Location parameter for bivariate_offsets
}

parameters {
vector[K] shared_betas; // Regression coefficients
real beta_freq;
real alpha;
matrix[2, N] bivariate_offsets;
real rho;
}

model {
vector[N] eta_comparative; // Probability of success for comparative
vector[N] eta_superlative; // Probability of success for superlative
vector[N] sigma_comparative; // Standard deviation for comparative
vector[N] sigma_superlative; // Standard deviation for superlative
matrix[2, 2] Sigmas[N]; // Covariance matrix for each observation
real jitter;
jitter = 1e5;
rho ~ uniform(-1, 1);

// Priors
shared_betas ~ normal(0, 10); // Prior for regression coefficients
alpha ~ normal(0, 10);
beta_freq ~ normal(0, 10);

// Calculate the standard deviations and covariance matrices
for (i in 1:N) {
sigma_comparative[i] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
sigma_superlative[i] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);

Sigmas[i][1, 1] = square(sigma_comparative[i])+jitter;
Sigmas[i][2, 2] = square(sigma_superlative[i])+jitter;
Sigmas[i][1, 2] = sigma_superlative[i];
Sigmas[i][2, 1] = sigma_comparative[i];
bivariate_offsets[,i] ~ multi_normal(mu[,i], Sigmas[i]);

}
// Priors for bivariate_offsets

// Calculate the probability of success for each observation
eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);

// Likelihood
y_comparative ~ binomial_logit(n_comparative, eta_comparative);
y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

And I get a ton of errors about the covariance matrix not being symmetric, to the extent that the chains don’t actually ever start sampling.

How’s this look? This employs the sds-varying-by-levels-of-N that you indicated you want but has a common correlation parameter across levels of N:

data {
	int<lower=0> N; // Number of adjectives
	int<lower=0> y_comparative[N]; // Observed morphological comparative
	int<lower=0> y_superlative[N]; // Observed morphological superlative

	real FreqNonCentered_lemma[N]; // Lemma frequency (summed log freq of each)
	real FreqNonCentered_comparative[N]; // Comparative frequency
	real FreqNonCentered_superlative[N]; // Superlative frequency

	int<lower=0> n_comparative[N]; // Total number of comparative forms
	int<lower=0> n_superlative[N]; // Total number of superlative forms
	int<lower=0> K; // Number of predictors
	matrix[N, K] X; // Design matrix
	row_vector[2] mu; // Location parameter for bivariate_offsets
}

parameters {
	vector[K] shared_betas; // Regression coefficients
	real beta_freq;
	real alpha;
	matrix[N,2] bivariate_offsets_helper;
	cholesky_factor_corr[2] Lcorr;
}
transformed parameters{
	vector[N] eta_comparative; // Probability of success for comparative
	vector[N] eta_superlative; // Probability of success for superlative
	matrix[N,2] sigmas;
	matrix[N,2] bivariate_offsets;
	{ // local environment to avoid saving-to-file bivariate_offsets_helper2
		// Calculate the standard deviations and covariance matrices
		matrix[N,2] bivariate_offsets_helper2 = Lcorr * bivariate_offsets_helper ;
		for (i in 1:N) {
			sigmas[i,1] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq);
			sigmas[i,2] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq);
			bivariate_offsets[i,] = (
				bivariate_offsets_helper2[i,]
				+ sigmas[i,]
				+ mu
			) ;
		}
	}
	// Calculate the probability of success for each observation
	eta_comparative = X * shared_betas + to_vector(bivariate_offsets[1, ]);
	eta_superlative = X * shared_betas + to_vector(bivariate_offsets[2, ]);
}
model {

	// Priors
	shared_betas ~ normal(0, 1); // Prior for regression coefficients
	alpha ~ normal(0, 1);
	beta_freq ~ normal(0, 1);
	to_vector(bivariate_offsets_helper) ~ std_normal() ;
	Lcorr ~ lkj_corr_cholesky(1) ;

	// Likelihood
	y_comparative ~ binomial_logit(n_comparative, eta_comparative);
	y_superlative ~ binomial_logit(n_superlative, eta_superlative);
}

generated quantities {
	matrix[2,2] Omega;
	Omega = multiply_lower_tri_self_transpose(Lcorr);
}

Hi @mike-lawrence , thanks a lot, this looks potentially really good! I’m not quite sure what you did with the block here:

{ // local environment to avoid saving-to-file bivariate_offsets_helper2 // Calculate the standard deviations and covariance matrices matrix[N,2] bivariate_offsets_helper2 = Lcorr * bivariate_offsets_helper ; for (i in 1:N) { sigmas[i,1] = exp(alpha + FreqNonCentered_comparative[i] * beta_freq); sigmas[i,2] = exp(alpha + FreqNonCentered_superlative[i] * beta_freq); bivariate_offsets[i,] = ( bivariate_offsets_helper2[i,] + sigmas[i,] + mu ) ; } }

I assume this is basically the “noncentered parameterization” version of what I was thinking about? If you could clarify a little bit more that would be super helpful for me in learning how to build these kinds of models on my own going forward.

I tried fitting the model, with the R code as follows:


stan_model_file <- "./Models/from_stan_forums.stan"
# Read the data into R
N <- nrow(stan_data) # Number of observations
y_comparative <- round(stan_data$Count_COMP*stan_data$PercentMorph_COMP)+1 # Observed successes
y_superlative <- round(stan_data$Count_SUP*stan_data$PercentMorph_SUP)+1 # Observed successes

n_comparative <- stan_data$Count_COMP +1 # Total number of trials
n_superlative <- stan_data$Count_SUP +1 # Total number of trials

X <- stan_data %>%
select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL) %>%
as.matrix()
K <- ncol(stan_data %>% select(mon_er,l_more,i_er,r_er,cluster_more,finalStress_more,initialCluster_est,sibfinal_most,rInitial_er,Log_Freq_HAL)) # Number of predictors

FreqNonCentered_lemma <- log(stan_data$Count_COMP+stan_data$Count_SUP)
FreqNonCentered_superlative <- log(stan_data$Count_SUP)
FreqNonCentered_comparative <- log(stan_data$Count_COMP)
mu <- c(0,0)
# a_ulam$Freq_noncentered

# Create a list of data for STAN
stan_data_list <- list(N = N,
y_comparative = y_comparative,
y_superlative = y_superlative,
n_comparative = n_comparative,
n_superlative = n_superlative,
X = X,
K = K,
FreqNonCentered_lemma = FreqNonCentered_lemma,
FreqNonCentered_comparative = FreqNonCentered_comparative,
FreqNonCentered_superlative = FreqNonCentered_superlative,
mu=mu

)

# Compile the STAN model
model <- cmdstan_model(stan_model_file)

# Fit the model to the data
fit <- model$sample(data = stan_data_list, cores = 4, chains = 4, seed = 1)

But I got an error suggesting there was something off in the part that I don’t quite understand: