Defining an unconventional residual covariance for a multivariate response model


#1

I’m working on a multivariate normal model with dyadic data, which therefore requires an unconventional covariance matrix for the residuals. The crux of the model is a covariance matrix that looks like this:

     [ s1  c12  r1  r12 ]
     [ c12  s2  r12  r2 ]
     [ r1  r12  s1  c12 ]
     [ r12  r2  c12  s2 ]

It wasn’t clear how and where I should define this covariance structure, so I began by toying with the code generated by brms and some ideas that @richard_mcelreath had used for another model that we are developing.

I ended up trying to define this covariance in the transformed parameters block:

parameters { 
  real response_1_Intercept; 
  real response_2_Intercept;
  
  real<lower=0> sigma_response_1;  // residual SD for response 1
  real<lower=0> sigma_response_2;  // residual SD for response 2
    
  // Covariances
  real<lower=-1,upper=1> rho_c12;
  real<lower=-1,upper=1> rho_r1;
  real<lower=-1,upper=1> rho_r2;
  real<lower=-1,upper=1> rho_r12;
} 

transformed parameters { 

  // residual-level effects and covariances
  	matrix [4,4] RHO;
	matrix [4,4] Lrescor;

 // Transform residuals from residual matrix
    RHO[1,1] = sigma_response_1;
    RHO[1,2] = rho_c12;
    RHO[1,3] = rho_r1;
    RHO[1,4] = rho_r12;
    RHO[2,1] = rho_c12;
    RHO[2,2] = sigma_response_2;
    RHO[2,3] = rho_r12;
    RHO[2,4] = rho_r2;
    RHO[3,1] = rho_r1;
    RHO[3,2] = rho_r12;
    RHO[3,3] = sigma_response_1;
    RHO[3,4] = rho_c12;
    RHO[4,1] = rho_r12;
    RHO[4,2] = rho_r2;
    RHO[4,3] = rho_c12;
    RHO[4,4] = sigma_response_2;
    
    Lrescor = cholesky_decompose(RHO);

} 

But I’m pretty sure that I’m being tripped up by the Cholesky transformation. The model returns simulated parameters when data are simulated with variances of 1 for both responses, but any other simulated variances lead to biased parameters.

One hunch is that maybe there’s a diag_pre_multiply() that I’ve neglected.

I would greatly appreciate some guidance here.


#2

Also, I should probably include full simulation and model code.

First, the simulation:


## Simulating data for multiple dyadic responses
library(mvtnorm)
library(MASS)

nodes <- data.frame(row.names = 1:300)
nodes$i_ID <- seq(1:300)
nodes$j_ID <- nodes$i_ID
nodes$k_ID <- rep(1:15, each = 20)
d.full <- expand.grid(nodes$i_ID, nodes$j_ID)
colnames(d.full) <-  c("i_ID", "j_ID")
d.nodes <- subset (d.full, i_ID != j_ID)
d.nodes$k_ID <- nodes$k_ID [ match (d.nodes$i_ID, nodes$i_ID)]
d.nodes$l_ID <- nodes$k_ID [ match (d.nodes$j_ID, nodes$j_ID)]
d <- subset (d.nodes, k_ID == l_ID)
d <- subset (d, select = -(l_ID))
d$first.node <- ifelse( d$i_ID < d$j_ID, paste (d$i_ID), paste(d$j_ID))
d$second.node <- ifelse( d$i_ID > d$j_ID, paste (d$i_ID), paste(d$j_ID))
d$ij_ID <- paste(d$first.node, "_", d$second.node, sep = "")
d <- subset (d, select = -c(first.node, second.node))
d$ij_dyad_ID <- match (d$ij_ID, unique(d$ij_ID))
d <- subset (d, select = -(ij_ID))
d <- subset (d, i_ID < j_ID)
d <- d[order(d$i_ID),]
row.names(d) <- d$ij_dyad_ID

## Prepping dyad-level random effects -- that is, the residuals
N <- nrow(d) 

SIGMA <- matrix (NA, nrow = 4, ncol = 4)

    # covariance matrix is defined by:
    # [ s1 c12 r1  r12 ]
    # [    s2  r12 r2  ]
    # [        s1  c12 ]
    # [            s2  ]

s1 <- 1.5
s2 <- .5
c12 <- s1 * s2 * -0.5
r1 <- s1 * s1 * 0.5
r2 <- s2 * s2 * 0.5
r12 <- s1 * s2 * -0.5

SIGMA[1,] <- c(s1, c12, r1, r12)
SIGMA[2,] <- c(c12, s2, r12, r2)
SIGMA[3,] <- c(r1, r12, s1, c12)
SIGMA[4,] <- c(r12, r2, c12, s2)

v_e <- rmvnorm(N, mean = c(0,0,0,0), sigma = SIGMA) 

## Set intercept
a1 <- 5
a2 <- 6

## Make i to j response, outcome 1
Y1 <- rep (NA, N)
for ( i in 1:N ) {
Y1[i] <- with ( d ,
	{ a1 +
		v_e [ ij_dyad_ID[i] , 1 ]
	}
	)
}

## Make i to j response, outcome 2
Y2 <- rep (NA, N) 
for ( i in 1:N ) {
Y2[i] <- with ( d ,
	{ a2 +
		v_e [ ij_dyad_ID[i] , 2 ]
	}
	)
}

## Make j to i response, outcome 1
Y3 <- rep (NA, N)
for ( i in 1:N ) {
Y3[i] <- with ( d ,
	{ a1 +
		v_e [ ij_dyad_ID[i] , 3 ]
	}
	)
}

## Make j to i response, outcome 1
Y4 <- rep (NA, N)
for ( i in 1:N ) {
Y4[i] <- with ( d ,
	{ a2 +
		v_e [ ij_dyad_ID[i] , 4 ]
	}
	)
}

d$Y1 <- Y1
d$Y2 <- Y2
d$Y3 <- Y3
d$Y4 <- Y4

dat_list <- list (
 N = N,
 Y1 = d$Y1,
 Y2 = d$Y2,
 Y3 = d$Y3,
 Y4 = d$Y4,
 nresp = 4,
 ij_dyad_ID = d$ij_dyad_ID
)

And the current model code is:

model_code <- "
data { 
  int<lower=1> N;  // total number of observations 
  real Y1[N];  // response variable 
  real Y2[N];  // response variable 
  real Y3[N];  // response variable 
  real Y4[N];  // response variable 
  int<lower=1> nresp;  // number of responses
  
} 

transformed data { 
  vector[nresp] Y[N];  // response matrix
  for (n in 1:N) {
    Y[n] = [Y1[n], Y2[n], Y3[n], Y4[n]]';
  }
} 
parameters { 
  real response_1_Intercept; 
  real response_2_Intercept;
  
  real<lower=0> sigma_response_1;  // residual SD for response 1
  real<lower=0> sigma_response_2;  // residual SD for response 2
    
  // Covariances
  real<lower=-1,upper=1> rho_c12;
  real<lower=-1,upper=1> rho_r1;
  real<lower=-1,upper=1> rho_r2;
  real<lower=-1,upper=1> rho_r12;

} 


transformed parameters { 

  // residual-level effects and covariances
  	matrix [4,4] RHO;
	matrix [4,4] Lrescor;

 // Transform residuals from residual matrix
    RHO[1,1] = sigma_response_1;
    RHO[1,2] = rho_c12;
    RHO[1,3] = rho_r1;
    RHO[1,4] = rho_r12;
    RHO[2,1] = rho_c12;
    RHO[2,2] = sigma_response_2;
    RHO[2,3] = rho_r12;
    RHO[2,4] = rho_r2;
    RHO[3,1] = rho_r1;
    RHO[3,2] = rho_r12;
    RHO[3,3] = sigma_response_1;
    RHO[3,4] = rho_c12;
    RHO[4,1] = rho_r12;
    RHO[4,2] = rho_r2;
    RHO[4,3] = rho_c12;
    RHO[4,4] = sigma_response_2;
    
    Lrescor = cholesky_decompose(RHO);

} 

model { 
  vector[N] mu_Y1 = rep_vector(0, N) + response_1_Intercept; 
  vector[N] mu_Y2 = rep_vector(0, N) + response_2_Intercept; 
  vector[N] mu_Y3 = rep_vector(0, N) + response_1_Intercept; 
  vector[N] mu_Y4 = rep_vector(0, N) + response_2_Intercept; 
  
  // Defining Mu and sigma vectors
  vector[nresp] Mu[N]; 
  vector[nresp] sigma = [sigma_response_1, sigma_response_2, sigma_response_1, sigma_response_2]';
  
  // cholesky factor of residual covariance matrix 
  matrix[nresp, nresp] LSigma = diag_pre_multiply(sigma, Lrescor); 
  for (n in 1:N) { 
    Mu[n] = [mu_Y1[n], mu_Y2[n], mu_Y3[n], mu_Y4[n]]';
  } 
  
  // priors including all constants 
  target += student_t_lpdf(response_1_Intercept | 3, 5, 10); 
  target += student_t_lpdf(sigma_response_1 | 3, 0, 10)
    - 1 * student_t_lccdf(0 | 3, 0, 10); 
  target += student_t_lpdf(response_2_Intercept | 3, 6, 10); 
  target += student_t_lpdf(sigma_response_2 | 3, 0, 10)
    - 1 * student_t_lccdf(0 | 3, 0, 10); 

  target += lkj_corr_cholesky_lpdf(Lrescor | 1); 
  
  // likelihood including all constants 
  { 
    target += multi_normal_cholesky_lpdf(Y | Mu, LSigma); 
  } 
} 
"

#3

The challenge is going to be ensuring that such a 4\times 4 structured covariance matrix is positive definite. One way to think about it is that you have a 2 \times 2 submatrix
[ s1 c12 ]
[ c12 s2 ]
in the top left and bottom right that also has to be positive definite, so you could declare that in Stan with

parameters {
  cov_matrix[2,2] top_left;
}

or by scaling a (Cholesky factor of a) correlation matrix. That leaves, r1, r12, and r2, but only some combinations of those three parameters imply that the 4 \times 4 covariance matrix is positive definite. According to


and specializing Wikipedia’s notation such that \mathbf{A} = \mbox{top_left} = \mathbf{C} and \mathbf{B} is

[ r1  r12 ]
[ r12  r2 ]

then the whole 4 \times 4 matrix is positive definite, if and only if \mathbf{A} - \mathbf{B}^\top \mathbf{A}^{-1} \mathbf{B} is positive definite, which is to say its eigenvalues are both positive.

In situations like this, it is best to declare r1 and r2 to be elements of a simplex and a positive scale parameter, s, in order to create a single inequality condition on s, like

simplex[2] r;
real r12_unscaled;
real<lower=0, upper = ub(top_left[1,1], top_left[2,1], top_left[2,2], r[1], r12_unscaled, r[2])> s;

Then, we need a user-defined function called ub to compute the upper bound on s, such that the eigenvalues of the Schur complement are positive, which is given by Wolfram Alpha.
You are going to need to use Stan’s algebraic equation(s) solver in order to find the value of s such that the smaller of the two eigenvalues is zero, but s = 0 is an admissible starting value. It would be like

functions {
  vector eigenvalue(real s, vector theta, real [] x_r, int [] x_i) {
    // unpack theta and copy equation from Wolfram Alpha
  }

  real ub(real s_1, real c_12, real s_2, real r_1, real r12_unscaled, real r_2) {
    real x_r[0];
    real x_i[0];
    vector[6] theta = [s_1, c_12, s_2, r_1, r12_unscaled, r_2]';
    return algebra_solver(eigenvalue, rep_vector(0, 1), theta, x_r, x_i)[1];
  }
}

Then, you can construct your covariance matrix in the transformed parameters block as

transformed parameters {
  matrix[4, 4] Sigma;
  Sigma[1:2, 1:2] = top_left;
  Sigma[3:4, 3:4] = top_left;
  {
    matrix[2,2] B = [ [r[1], r12_unscaled], [r12_unscaled, r[2]] ] * s;
    Sigma[1:2, 3:4] = B;
    Sigma[3:4, 1:2] = B;
  }
}

#4

Ben, I’ve heard from others on this forum that you have worked magic on their faulty code, so I was glad to see your name atop the first reply to my query. Everything you say about maintaining a positive definitive matrix makes sense in principle. However, I confess that in terms of putting everything together, I’m still unsure how the parts fit.

I’ll copy and paste the current version of the code, which basically entailed pasting your suggestions into what seemed like the appropriate place. It’s likely that I haven’t followed through on your naming conventions, and I’ll work on that now. But I also wanted to get this back on your radar quickly in case you were able to devote more attention to it this evening.

model_code <- "
data { 
  int<lower=1> N;  // total number of observations 
  real Y1[N];  // response variable 
  real Y2[N];  // response variable 
  real Y3[N];  // response variable 
  real Y4[N];  // response variable 
  int<lower=1> nresp;  // number of responses
  
} 

transformed data { 
  vector[nresp] Y[N];  // response matrix
  for (n in 1:N) {
    Y[n] = [Y1[n], Y2[n], Y3[n], Y4[n]]';
  }
} 
parameters { 
  real response_1_Intercept; 
  real response_2_Intercept;
  
  real<lower=0> sigma_response_1;  // residual SD for response 1
  real<lower=0> sigma_response_2;  // residual SD for response 2
  
  cov_matrix[2] top_left; // Changed from original Ben Goodrich edit, per page 43 in Stan manual
  
 	simplex[2] r;
	real r12_unscaled;
	real<lower=0, upper = ub(top_left[1,1], top_left[2,1], top_left[2,2], r[1], r12_unscaled, r[2])> s;
  
  // Old code commented out after Ben feedback
  //real<lower=-1,upper=1> rho_c12;
  //real<lower=-1,upper=1> rho_r1;
  //real<lower=-1,upper=1> rho_r2;
  //real<lower=-1,upper=1> rho_r12;
} 

functions {
  vector eigenvalue(real s, vector theta, real [] x_r, int [] x_i) {
    // unpack theta and copy equation from Wolfram Alpha
  }

  real ub(real s_1, real c_12, real s_2, real r_1, real r12_unscaled, real r_2) {
    real x_r[0];
    real x_i[0];
    vector[6] theta = [s_1, c_12, s_2, r_1, r12_unscaled, r_2]';
    return algebra_solver(eigenvalue, rep_vector(0, 1), theta, x_r, x_i)[1];
  }
}

transformed parameters {
  matrix[4, 4] Sigma;
  Sigma[1:2, 1:2] = top_left;
  Sigma[3:4, 3:4] = top_left;
  {
    matrix[2,2] B = [ [r[1], r12_unscaled], [r12_unscaled, r[2]] ] * s;
    Sigma[1:2, 3:4] = B;
    Sigma[3:4, 1:2] = B;
  }
}


model { 
  vector[N] mu_Y1 = rep_vector(0, N) + response_1_Intercept; 
  vector[N] mu_Y2 = rep_vector(0, N) + response_2_Intercept; 
  vector[N] mu_Y3 = rep_vector(0, N) + response_1_Intercept; 
  vector[N] mu_Y4 = rep_vector(0, N) + response_2_Intercept; 
  
  // Defining Mu and sigma vectors
  vector[nresp] Mu[N]; 
  vector[nresp] sigma = [sigma_response_1, sigma_response_2, sigma_response_1, sigma_response_2]';
  
  // cholesky factor of residual covariance matrix 
  matrix[nresp, nresp] LSigma = diag_pre_multiply(sigma, Lrescor); 
  for (n in 1:N) { 
    Mu[n] = [mu_Y1[n], mu_Y2[n], mu_Y3[n], mu_Y4[n]]';
  } 
  
  // priors including all constants 
  target += student_t_lpdf(response_1_Intercept | 3, 5, 10); 
  target += student_t_lpdf(sigma_response_1 | 3, 0, 10)
    - 1 * student_t_lccdf(0 | 3, 0, 10); 
  target += student_t_lpdf(response_2_Intercept | 3, 6, 10); 
  target += student_t_lpdf(sigma_response_2 | 3, 0, 10)
    - 1 * student_t_lccdf(0 | 3, 0, 10); 

  target += lkj_corr_cholesky_lpdf(Lrescor | 1); 
  
  // likelihood including all constants 
  { 
    target += multi_normal_cholesky_lpdf(Y | Mu, LSigma); 
  } 
} 
"


#5

The functions block has to at the top of the Stan program. And you have to fill in the eigenvalue function based on the output from Wolfram Alpha.