Cholesky Decomposition with Ordinal Outcome

I’m trying to gain a better understanding of correlation calculations in Stan and am wondering what the correlation matrix represents in a model of an ordinal outcome. In the model below, am I calculating the correlation between Dimensions (Dim1, Dim2, Dim3) on the latent (linear predictor) scale? Or, since the outcomes are ordinal, am I calculating polychoric correlations between dimensions?

Or am I way off and it’s neither of these?

# List required packages
Pkgs <- c("tidyverse", 
          "parallel", 
          "rstan",
          "fabricatr")

# Load packages
lapply(Pkgs, require, character.only = TRUE)

## Set computing options
ncores = detectCores()
rstan_options(auto_write = TRUE)

# Create data 
df <- fabricate(
  N = 50,
  SubjID = rep(1:10, 5),
  Group = rep(rep(LETTERS[1:2], each = 5),5),
  latent = draw_normal_icc(mean = 0, N = N, clusters = SubjID, ICC = 0.7),        
  Dim1 = draw_likert(x = latent, type = 5),
  Dim2 = draw_likert(x = latent + 0.25, type = 5),
  Dim3 = draw_likert(x = latent - 0.25, type = 5)
) %>%
  select(-ID, -latent) %>%
  pivot_longer(Dim1:Dim3, names_to = "Dim", values_to = "Response") %>%
  mutate(SubjID = factor(SubjID),
         Group = factor(Group),
         Response = factor(Response, ordered = TRUE))

## Design matrix for the population-level effects
X <- model.matrix(Response ~ Dim, data = df)

## Make a list for Stan
data <- list(N = nrow(df),
             Y = as.numeric(df$Response),
             X = X,
             nthres = 4,
             K = ncol(X),
             N_1 = length(unique(df$SubjID)),
             M_1 = 1,
             J_1 = as.numeric(df$SubjID),
             Z_1_1 = X[,1],
             N_2 = length(unique(df$SubjID)),
             M_2 = length(unique(df$Dim)),
             NC_2 = choose(length(unique(df$Dim)), 2), # number of pairwise correlations
             J_2 = as.numeric(df$SubjID),
             Z_2_1 = ifelse(df$Dim == "Dim1", 1, 0),
             Z_2_2 = ifelse(df$Dim == "Dim2", 1, 0),
             Z_2_3 = ifelse(df$Dim == "Dim3", 1, 0)
)

Model

model_code = "
functions {
 // compute correlated group-level effects
  matrix scale_r_cor(matrix z, vector SD, matrix L) {
    // r is stored in another dimension order than z
    return transpose(diag_pre_multiply(SD, L) * z);
  }
  // cumulative-logit log-PDF for a single response
   real cumulative_logit_lpmf(int y, real mu, real disc, vector thres) {
     int nthres = num_elements(thres);
     if (y == 1) {
       return log_inv_logit(disc * (thres[1] - mu));
     } else if (y == nthres + 1) {
       return log1m_inv_logit(disc * (thres[nthres] - mu));
     } else {
       return log_diff_exp(
         log_inv_logit(disc * (thres[y] - mu)), 
         log_inv_logit(disc * (thres[y - 1] - mu))
       );
     }
   }
  // cumulative-logit log-PDF for a single response and merged thresholds
   real cumulative_logit_merged_lpmf(int y, real mu, real disc, vector thres, int[] j) {
     return cumulative_logit_lpmf(y | mu, disc, thres[j[1]:j[2]]);
   }
  // ordered-logistic log-PDF for a single response and merged thresholds
   real ordered_logistic_merged_lpmf(int y, real mu, vector thres, int[] j) {
     return ordered_logistic_lpmf(y | mu, thres[j[1]:j[2]]);
   }
}
data {
  int<lower=1> N;  // total number of observations
  int Y[N];  // response variable
  int<lower=2> nthres;  // number of thresholds
  int<lower=1> K;  // number of population-level effects
  matrix[N, K] X;  // population-level design matrix
  // data for group-level effects of ID 1
  int<lower=1> N_1;  // number of grouping levels
  int<lower=1> M_1;  // number of coefficients per level
  int<lower=1> J_1[N];  // grouping indicator per observation
  // group-level predictor values
  vector[N] Z_1_1;
  // data for group-level effects of ID 2
  int<lower=1> N_2;  // number of grouping levels
  int<lower=1> M_2;  // number of coefficients per level
  int<lower=1> J_2[N];  // grouping indicator per observation
  // group-level predictor values
  vector[N] Z_2_1;
  vector[N] Z_2_2;
  vector[N] Z_2_3;
  int<lower=1> NC_2;  // number of group-level correlations
}
transformed data {
}
parameters {
  vector[K] b;  // population-level effects
  ordered[nthres] Intercept;  // temporary thresholds for centered predictors
  vector<lower=0>[M_1] sd_1;  // group-level standard deviations
  vector[N_1] z_1[M_1];  // standardized group-level effects
  vector<lower=0>[M_2] sd_2;  // group-level standard deviations
  matrix[M_2, N_2] z_2;  // standardized group-level effects
  cholesky_factor_corr[M_2] L_2;  // cholesky factor of correlation matrix
}
transformed parameters {
  real<lower=0> disc = 1;  // discrimination parameters
  vector[N_1] r_1_1;  // actual group-level effects
  matrix[N_2, M_2] r_2;  // actual group-level effects
  // using vectors speeds up indexing in loops
  vector[N_2] r_2_1;
  vector[N_2] r_2_2;
  vector[N_2] r_2_3;
  r_1_1 = (sd_1[1] * (z_1[1]));
  // compute actual group-level effects
  r_2 = scale_r_cor(z_2, sd_2, L_2);
  r_2_1 = r_2[, 1];
  r_2_2 = r_2[, 2];
  r_2_3 = r_2[, 3];
}
model {
  // likelihood not including constants
    // initialize linear predictor term
    vector[N] mu = X * b;
    for (n in 1:N) {
      // add more terms to the linear predictor
      mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_2_2[J_2[n]] * Z_2_2[n] + r_2_3[J_2[n]] * Z_2_3[n];
    }
    for (n in 1:N) {
      target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
    }
  // priors not including constants
  target += student_t_lupdf(Intercept | 3, 0, 2.5);
  target += student_t_lupdf(sd_1 | 3, 0, 2.5);
  target += std_normal_lupdf(z_1[1]);
  target += student_t_lupdf(sd_2 | 3, 0, 2.5);
  target += std_normal_lupdf(to_vector(z_2));
  target += lkj_corr_cholesky_lupdf(L_2 | 1);
}
generated quantities {
  // compute actual thresholds
  vector[nthres] b_Intercept = Intercept;
  // compute group-level correlations
  corr_matrix[M_2] Cor_2 = multiply_lower_tri_self_transpose(L_2);
  vector<lower=-1,upper=1>[NC_2] cor_2;
  // extract upper diagonal of correlation matrix
  for (k in 1:M_2) {
    for (j in 1:(k - 1)) {
      cor_2[choose(k - 1, 2) + j] = Cor_2[j, k];
    }
  }
}
"

From a brief scan of your model, based on this section:

    vector[N] mu = X * b;
    for (n in 1:N) {
      // add more terms to the linear predictor
      mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_2_2[J_2[n]] * Z_2_2[n] + r_2_3[J_2[n]] * Z_2_3[n];
    }
    for (n in 1:N) {
      target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
    }

It looks like you’re modelling the data using correlated random intercepts for each outcome. Under this model the same cutpoints and regression coefficients are used for every outcome, but the latent scale is shifted by a slightly different amount for each item.

This model essentially assumes that the same latent variable underlies each likert outcome, but the individual’s position is offset by a different amount depending on which likert item is the outcome, and that the amount of this offset is correlated with the amount of offset on other items.
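Written informally in the notation of your Stan code (for observation n on subject j = J_1[n] = J_2[n], where d is the item picked out by the Z_2_* indicators; this is a sketch, not literal Stan):

  // same coefficients b and same cutpoints Intercept for every item
  mu[n] = X[n] * b + r_1_1[j]   // shared subject-level shift
                   + r_2_d[j]   // additional item-specific shift for subject j
  Y[n] ~ ordered_logistic(mu[n], Intercept)
  // the item-specific shifts (r_2_1[j], r_2_2[j], r_2_3[j]) are modelled as correlated within subject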

That’s if I’m reading the model right, though; happy to be corrected.

That’s generally what I’m trying to model by grouping responses by SubjID with flexible cutpoints on the latent distribution.

Does this mean the correlation decomposition reflects the relationship between cutpoints on the latent distribution within each SubjID?

You’re assuming the same cutpoints for every outcome for every individual:

  ordered[nthres] Intercept;  // temporary thresholds for centered predictors
 
 ...
  
  for (n in 1:N) {
    target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
  }

So I don’t think there’s a relationship to reflect

Then what do the correlation parameters represent?

The correlation between the offsets to the location parameter.

So you have:

r_2 = scale_r_cor(z_2, sd_2, L_2)

Which is the same as saying that each row of r_2 (the vector of item offsets for one subject) is drawn as:

  r_2[j] ~ multi_normal_cholesky(0, diag_pre_multiply(sd_2, L_2))

So the r_2 are the random intercepts for each outcome, which are specified to be correlated, and L_2 is the Cholesky factor of their correlation matrix.
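To make the link explicit, the implied covariance of those item offsets is

  Sigma_2 = diag_matrix(sd_2) * (L_2 * L_2') * diag_matrix(sd_2)

so L_2 * L_2' is their correlation matrix; that’s exactly what multiply_lower_tri_self_transpose(L_2) computes in the generated quantities block and reports as Cor_2.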

These intercepts change the ‘starting point’ on the latent response variable for each outcome. So while the cutpoint parameters themselves are not different between the outcomes, the individual would need to change by a different amount on each latent variable to exceed the same threshold.

So in a way you could say that it’s a flexible cutpoint model, because the amount of change needed is different, but my inner pedant disagrees because the cutpoints are *technically* identical

Does this make the correlation estimates more meaningful?

model_code = "
functions {
 // compute correlated group-level effects
  matrix scale_r_cor(matrix z, vector SD, matrix L) {
    // r is stored in another dimension order than z
    return transpose(diag_pre_multiply(SD, L) * z);
  }
  // cumulative-logit log-PDF for a single response
   real cumulative_logit_lpmf(int y, real mu, real disc, vector thres) {
     int nthres = num_elements(thres);
     if (y == 1) {
       return log_inv_logit(disc * (thres[1] - mu));
     } else if (y == nthres + 1) {
       return log1m_inv_logit(disc * (thres[nthres] - mu));
     } else {
       return log_diff_exp(
         log_inv_logit(disc * (thres[y] - mu)), 
         log_inv_logit(disc * (thres[y - 1] - mu))
       );
     }
   }
  // cumulative-logit log-PDF for a single response and merged thresholds
   real cumulative_logit_merged_lpmf(int y, real mu, real disc, vector thres, int[] j) {
     return cumulative_logit_lpmf(y | mu, disc, thres[j[1]:j[2]]);
   }
  // ordered-logistic log-PDF for a single response and merged thresholds
   real ordered_logistic_merged_lpmf(int y, real mu, vector thres, int[] j) {
     return ordered_logistic_lpmf(y | mu, thres[j[1]:j[2]]);
   }
}
data {
  int<lower=1> N;  // total number of observations
  int Y[N];  // response variable
  int<lower=2> nthres;  // number of thresholds
  int<lower=1> K;  // number of population-level effects
  matrix[N, K] X;  // population-level design matrix
  // data for group-level effects of ID 1
  int<lower=1> N_1;  // number of grouping levels
  int<lower=1> M_1;  // number of coefficients per level
  int<lower=1> J_1[N];  // grouping indicator per observation
  // group-level predictor values
  vector[N] Z_1_1;
  vector[N] Z_1_2;
  vector[N] Z_1_3;
  int<lower=1> NC_1;  // number of group-level correlations
  int prior_only;  // should the likelihood be ignored?
}
parameters {
  vector[K] b;  // population-level effects
  ordered[nthres] Intercept;  // temporary thresholds for centered predictors
  vector<lower=0>[M_1] sd_1;  // group-level standard deviations
  matrix[M_1, N_1] z_1;  // standardized group-level effects
  cholesky_factor_corr[M_1] L_1;  // cholesky factor of correlation matrix
}
transformed parameters {
  real<lower=0> disc = 1;  // discrimination parameters
  matrix[N_1, M_1] r_1;  // actual group-level effects
  // using vectors speeds up indexing in loops
  vector[N_1] r_1_1;
  vector[N_1] r_1_2;
  vector[N_1] r_1_3;
  // compute actual group-level effects
  r_1 = scale_r_cor(z_1, sd_1, L_1);
  r_1_1 = r_1[, 1];
  r_1_2 = r_1[, 2];
  r_1_3 = r_1[, 3];
}
model {
  // likelihood not including constants
    // initialize linear predictor term
    vector[N] mu = X * b;
    for (n in 1:N) {
      // add more terms to the linear predictor
      mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_1_2[J_1[n]] * Z_1_2[n] + r_1_3[J_1[n]] * Z_1_3[n];
    }
    for (n in 1:N) {
      target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
    }
  // priors not including constants
  target += student_t_lupdf(Intercept | 3, 0, 2.5);
  target += student_t_lupdf(sd_1 | 3, 0, 2.5);
  target += std_normal_lupdf(to_vector(z_1));
  target += lkj_corr_cholesky_lupdf(L_1 | 1);
}
generated quantities {
  // compute actual thresholds
  vector[nthres] b_Intercept = Intercept;
  // compute group-level correlations
  corr_matrix[M_1] Cor_1 = multiply_lower_tri_self_transpose(L_1);
  vector<lower=-1,upper=1>[NC_1] cor_1;
  // extract upper diagonal of correlation matrix
  for (k in 1:M_1) {
    for (j in 1:(k - 1)) {
      cor_1[choose(k - 1, 2) + j] = Cor_1[j, k];
    }
  }
}
"

Can you explain what you’ve changed and what you’re aiming for in terms of ‘meaningful’?

I’m attempting to estimate the correlations between outcomes (M_1/NC_1). After your extremely helpful replies, I think what I’m estimating is the correlation between outcomes on the latent scale (i.e., all outcomes share the same latent scale).

In the first model (I think) I have parameters representing the correlation between each dimension outcome within subject:

"cor_SubjId__DimDim1__DimDim2"
"cor_SubjId__DimDim1__DimDim3"
"cor_SubjId__DimDim2__DimDim3" 

After further reading, I see that I was misunderstanding what Cholesky factors represent. Now I see that the output from lkj_corr_cholesky is not a correlation matrix, but only the Cholesky factor of a correlation matrix.
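In other words, the correlation matrix itself is L_1 %*% t(L_1), which is what Cor_1 in the generated quantities block computes. As a quick check in R (assuming the rstan fit object is called fit; the names here are placeholders):

L_1_draws <- rstan::extract(fit, pars = "L_1")$L_1     # iterations x M_1 x M_1
Cor_draw1 <- L_1_draws[1, , ] %*% t(L_1_draws[1, , ])  # correlation matrix for the first draw
# should match the first draw of Cor_1 from generated quantities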

I think I’ll need a separate model to calculate correlations between outcomes, which is what I’m looking for in addition to the above model. I was looking for a way to incorporate both goals into a single model, but perhaps that’s more trouble than it’s worth!

I have seen relatively straightforward approaches to robust correlation:

data {
    int<lower=1> N;  // number of observations
    vector[2] x[N];  // input data: rows are observations, columns are the two variables
}

parameters {
    vector[2] mu;                 // locations of the marginal t distributions
    real<lower=0> sigma[2];       // scales of the marginal t distributions
    real<lower=1> nu;             // degrees of freedom of the marginal t distributions
    real<lower=-1, upper=1> rho;  // correlation coefficient
}

transformed parameters {
    // Covariance matrix
    cov_matrix[2] cov = [[      sigma[1] ^ 2       , sigma[1] * sigma[2] * rho],
                         [sigma[1] * sigma[2] * rho,       sigma[2] ^ 2       ]];
}

model {
  // Likelihood
  // Bivariate Student's t-distribution instead of normal for robustness
  x ~ multi_student_t(nu, mu, cov);
    
  // Vague priors on the remaining parameters
  // (rho gets an implicit uniform(-1, 1) prior from its declared bounds)
  sigma ~ normal(0, 100);
  mu ~ normal(0, 100);
  nu ~ gamma(2, 0.1);
}

generated quantities {
  // Random samples from the estimated bivariate t-distribution (for assessment of fit)
  vector[2] x_rand;
  x_rand = multi_student_t_rng(nu, mu, cov);
}
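A minimal sketch of running that model (assuming the Stan program above is stored in a string called robust_cor_code and the two variables of interest are the columns of an N x 2 matrix xy; both names are placeholders):

robust_data <- list(N = nrow(xy),
                    x = xy)  # rows = observations, columns = the two variables
robust_fit <- stan(model_code = robust_cor_code, data = robust_data)
print(robust_fit, pars = c("rho", "nu", "mu", "sigma"))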

But I haven’t been able to find something similar for estimating polychoric correlations.

Have you come across any implementations?