Conditioning a correlation on another variable

tl;dr: I want to model a correlation between two latent parameters (estimated from observable data) and condition that correlation on another observable variable. I got stuck modeling the correlation, and extending the model to include conditioning of the correlation.

Imagine a situation in which two latent variables are correlated. E.g. Cognitive ability X in the child and in the parent. These cognitive abilities are not observed directly, but estimated from some observable performance in whatever test.

Generating the data:

library(MASS)
library(tidyverse)

## We have two variables (e.g. rates in a binomial or a poisson): rateC and rateP
## The two variables are generated according to a multivariate gaussian: they are correlated according to rho
## They are then used to generate performance in an aggregated binomial fashion (correct out of 30 trials)

n <- 200 # n of participants
trials = 30 # n of trials

muC <- 0.4 # pop level mean of child underlying performance rate
muP <- 0.8 # pop level mean of parent underlying performance rate
sigmaC <- 0.1  # pop level sd of child underlying performance rate
sigmaP <- 0.12 #pop level  sd of parent underlying performance rate

rho <- 0.4 # correlation between parent and child performance rate

## Now we generate the covariance matrix

VarCovarM <- matrix(c(NA, NA, NA, NA), nrow = 2, ncol = 2)

VarCovarM[1,1] = sigmaC^2
VarCovarM[1,2] = rho * sigmaC * sigmaP
VarCovarM[2,1] = rho * sigmaC * sigmaP
VarCovarM[2,2] = sigmaP^2

vars <- mvrnorm(n = n, mu=c(muC, muP), Sigma = VarCovarM)


df <- tibble(
  RateC = vars[,1], 
  RateP = vars[,2])

## Now we generate performance from rates
for (i in seq(n)){
  temp <- tibble(
    ID = i,
    ScoreC = rbinom(1, size=trials, prob = df$RateC[i]),
    ScoreP = rbinom(1, size=trials, prob = df$RateP[i]),
    trials = trials
  )
  if (i==1){d <- temp} else{d<-rbind(d, temp)}
}

data <- list(
  np = n,
  nt = trials,
  ScoreC = d$ScoreC, 
  ScoreP = d$ScoreP) # to be passed on to Stan

I then model this. Easy peasy in brms (just to make sure the setup works and I can recover the parameter values)

library(brms)
fC <- bf(ScoreC | trials(trials) ~ 1 + (1 | p | ID))
fP <- bf(ScoreP | trials(trials) ~ 1 + (1 | p | ID))

prior <- c(
  prior(normal(0, 1), class = Intercept, resp=ScoreC),
  prior(normal(0, 1), class = Intercept, resp=ScoreP),
  prior(normal(0, .2), class = sd, resp=ScoreC),
  prior(normal(0, .2), class = sd, resp=ScoreP)
)

m <- brm(
  fC + fP + set_rescor(rescor=FALSE),
  d,
  family = binomial,
  chains= 2, 
  cores= 2,
  prior = prior,
  sample_prior = T,
  backend = "cmdstanr"
)

A bit less straightforward directly in Stan:

stan_file <- write_stan_file("
// The input data is a vector 'y' of length 'N'.
data {
  int<lower=0> np;  // n of participants
  int<lower=0> nt;  // n of trials
  int ScoreC[np]; // vector of children scores
  int ScoreP[np]; // vector of parents scores
  
}

// The parameters accepted by the model. Our model
// accepts two parameters 'mu' and 'sigma'.
parameters {
  real mu[2];
  real<lower=0> sigma[2];
  real<lower=0,upper=1> Rate[2]; // rate of success for each person
  real<lower=0,upper=1> p[np,2]; // rate of success for each person
  real<lower=-1, upper=1> rho;
  
}

transformed parameters {
  cov_matrix[2] T;

  // Reparameterization
  T[1,1] = square(sigma[1]);
  T[1,2] = rho * sigma[1] * sigma[2];
  T[2,1] = rho * sigma[1] * sigma[2];
  T[2,2] = square(sigma[2]);

}

model {
  // Priors
  mu ~ normal(1, 1);
  sigma ~ normal(0, 2);
  rho ~ normal(0, 0.3); 
  
  // Data
  p ~ multi_normal(mu, T);
  for (i in 1:np)
    ScoreC[i, 1] ~ binomial(nt, p[i,1]);
    ScoreP[i, 2] ~ binomial(nt, p[i,2]);
  
}

")

Here I get the first issue:

p ~ multi_normal(mu, T);
is apparently misspecified:
Ill-typed arguments to ‘~’ statement. No distribution ‘multi_normal’ was found with the correct signature.

The second issue is how do I extend it to include a regression model generating rho. Let’s imagine that the correlation between parent and child cognitive function is (also) a function of how much time they spent together.
I’m thinking something like

for (n in 1:np)
    rho[n] = ((inv_logit(InterceptRho + SlopeTimeRho * Time[n])*2)-1);

in the transformed parameters block. But any suggestion is welcome.

Generating data for this latter scenario:

library(MASS)
library(brms)
library(tidyverse)

n <- 200 # n of participants
trials = 30 # n of trials

muC <- 0.4 # pop level mean of child underlying performance rate
muP <- 0.8 # pop level mean of parent underlying performance rate
sigmaC <- 0.1  # pop level sd of child underlying performance rate
sigmaP <- 0.12 #pop level  sd of parent underlying performance rate

## a 0-4 integer parameter representing some abstract way of measuring time spent together
Time <- round(runif(n, 0, 4), 0)

## Intercept and slope of the relation between time and rho
InterceptRho <- 0.2
SlopeRho <- 0.3

for (i in seq(n)){
  rho <- inv_logit_scaled(InterceptRho + SlopeRho * Time[i], -1, 1)

  VarCovarM <- matrix(c(NA, NA, NA, NA), nrow = 2, ncol = 2)
  
  VarCovarM[1,1] = sigmaC^2
  VarCovarM[1,2] = rho * sigmaC * sigmaP
  VarCovarM[2,1] = rho * sigmaC * sigmaP
  VarCovarM[2,2] = sigmaP^2
  
  vars <- mvrnorm(n = 1, mu=c(muC, muP), Sigma = VarCovarM)
  
  temp <- tibble(
    RateC = vars[1], 
    RateP = vars[2],
    rho,
    Time = Time[i])

  if (i==1){df <- temp} else{df <- rbind(df, temp)}

}

for (i in seq(n)){
  temp <- tibble(
    ID = i,
    scoreC = rbinom(1, size=trials, prob = df$RateC[i]),
    scoreP = rbinom(1, size=trials, prob = df$RateP[i]),
    Time = Time[i]
  )
  if (i==1){d <- temp} else{d<-rbind(d, temp)}
}

Two issues:

  1. You can’t have a constrained variable type on the left side of ~ multi_normal; you’ll have to use an unconstrained then a transform to achieve your desired constraints.
  2. p is a 2D array but I think that isn’t valid; try a 1D array of row-vectors instead.
1 Like

I’m a bit unsure as to how the 1D array of row-vectors should be implemented.
I specified in the parameter block (as per the handbook)

vector[2] Rate[np]; // rate of success for each person: array of size np including vectors of length 2

and in the model block:

Rate ~ multi_normal(mu, T);

given that yields the same issue as before, I also tried a loop

for (i in 1:np)
  Rate[i] ~ multi_normal(mu, T);

to no avail. How should I change the implementation?

The full reprex is below:

library(MASS)
library(cmdstanr)
library(brms)


## We have two variables (e.g. rates in a binomial or a poisson): rateC and rateP
## The two variables are generated according to a multivariate gaussian: they are correlated according to rho
## They are then used to generate performance in an aggregated binomial fashion (correct out of 30 trials)

n <- 200 # n of participants
trials = 30 # n of trials

muC <- 0.4 # pop level mean of child underlying performance rate
muP <- 0.8 # pop level mean of parent underlying performance rate
sigmaC <- 0.1  # pop level sd of child underlying performance rate
sigmaP <- 0.12 #pop level  sd of parent underlying performance rate

rho <- 0.4 # correlation between parent and child performance rate

VarCovarM <- matrix(c(NA, NA, NA, NA), nrow = 2, ncol = 2)

VarCovarM[1,1] = sigmaC^2
VarCovarM[1,2] = rho * sigmaC * sigmaP
VarCovarM[2,1] = rho * sigmaC * sigmaP
VarCovarM[2,2] = sigmaP^2

vars <- mvrnorm(n = n, mu=c(muC, muP), Sigma = VarCovarM, tol = 1e-6, empirical = FALSE, EISPACK = FALSE)


df <- tibble(
  RateC = vars[,1], 
  RateP = vars[,2])

for (i in seq(n)){
  temp <- tibble(
    ID = i,
    ScoreC = rbinom(1, size=trials, prob = df$RateC[i]),
    ScoreP = rbinom(1, size=trials, prob = df$RateP[i]),
    trials = trials
  )
  if (i==1){d <- temp} else{d<-rbind(d, temp)}
}


d

## Define data for stan
data <- list(
  np = n,
  nt = trials,
  ScoreC = d$ScoreC, 
  ScoreP = d$ScoreP) # to be passed on to Stan

stan_file <- write_stan_file("
// The input data is a vector 'y' of length 'N'.
data {
  int<lower=0> np;  // n of participants
  int<lower=0> nt;  // n of trials
  int ScoreC[np]; // vector of children scores
  int ScoreP[np]; // vector of parents scores
  
}

// The parameters accepted by the model. Our model
// accepts two parameters 'mu' and 'sigma'.
parameters {
  real mu[2];
  real<lower=0> sigma[2];
  vector[2] Rate[np]; // rate of success for each person
  real<lower=-1, upper=1> rho;  
}

transformed parameters {
  cov_matrix[2] T;

  // Reparameterization
  T[1,1] = square(sigma[1]);
  T[1,2] = rho * sigma[1] * sigma[2];
  T[2,1] = rho * sigma[1] * sigma[2];
  T[2,2] = square(sigma[2]);

}

model {
  // Priors
  mu ~ normal(1, 1);
  sigma ~ normal(0, 2);
  rho ~ normal(0, 0.3); 
  
  // Data
  Rate ~ multi_normal(mu, T);
  for (i in 1:np)
    ScoreC[i, 1] ~ binomial(nt, Rate[i,1]);
    ScoreP[i, 2] ~ binomial(nt, Rate[i,2]);
  
}

")

mod <- cmdstan_model(stan_file, cpp_options = list(stan_threads = TRUE), pedantic = TRUE)


samples <- mod$sample(
  data = data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  threads_per_chain = 2,
  iter_warmup = 2000,
  iter_sampling = 2000,
  refresh = 500,
  max_treedepth = 20,
  adapt_delta = 0.99,
)

While putting multi_normal distribution on constrained variables is often unlikely to give sensible models, I don’t think that your statement here is correct, i.e. this is a valid Stan program:

parameters {
  vector<lower=0, upper=1>[3] x[15];
}

model {
  x ~ multi_normal(rep_vector(0, 3), diag_matrix(rep_vector(1, 3)));
}

Here, also mu needs to be a vector. Unfortunately, the compiler doesn’t produce particularly helpful messages here.

2 Likes

thanks! this solved the current issue. now onto modeling the correlation!