Adding covariance matrix to non-centered learning model

Hi all,

I’ve been using Stan for awhile to estimate reinforcement learning models and it’s been very helpful, so many thanks!

I’m currently working on estimating correlations among parameters from a model from two sessions of participants’ behavior. The only paper I have found on this topic (https://link.springer.com/article/10.3758/s13423-014-0684-4) used posterior means estimated from within each session and found reduced correlations among sessions for hierarchical vs. individual modeling, likely due to the effect of pooling within sessions. I think that estimating the covariance matrix during model estimation for both sessions combined would more appropriately assess these correlations, and am trying to figure out how to set up a model to do this.

The main problem I am encountering is how to define my learning model parameters in the transformed parameters block (I am getting a syntax error about adding a vector to a row vector, plus I am not sure how to add the covariance terms to my already non-centered parameters), but any feedback on my model would be appreciated.

Below are the relevant parts of my model:

data {
  int<lower=1> nT; // number of trials per subject
  int<lower=1> nS; // number of subjects
  int<lower=0,upper=1> choice[nS,nT,2,2]; // choices
  int<lower=0,upper=1> reward[nS,nT,2]; // rewards
  int<lower=1,upper=2> state_2[nS,nT,2]; // states
  int missing_choice[nS,nT,2,2]; 
  int missing_reward[nS,nT,2];
}

parameters {
  real alpha_1_m_v1; // visit 1 alpha_1 mean
  real<lower=0> alpha_1_s_v1; // visit 1 alpha_1 variance
  real alpha_2_m_v1;
  real<lower=0> alpha_2_s_v1;
  real<lower=0> beta_1_m_v1;
  real<lower=0> beta_1_s_v1;
  real<lower=0> beta_2_m_v1;
  real<lower=0> beta_2_s_v1;
  real omega_m_v1;
  real<lower=0> omega_s_v1;
  real lambda_m_v1;
  real<lower=0> lambda_s_v1;
  real pers_m_v1;
  real<lower=0> pers_s_v1;
  vector[nS] alpha_1_raw_v1; // visit 1 alpha_1 subject variance
  vector[nS] alpha_2_raw_v1;
  vector[nS] beta_1_raw_v1;
  vector[nS] beta_2_raw_v1;
  vector[nS] omega_raw_v1;
  vector[nS] lambda_raw_v1;
  vector[nS] pers_raw_v1;
  
  real alpha_1_m_v2;
  real<lower=0> alpha_1_s_v2;
  real alpha_2_m_v2;
  real<lower=0> alpha_2_s_v2;
  real<lower=0> beta_1_m_v2;
  real<lower=0> beta_1_s_v2;
  real<lower=0> beta_2_m_v2;
  real<lower=0> beta_2_s_v2;
  real omega_m_v2;
  real<lower=0> omega_s_v2;
  real lambda_m_v2;
  real<lower=0> lambda_s_v2;
  real pers_m_v2;
  real<lower=0> pers_s_v2;
  vector[nS] alpha_1_raw_v2;
  vector[nS] alpha_2_raw_v2;
  vector[nS] beta_1_raw_v2;
  vector[nS] beta_2_raw_v2;
  vector[nS] omega_raw_v2;
  vector[nS] lambda_raw_v2;
  vector[nS] pers_raw_v2;
  
  vector[nS] z_alpha_1;
  cholesky_factor_corr[2] l_covm_alpha_1;
  vector[2] tau_unif_alpha_1;
  vector[nS] z_alpha_2;
  cholesky_factor_corr[2] l_covm_alpha_2;
  vector[2] tau_unif_alpha_2;
  vector[nS] z_beta_1;
  cholesky_factor_corr[2] l_covm_beta_1;
  vector[2] tau_unif_beta_1;
  vector[nS] z_beta_2;
  cholesky_factor_corr[2] l_covm_beta_2;
  vector[2] tau_unif_beta_2;
  vector[nS] z_omega;
  cholesky_factor_corr[2] l_covm_omega;
  vector[2] tau_unif_omega;
  vector[nS] z_lambda;
  cholesky_factor_corr[2] l_covm_lambda;
  vector[2] tau_unif_lambda;
  vector[nS] z_pers;
  cholesky_factor_corr[2] l_covm_pers;
  vector[2] tau_unif_pers;
}

transformed parameters {
  //define transformed parameters
  row_vector<lower=0,upper=1>[2] alpha_1[nS];
  row_vector<lower=0,upper=1>[2] alpha_2[nS];
  row_vector[2] alpha_1_normal[nS];
  row_vector[2] alpha_2_normal[nS];
  row_vector[2] beta_1[nS];
  row_vector[2] beta_2;[nS]
  row_vector<lower=0,upper=1>[2] omega[nS];
  row_vector<lower=0,upper=1>[2] lambda[nS];
  row_vector[2] omega_normal[nS];
  row_vector[2] lambda_normal[nS];
  row_vector[2] pers[nS];
  
  // transformed scale parameter for covariance matrix
  vector<lower=0>[2] tau_alpha_1;
  vector<lower=0>[2] tau_alpha_2;
  vector<lower=0>[2] tau_beta_1;
  vector<lower=0>[2] tau_beta_2;
  vector<lower=0>[2] tau_omega;
  vector<lower=0>[2] tau_lambda;
  vector<lower=0>[2] tau_pers;

  //create transformed parameters using non-centered parameterization for all
  // and logistic transformation for alpha, omega, & lambda (range: 0 to 1)
  // plus cholesky factorization for covariance matrix
  tau_alpha_1=2.5*tan(tau_unif_alpha_1);
  tau_alpha_2=2.5*tan(tau_unif_alpha_2);
  tau_beta_1=2.5*tan(tau_unif_beta_1);
  tau_beta_2=2.5*tan(tau_unif_beta_2);
  tau_omega=2.5*tan(tau_unif_omega);
  tau_lambda=2.5*tan(tau_unif_lambda);
  tau_pers=2.5*tan(tau_unif_pers);
  
  alpha_1_normal = alpha_1_m_v1 + alpha_1_s_v1*alpha_1_raw_v1 + (diag_pre_multiply(tau_alpha_1,l_covm_alpha_1)*z_alpha_1)';
  alpha_1 = inv_logit(alpha_1_normal);
  alpha_2_normal = alpha_2_m + alpha_2_s*alpha_2_raw + (diag_pre_multiply(tau_alpha_2,l_covm_alpha_2)*z_alpha_2)';
  alpha_2 = inv_logit(alpha_2_normal);
  beta_1 = beta_1_m + beta_1_s*beta_1_raw + (diag_pre_multiply(tau_beta_1,l_covm_beta1)*z_beta_1)';
  beta_2 = beta_2_m + beta_2_s*beta_2_raw + (diag_pre_multiply(tau_beta_2,l_covm_beta_2*z_beta_2)';
  omega_normal = omega_m + omega_s*omega_raw + (diag_pre_multiply(tau_omega,l_covm_omega)*z_omega)';
  omega = inv_logit(omega_normal);
  lambda_normal = lambda_m + lambda_s*lambda_raw + (diag_pre_multiply(tau_lambda,l_covm_lambda)*z_lambda)';
  lambda = inv_logit(lambda_normal);
  pers = pers_m + pers_s*pers_raw + (diag_pre_multiply(tau_pers,l_covm_pers)*z_pers)';
}

model {
  //define variables
  int prev_choice;
  int tran_count;
  int tran_type[2];
  real delta_1;
  real delta_2;
  real Q_TD[2];
  real Q_MB[2];
  real Q_net[2];
  real Q_2[2,2];
  
  //define priors
  to_vector(z_alpha_1) ~ normal(0,1);
  l_covm_alpha_1 ~ lkj_corr_cholesky(2);
  to_vector(z_alpha_2) ~ normal(0,1);
  l_covm_alpha_2 ~ lkj_corr_cholesky(2);
  to_vector(z_beta_1) ~ normal(0,1);
  l_covm_beta_1 ~ lkj_corr_cholesky(2);
  to_vector(z_beta_2) ~ normal(0,1);
  l_covm_beta_2 ~ lkj_corr_cholesky(2);
  to_vector(z_omega) ~ normal(0,1);
  l_covm_omega ~ lkj_corr_cholesky(2);
  to_vector(z_lambda) ~ normal(0,1);
  l_covm_lambda ~ lkj_corr_cholesky(2);
  to_vector(z_pers) ~ normal(0,1);
  l_covm_pers ~ lkj_corr_cholesky(2);
  alpha_1_m ~ normal(0,2);
  alpha_2_m ~ normal(0,2);
  beta_1_m ~ normal(0,5);
  beta_2_m ~ normal(0,5);
  omega_m ~ normal(0,2);
  lambda_m ~ normal(0,2);
  pers_m ~ normal(0,2);
  alpha_1_s ~ cauchy(0,1);
  alpha_2_s ~ cauchy(0,1);
  beta_1_s ~ cauchy(0,2);
  beta_2_s ~ cauchy(0,2);
  omega_s ~ cauchy(0,1);
  lambda_s ~ cauchy(0,1);
  pers_s ~ cauchy(0,1);
  alpha_1_raw ~ normal(0,1);
  alpha_2_raw ~ normal(0,1);
  beta_1_raw ~ normal(0,1);
  beta_2_raw ~ normal(0,1);
  omega_raw ~ normal(0,1);
  lambda_raw ~ normal(0,1);
  pers_raw ~ normal(0,1);
  
  for (s in 1:nS) {
    for (v in 1:2) {
  
  //set initial values
  for (i in 1:2) {
    Q_TD[i]=.5;
    Q_MB[i]=.5;
    Q_net[i]=.5;
    Q_2[1,i]=.5;
    Q_2[2,i]=.5;
    tran_type[i]=0;
  }
  
    for (t in 1:nT) {
      //use if not missing 1st stage choice
      if (missing_choice[s,t,1,v]==0) {
        prev_choice = 2*choice[s,t,1,v]-1; //1 if choice 2, -1 if choice 1
        choice[s,t,1,v] ~ bernoulli_logit(beta_1[s,v]*(Q_net[2]-Q_net[1])+pers[s,v]*prev_choice);
// lots more model code here...

} 
    }
  }
  }
}

generated quantities {
  
  cor_matrix[2] cor_alpha_1;
  cor_alpha_1=multiply_lower_tri_self_transpose(l_covm_alpha_1);
  cor_matrix[2] cor_alpha_2;
  cor_alpha_2=multiply_lower_tri_self_transpose(l_covm_alpha_2);
  cor_matrix[2] cor_beta_1;
  cor_beta_1=multiply_lower_tri_self_transpose(l_covm_beta_1);
  cor_matrix[2] cor_beta_2;
  cor_beta_2=multiply_lower_tri_self_transpose(l_covm_beta_2);
  cor_matrix[2] cor_omega;
  cor_omega=multiply_lower_tri_self_transpose(l_covm_omega);
  cor_matrix[2] cor_lambda;
  cor_lambda=multiply_lower_tri_self_transpose(l_covm_lambda);
  cor_matrix[2] cor_pers;
  cor_pers=multiply_lower_tri_self_transpose(l_covm_pers);
}

an update: I’ve managed to work out the vector/row_vector issues, but am still concerned about how I’m defining my transformed parameters with the multiple non-centered parameterizations (i.e.: alpha_1_normal[v] = alpha_1_m[v] + alpha_1_s[v]*alpha_1_raw[v] + (diag_pre_multiply(tau_alpha_1,l_covm_alpha_1)*z_alpha_1); ). Updated code below (I’ve trimmed out 6 of the 7 learning model parameters for brevity, as the rest will have the same construction):

data {
  int<lower=1> nT; // number of trials per subject
  int<lower=1> nS; // number of subjects
  int<lower=0,upper=1> choice[nS,nT,2,2]; // choices
  int<lower=0,upper=1> reward[nS,nT,2]; // rewards
  int<lower=1,upper=2> state_2[nS,nT,2]; // states
  int missing_choice[nS,nT,2,2]; 
  int missing_reward[nS,nT,2];
}

parameters {
  vector[2] alpha_1_m; // alpha_1 mean
  vector<lower=0>[2] alpha_1_s; // alpha_1 variance
  vector[nS] alpha_1_raw[2]; // alpha_1 subject variance
  
  vector[nS] z_alpha_1;
  cholesky_factor_corr[2] l_covm_alpha_1;
  vector[2] tau_unif_alpha_1;

}

transformed parameters {
  //define transformed parameters
  vector<lower=0,upper=1>[2] alpha_1[nS];
  vector[2] alpha_1_normal[nS];
  
  // transformed scale parameter for covariance matrix
  vector<lower=0>[2] tau_alpha_1;

  //create transformed parameters using non-centered parameterization for all
  // and logistic transformation for alpha, omega, & lambda (range: 0 to 1)
  // plus cholesky factorization for covariance matrix
  tau_alpha_1=2.5*tan(tau_unif_alpha_1);
  
  for (v in 1:2) {
  alpha_1_normal[v] = alpha_1_m[v] + alpha_1_s[v]*alpha_1_raw[v] + (diag_pre_multiply(tau_alpha_1,l_covm_alpha_1)*z_alpha_1);
  alpha_1[v] = inv_logit(alpha_1_normal[v]);
  }
}

model {
  //define variables
  int prev_choice;
  int tran_count;
  int tran_type[2];
  real delta_1;
  real delta_2;
  real Q_TD[2];
  real Q_MB[2];
  real Q_net[2];
  real Q_2[2,2];
  
  //define priors
  to_vector(z_alpha_1) ~ normal(0,1);
  l_covm_alpha_1 ~ lkj_corr_cholesky(2);
  
  for (v in 1:2) {
  alpha_1_m[v] ~ normal(0,2);
  alpha_1_s[v] ~ cauchy(0,1);
  alpha_1_raw[v] ~ normal(0,1);
  }
  
  for (s in 1:nS) {
    for (v in 1:2) {
  
  //set initial values
  for (i in 1:2) {
    Q_TD[i]=.5;
    Q_MB[i]=.5;
    Q_net[i]=.5;
    Q_2[1,i]=.5;
    Q_2[2,i]=.5;
    tran_type[i]=0;
  }
  
    for (t in 1:nT) {
      //use if not missing 1st stage choice
      if (missing_choice[s,t,1,v]==0) {
        prev_choice = 2*choice[s,t,1,v]-1; //1 if choice 2, -1 if choice 1
        choice[s,t,1,v] ~ bernoulli_logit(beta_1[s,v]*(Q_net[2]-Q_net[1])+pers[s]*prev_choice);
// more learning model code here

}
    }
  }
  }
}

generated quantities { 
  corr_matrix[2] cor_alpha_1;
  cor_alpha_1=multiply_lower_tri_self_transpose(l_covm_alpha_1);
}

Can you write out the prior you want to put on alpha_1_normal? Like \text{alpha_1_normal} \sim \mathcal{N}(0, \Sigma) or whatever (surround things with single dollar signs to get LaTeX). Then it should be pretty easy to tell if the non-centered parameterization is correct.

that actually helps me conceptualize it, thanks! As written now, my prior is alpha_normal_1 ~ N(0,\Sigma + \sigma) , which doesn’t make sense. Should I just go with the non-centered parameterization of the \Sigma and drop the alpha_1_s[v]*alpha_1_raw[v] as no longer needed?

I think as it’s written now, define L = \text{diag_pre_multiply}(\text{tau_alpha_1},\text{l_covm_alpha_1}), and your prior is (dropping the v index):

\alpha_{1, \text{normal}} \sim \mathcal{N}(\alpha_{1, m}, LL^T + \alpha_{1, s}^2 I)

Edit: The only question is, does that make sense? If so, groovy! If not, then we can change it.

ah, sorry- I thought you meant to rewrite it as the centered version. As I understand non-centered parameterization, both z_alpha_1 and alpha_1_raw are subject-specific indicators of how far each subject’s (with subjects 1:nS) mean varies from the group mean, which are then multiplied by the variance- is that expressed correctly in my model? I’m not sure if I follow your notation.

Given all that, my question is: do I need both LL^T and \alpha^2_{1,s}I terms for the variance, or should I drop one or combine them somehow? I’m just learning about Cholesky factorization, so apologies if this unclear/incorrect, but it seems like the variance is already taken care of when adding covariance matrix and I don’t need this additional term. Before adding the covariances and when estimating each session independently, my non-centered parameterization was alpha_normal_1 = alpha_m + alpha_s*alpha_raw, but just adding on the additional non-centered variance term from the Cholesky factorization doesn’t seem right.

thank you for your help!

Yeah I’d guess the \alpha^2_{1, s} parameter would end up non-identified here. The meaning of things would all get lumped together, but that’s what happens with non-identified things :P.

that makes sense. I have a version with only the variance from the covariance matrix running, and although I still need to test it against simulated data, it’s getting patterns of correlations similar to what I got with maximum likelihood, so it seems to be working. I fixed a few other things related to dimensions & to bring the priors on the scale in line with my previous model, so I pasted my (hopefully) final code below in case others are interested.

thank you for your help, Ben!

data {
  int<lower=1> nT;
  int<lower=1> nS;
  int<lower=0,upper=1> choice[nS,nT,2,2]; 
  int<lower=0,upper=1> reward[nS,nT,2];
  int<lower=1,upper=2> state_2[nS,nT,2]; 
  int missing_choice[nS,nT,2,2];
  int missing_reward[nS,nT,2];
}

parameters {
  vector[2] alpha_1_m;
  matrix[2,nS] z_alpha_1;
  cholesky_factor_corr[2] l_covm_alpha_1;
  vector<lower=0,upper=pi()/2>[2] tau_unif_alpha_1;
}

transformed parameters {
  //define transformed parameters
  vector<lower=0,upper=1>[2] alpha_1[nS];
  vector[2] alpha_1_normal[nS];
  
  // transformed scale parameter for covariance matrix
  vector<lower=0>[2] tau_alpha_1;

  //create transformed parameters using non-centered parameterization for all
  // and logistic transformation for alpha, omega, & lambda (range: 0 to 1)
  // plus cholesky factorization for covariance matrix
  for (v in 1:2) {
  tau_alpha_1[v]=1*tan(tau_unif_alpha_1[v]);
  }
  
  for (s in 1:nS) {
  alpha_1_normal[s] = alpha_1_m + (diag_pre_multiply(tau_alpha_1,l_covm_alpha_1)*z_alpha_1[,s]); //+ alpha_1_s*alpha_1_raw;
  }
  alpha_1 = inv_logit(alpha_1_normal);
}

model {
  //define variables
  int prev_choice;
  int tran_count;
  int tran_type[2];
  real delta_1;
  real delta_2;
  real Q_TD[2];
  real Q_MB[2];
  real Q_net[2];
  real Q_2[2,2];
  
  //define priors
  to_vector(z_alpha_1) ~ normal(0,1);
  l_covm_alpha_1 ~ lkj_corr_cholesky(1);
  
  for (v in 1:2) {
  alpha_1_m[v] ~ normal(0,2);
  }
  
  for (s in 1:nS) {
    for (v in 1:2) {
  
  //set initial values
  for (i in 1:2) {
    Q_TD[i]=.5;
    Q_MB[i]=.5;
    Q_net[i]=.5;
    Q_2[1,i]=.5;
    Q_2[2,i]=.5;
    tran_type[i]=0;
  }
  
    for (t in 1:nT) {
      //use if not missing 1st stage choice
      if (missing_choice[s,t,1,v]==0) {
        prev_choice = 2*choice[s,t,1,v]-1; //1 if choice 2, -1 if choice 1
        choice[s,t,1,v] ~ bernoulli((1-epsilon[s,v])*inv_logit(beta_1[s,v]*(Q_net[2]-Q_net[1])+pers[s,v]*prev_choice) + .5*epsilon[s,v]);
// rest of learning model code...
} 
    }
    }
  }
}

generated quantities {
  
  corr_matrix[2] cor_alpha_1;
  
  cor_alpha_1=multiply_lower_tri_self_transpose(l_covm_alpha_1);
}
1 Like