Multivariate Hierarchical Model

Hi,

I’m rather new to working with stan (as you will see) and trying to run the toy covariance model from Palestro et al. 2018 (http://compmem.org/assets/pubs/Palestro.etal.2018a.pdf). I consulted the Stan documentation and the forums, but I fail to make the model run properly.

The following data are generated in R:

# need both logit and logit^{-1} functions
logit=function(x)log(x/(1-x))
invlogit=function(x){1/(1+exp(-x))}

# set up model specification
n <- 500	    # total number of trials 

# establish the hyperparameters
sig1 <- .5		# std. dev. of single-trial BOLD responses
sig2 <- 1			# std. dev. of item memory strength (logit scale)
rho <- .6			# cor b/n brain activation and memory strength

# set up hyper variance-covariance matrix Sigma
sigma <- matrix(c(sig1^2,         # element [1,1]
                  sig1*sig2*rho,  # element [1,2]
                  sig1*sig2*rho,  # element [2,1]
                  sig2^2          # element [2,2]
),2,2,byrow=TRUE)

# set up hyper mean vector phi
phi <- c(2,0)

# simulate single-trial delta and theta matrix
DeltaTheta <- rmvnorm(n,phi,sigma)

# generate observed variable nodes
ts <- seq(0,4,1)   # scan times
sig <- .5			     # the std. dev. of BOLD responses

# declare some storage objects
N=matrix(NA,n,length(ts))
B=numeric(n)

# loop over trials
for(i in 1:n){
  # N is a normal deviate with mean controlled by delta
  N[i,]=rnorm(length(ts),DeltaTheta[i,1]*ts,sig) 
  # B is a Bernoulli deviate with probability controlled by theta
  B[i]=rbinom(1,1,invlogit(DeltaTheta[i,2]))
}

NB = matrix(c(N,B),nrow = n,ncol = 6)

data = list(n=n,
            nPar=2,
            B=B, 
            N=N, 
            ts=ts, 
            Nt=length(ts), 
            sig=sig,
            I0=diag(2), 
            n0=2,
            phi0=rep(0,2), 
            s0=diag(2))

Then they fit the model in JAGS using

model {
    # convert sig to tau for convenience
    tau <- pow(sig, -2)
    
    # loop through trials to define likelihood
    for (i in 1:n){
        for (t in 1:Nt){
            # likelihood for neural data
            N[i,t] ~ dnorm(DeltaTheta[i,1]*ts[t],tau);
        }
            # likelihood for behavioral data
            B[i] ~ dbin(1/(1+exp(-DeltaTheta[i,2])),1);
    }
    
    # loop through trials to define prior on (delta, theta)
    for(i in 1:n){
        DeltaTheta[i,1:2] ~ dmnorm(phi,Omega);
    }
    
    # priors on hyperparameters
    phi ~ dmnorm(phi0,s0);
    Omega ~ dwish(I0, n0);
    # convert Omega to Sigma for convenience
    Sigma <- inverse(Omega);
}

I tried to translate the model to Stan and also estimate the correlation not the covariance matrix.

    // joint neural and behavioural model

    data{
      int<lower=1> n; //number of trials
      int B[n]; // Behavioral data matrix
      
      int<lower=1> Nt; // number of time points
      vector[Nt] ts; //time points
      real N[n,Nt]; // Neural  data matrix
      real<lower=0,upper=1> sig; // sd of neural data
      
      int<lower=1> nPar; // number of parameters in the joint model
      vector[nPar] phi0; // joint mean vector
      matrix[nPar, nPar]  s0; // joint sigma
    }

    parameters{
      row_vector[nPar] phi; // joint mean vector
      corr_matrix[2] omega; // Correlation matrix of N and B data
      vector<lower=0>[2] tau; // scale of the correlation matrix
      matrix[n, nPar] raw_DeltaTheta;
    }

    transformed parameters{
      matrix[n, nPar] DeltaTheta; //neural and behavioral parameters
      

     
     DeltaTheta= raw_DeltaTheta*quad_form_diag(omega,tau);
     DeltaTheta[,1] = DeltaTheta[,1]+phi[1];
     DeltaTheta[,2] = DeltaTheta[,2]+phi[2];
   
      
    }

    model{
      // prior for scale of the hyperparameter correlation matrix sigma
      tau ~ normal(0,2.5);//cauchy(0,2.5);
      // Joint model priors
      phi ~ multi_normal(phi0,s0);
      // prior on the correlation matrix
      omega ~ lkj_corr(6);
      
      // for the non-centered reparametrization
      to_vector(raw_DeltaTheta) ~ std_normal();

      // likelihood
      for(i in 1:n){
        for(t in 1:Nt){
          N[i,t]~normal(DeltaTheta[i,1]*ts[t],sig);
        }
        B[i]~binomial(1,inv_logit(DeltaTheta[i,2]));
        
      }
    }

This results in the following error messages (4 chains with 6000 iterations)

    1: In system(paste(CXX, ARGS), ignore.stdout = TRUE, ignore.stderr = TRUE) :
      '-E' not found
    2: There were 3021 divergent transitions after warmup. See
    http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
    to find out why this is a problem and how to eliminate them. 
    3: There were 4 chains where the estimated Bayesian Fraction of Missing Information was low. See
    http://mc-stan.org/misc/warnings.html#bfmi-low 
    4: Examine the pairs() plot to diagnose sampling problems
     
    5: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
    Running the chains for more iterations may help. See
    http://mc-stan.org/misc/warnings.html#bulk-ess 
    6: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
    Running the chains for more iterations may help. See
    http://mc-stan.org/misc/warnings.html#tail-ess 

omega[1,1] always has an Rhat = NaN.

It’s a fairly simple model, but I can’t make it run properly. Could you point me in the right direction? Thanks!

Is nPar==2 ? if not, then only the first two entries in phi are being used in the likelihood, leaving the others unupdated from the prior which might cause issues.

Consider using the cholesky_factor_corr variable type instead of the corr_matrix type, in which case you use a lkj_corr_cholesky() prior and combine with tau to make the covariance matrix via diag_pre_multiply(tau,L_omega). You can get the correlations in the generated quantities section by doing corr_matrix[nPar] omega = multiply_lower_tri_self_transpose(L_omega) ;

You might also replace ~binomial(1,inv_logit(...)) with ~bernoulli_logit(...)); that shouldn’t make any difference other than being possibly more optimized for compute speed internally. Once you get things actually sampling without divergences, there’s also a ton of unnecessary loops that you can avoid for speedups.

1 Like

Otherwise, maybe do as the error message suggests and look at / post some of the pairs plots to help see if we can discern what’s going awry.

Given your use of the multi_normal prior for phi, maybe declare phi with an offset & multiplier:

    row_vector< offset=phi0 , multiplier=sqrt(diagonal(s0)) >[nPar] phi ;

Hi Mike,

thanks for the quick reply! nPar is in fact 2. This time I changed the max_treedepth to 150 and adapt_delta to .99.

In the Stan model, I changed the following lines:

    parameters {
      ...
      cholesky_factor_corr[2] L_omega; // Correlation matrix of N and B data
      ...
    }

    model{
      ...
      // prior on the correlation matrix
      L_omega ~ lkj_corr_cholesky(1);
      
      // calculate the likelihood
      ...
      B[i]~bernoulli_logit(DeltaTheta[i,2]);
      ...
      }
    }

    generated quantities{
      corr_matrix[nPar] omega = multiply_lower_tri_self_transpose(L_omega) ;
    }

This still gives me:

2: There were 106 divergent transitions after warmup. See
    http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
    to find out why this is a problem and how to eliminate them. 
3: There were 2 chains where the estimated Bayesian Fraction of Missing Information was low. See
    http://mc-stan.org/misc/warnings.html#bfmi-low 
4: Examine the pairs() plot to diagnose sampling problems
5: The largest R-hat is NA, indicating chains have not mixed.
    Running the chains for more iterations may help. See
    http://mc-stan.org/misc/warnings.html#r-hat 
6: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be 
     unreliable.
    Running the chains for more iterations may help. See
    http://mc-stan.org/misc/warnings.html#bulk-ess

With output (for the first 500 entries of DeltaTheta the values are more or less the same):

Inference for Stan model: CJM.
4 chains, each with iter=6000; warmup=3000; thin=1; 
post-warmup draws per chain=3000, total post-warmup draws=12000.

                      mean se_mean    sd     2.5%      25%      50%      75%    97.5% n_eff Rhat
phi[1]                2.00    0.00  0.02     1.96     1.99     2.01     2.02     2.05 10300 1.00
phi[2]               -0.11    0.00  0.15    -0.45    -0.19    -0.10    -0.01     0.16  2171 1.00
omega[1,1]            1.00     NaN  0.00     1.00     1.00     1.00     1.00     1.00   NaN  NaN
omega[1,2]            0.28    0.02  0.19     0.04     0.11     0.25     0.45     0.61    65 1.06
omega[2,1]            0.28    0.02  0.19     0.04     0.11     0.25     0.45     0.61    65 1.06
omega[2,2]            1.00    0.00  0.00     1.00     1.00     1.00     1.00     1.00 11456 1.00
DeltaTheta[1,1]       1.25    0.00  0.09     1.07     1.19     1.25     1.31     1.42 11753 1.00
DeltaTheta[1,2]      -2.02    0.10  1.81    -7.19    -2.46    -1.48    -1.03     0.13   308 1.01

Also the pairs plot for some parameters.

Have you tried the centered parameterization for DeltaTheta?

Hi,

I’ll do that next. What I did notice is, that the posterior samples for the second column of DeltaTheta, i.e. Thetas are bimodal in respect to the original thetas. I’m not really sure why that is happening. But it probably has something to do with the sampling problems.

I think my first translation from JAGS to Stan is a centered approach:

// joint neural and behavioural model

        data{
      int<lower=1> n; //number of trials
      int B[n]; // beh data matrix
      int<lower=1> Nt; // number of time points
      real N[n,Nt]; // neural data matrix
      int<lower=1> nPar; // number of parameters in the hyper model
      vector[Nt] ts; //time points
      real<lower=0,upper=1> sig; // sd of neural data
      matrix[nPar, nPar] I0; // dispersion matrix of the Wishart
      int<lower=0> n0; // Df of Wishart 
      vector[nPar] phi0; // hyper mean vector
      matrix[nPar, nPar]  s0; // phi hyper sigma
    }

    parameters{
      vector[nPar] phi;
      cov_matrix[nPar] Sigma; // Correlation matrix
      matrix[n, nPar] DeltaTheta;
    }


    model{
      // Hyperparameter priors
      phi ~ multi_normal(phi0,s0);
      Sigma ~ inv_wishart(n0,I0);
      // Priors on delta
      for(i in 1:n){
        DeltaTheta[i,]~multi_normal(phi,Sigma);
        
      }
      for(i in 1:n){
        for(t in 1:Nt){
            N[i,t]~normal(DeltaTheta[i,1]*ts[t],sig);
        }
        B[i]~binomial(1,inv_logit(DeltaTheta[i,2]));
        
      }
    }

But this does not recover the parameters at all.

So I managed to make the model work! It runs fast and recovers the parameters pretty well.

// joint neural and behavioural model

data{
  int<lower=1> n; //number of trials
  int B[n]; // Behavioral data matrix
  
  int<lower=1> Nt; // number of time points
  vector[Nt] ts; //time points
  real N[n,Nt]; // Neural  data matrix
  real<lower=0> sig; // sd of neural data
  
  int<lower=1> nPar; // number of parameters in the hyper model
  vector[nPar] phi0; // hyper mean vector - Delta, Theta mean
  matrix[nPar, nPar]  s0; // phi hyper sigma covariance matrix
}

parameters{
  // vector of hyper means
  row_vector[nPar] phi; 
  // Cholesky factor of the Correlation matrix of N and B data
  cholesky_factor_corr[2] L_omega; 
  vector<lower=0,upper=pi()/2>[nPar] tau_unif; // scale for omega
  //vector<lower=0>[nPar] tau;
  //matrix[n, nPar] DeltaTheta;
  matrix[nPar, n] raw_DeltaTheta;
  
}

transformed parameters{
  // Scale of the correlation matrix
  vector<lower=0>[nPar] tau = 1 * tau_unif;
  //neural and behavioral parameters
  
  matrix[n, nPar] DeltaTheta;
  
  for(i in 1:n) DeltaTheta[i,]= phi + (tau .*(L_omega*raw_DeltaTheta[,i]))';
  
}

model{
  // prior for scale of the hyperparameter correlation matrix 
  //tau ~ cauchy(0,1);
  
  // Hyperparameter means priors
  phi ~ multi_normal(phi0,s0);
  // prior for the cholesky factor of the correlation matrix
  L_omega ~ lkj_corr_cholesky(1);
  
  // For the non-centered parametrization
  to_vector(raw_DeltaTheta) ~ std_normal();
  
  // calculate the likelihood
  for(i in 1:n){
    for(t in 1:Nt){
      N[i,t]~normal(DeltaTheta[i,1]*ts[t],sig);
    }
    B[i] ~ bernoulli_logit(DeltaTheta[i,2]);
  }
  
}

generated quantities{
  // Correlation Matrix
  corr_matrix[nPar]Omega = multiply_lower_tri_self_transpose(L_omega);
  // Covariance Matrix
  cov_matrix[nPar] Sigma = quad_form_diag(Omega,tau) ;
}

However, the relationship between the deltas and thetas still looks pretty unusual.

1 Like