Divergences in multilevel signal detection theory model

I have been working on a simple multilevel signal detection theory model as presented in Lee & Wagenmakers Bayesian Cognitive Modeling).

The model can fit simulated data nicely enough, but I’m getting a small percentage of divergences and I cannot reconstruct the source of them. The handbook suggests issues might arise in the hierarchical variance parameter, and that parameter expansion might help. However, using multiplicative parameters to rescale as indicated didn’t solve the divergence issues.

I also tried varying the priors, and brute-forcing the issue via a large number of iterations (as if that ever works…), to no avail.

Reproducible example, with diagnostic plots at the bottom.

library(cmdstanr)

std_i <- matrix(
  c(3,1,1,3,4,0,0,4,4,4,0,0,4,1,0,3,4,3,0,1,4,4,0,0,4,1,0,3,4,3,0,1,4,
    0,0,4,4,2,0,2,3,2,1,2,4,2,0,2,4,0,0,4,3,4,1,0,3,4,1,0,4,1,0,3,3,2,
    1,2,4,1,0,3,3,1,1,3,4,1,0,3,3,2,1,2,4,0,0,4,3,4,1,0,4,4,0,0,4,2,0,
    2,4,4,0,0,4,4,0,0,4,4,0,0,4,4,0,0,4,0,0,4,2,3,2,1,4,0,0,4,4,4,0,0,
    2,3,2,1,4,0,0,4,4,3,0,1,4,4,0,0,4,1,0,3,4,1,0,3,4,4,0,0),
  nrow=40,ncol=4, byrow=T)

  
hits_i <- std_i[, 1]
falsealarms_i <- std_i[, 2]
missed_i <- std_i[, 3]
correctrejections_i <- std_i[, 4]

signal_i <- hits_i + missed_i
noise_i <- falsealarms_i + correctrejections_i
signal_i <- signal_i[1]; 
noise_i <- noise_i[1] #Each subject gets same number of signal and noise trials 

k_i <- nrow(std_i) 

data <- list(
  hits_i = hits_i, 
  falsealarms_i = falsealarms_i,
  signal_i = signal_i,
  noise_i = noise_i, 
  k_i = k_i) # To be passed on to Stan

stan_file <- write_stan_file("
// Signal detection theory model
//

// The input data 
data {
  
  int<lower=1> k_i; // n of participants
  int<lower=0> signal_i; // cases
  int<lower=0> hits_i[k_i]; // true positives
  int<lower=0> noise_i; // non-casees
  int<lower=0> falsealarms_i[k_i]; // false positives
  
}

// The parameters accepted by the model.
parameters {
  
  vector[k_i] criterion_i;
  real criterionprior;
  
  vector[k_i] discriminability_i;
  real discriminabilityprior;
  
  real mu_crit_i;
  real mu_disc_i;
  real muprior;
  
  real<lower=0> sigma_crit_i;
  real<lower=0> sigma_disc_i;
  real<lower=0> sigmaprior;
  
}

transformed parameters{
  real<lower=0,upper=1> theta_h_i[k_i];
  real<lower=0,upper=1> theta_f_i[k_i];
  
  real<lower=0,upper=1> theta_h_prior;
  real<lower=0,upper=1> theta_f_prior;
  
  // Reparameterization Using Equal-Variance Gaussian SDT
  theta_h_prior = Phi(discriminabilityprior / 2 - criterionprior);
  theta_f_prior = Phi(-discriminabilityprior / 2 - criterionprior);
    
  for(i in 1:k_i) {
    theta_h_i[i] = Phi(discriminability_i[i] / 2 - criterion_i[i]);
    theta_f_i[i] = Phi(-discriminability_i[i] / 2 - criterion_i[i]);
  }
  
}

// The model to be estimated. 
model {

  // These Priors over Discriminability and Bias Correspond 
  // to Uniform Priors over the Hit and False Alarm Rates
  mu_crit_i ~ normal(0, 5);
  mu_disc_i ~ normal(0, 5);
  muprior ~ normal(0, 5);
  
  sigma_crit_i ~ normal(0, 3);
  sigma_disc_i ~ normal(0, 3);
  sigmaprior ~ normal(0, 3);
  
  discriminability_i ~ normal(mu_disc_i, sigma_disc_i);
  discriminabilityprior ~ normal(muprior, sigmaprior);
  criterion_i ~ normal(mu_crit_i, sigma_crit_i);
  criterionprior ~ normal(muprior, sigmaprior);
  
  // Observed counts
  hits_i ~ binomial(signal_i, theta_h_i);
  falsealarms_i ~ binomial(noise_i, theta_f_i);
  
}")
  
mod <- cmdstan_model(stan_file, cpp_options = list(stan_threads = TRUE), pedantic = TRUE)

samples <- mod$sample(
  data = data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  threads_per_chain = 2,
  iter_warmup = 1000,
  iter_sampling = 1000,
  refresh = 100,
  max_treedepth = 20,
  adapt_delta = 0.99,
)

# Some basic quality checks of prior / posterior updates (was the prior too narrow/broad/badly positioned?)

draws_df <- as_draws_df(samples$draws()) 

ggplot(draws_df)+
  geom_density(aes(muprior), fill="red", alpha=0.3) +
  geom_density(aes(`mu_crit_i`), fill="blue", alpha=0.3) +
  geom_density(aes(`mu_disc_i`), fill="green", alpha=0.3) +
  theme_classic()

ggplot(draws_df)+
  geom_density(aes(sigmaprior), fill="red", alpha=0.3) +
  geom_density(aes(`sigma_crit_i`), fill="blue", alpha=0.3) +
  geom_density(aes(`sigma_disc_i`), fill="green", alpha=0.3) +
  theme_classic()

ggplot(draws_df)+
  geom_density(aes(criterionprior), fill="red", alpha=0.3) +
  geom_density(aes(`criterion_i[1]`), fill="blue", alpha=0.3) +
  geom_density(aes(`criterion_i[2]`), fill="blue", alpha=0.3) +
  geom_density(aes(`criterion_i[3]`), fill="blue", alpha=0.3) +
  theme_classic()

ggplot(draws_df)+
  geom_density(aes(`discriminabilityprior`), fill="red", alpha=0.3) +
  geom_density(aes(`discriminability_i[1]`), fill="blue", alpha=0.3) +
  geom_density(aes(`discriminability_i[2]`), fill="blue", alpha=0.3) +
  geom_density(aes(`discriminability_i[3]`), fill="blue", alpha=0.3) +
  theme_classic()

## Now quality checks of joint distributions against the prior

ggplot(draws_df)+
  geom_point(aes(criterionprior, sample(discriminabilityprior)), color="red", alpha=0.05) + # sample() is needed because of how I generated the prior samples (badly)
  geom_point(aes(`criterion_i[1]`, `discriminability_i[1]`), color="blue", alpha=0.1) +
  geom_point(aes(`criterion_i[2]`, `discriminability_i[2]`), color="green", alpha=0.1) +
  geom_point(aes(`criterion_i[3]`, `discriminability_i[3]`), color="purple", alpha=0.1) +
  theme_classic()

ggplot(draws_df)+
  geom_point(aes(muprior, sigmaprior), color="red", alpha=0.05) +
  geom_point(aes(`mu_crit_i`, `sigma_crit_i`), color="green", alpha=0.1) +
  geom_point(aes(`mu_disc_i`, `sigma_disc_i`), color="blue", alpha=0.1) + 
  theme_classic()

ggplot(draws_df)+
  geom_point(aes(muprior, sample(muprior)), color="red", alpha=0.05) +
  geom_point(aes(`mu_crit_i`, `mu_disc_i`), color="green", alpha=0.1) +
  theme_classic()

ggplot(draws_df)+
  geom_point(aes(sigmaprior, sample(sigmaprior)), color="red", alpha=0.05) +
  geom_point(aes(`sigma_crit_i`, `sigma_disc_i`), color="green", alpha=0.1) +
  theme_classic()


# Extract diagnostics for divergences

samples$cmdstan_diagnose() # summary
diagnostics_df <- as_draws_df(samples$sampler_diagnostics())
print(diagnostics_df)

draws_df$divergence=diagnostics_df$divergent__
x <- draws_df[draws_df$divergence==1,]


ggplot(draws_df)+
  geom_point(aes(`mu_crit_i`, `sigma_crit_i`), alpha=0.1) +
  geom_point(data=x, aes(`mu_crit_i`, `sigma_crit_i`), color="red") +
  theme_classic()

ggplot(draws_df)+
  geom_point(aes(`mu_disc_i`, `sigma_disc_i`), alpha=0.1) +
  geom_point(data=x, aes(`mu_disc_i`, `sigma_disc_i`), color="red") +
  theme_classic()

ggplot(draws_df)+
  geom_point(aes(`mu_crit_i`, `mu_disc_i`), alpha=0.1) +
  geom_point(data=x, aes(`mu_crit_i`, `mu_disc_i`), color="red") +
  theme_classic()


## Trace plots
ggplot(draws_df, aes(.iteration, sigma_disc_i, color=as.factor(.chain))) +
  geom_line() +
  theme_classic()
ggplot(draws_df, aes(.iteration, sigma_crit_i, color=as.factor(.chain))) +
  geom_line() +
  theme_classic()

ggplot(draws_df, aes(.iteration, mu_disc_i, color=as.factor(.chain))) +
  geom_line() +
  theme_classic()
ggplot(draws_df, aes(.iteration, mu_crit_i, color=as.factor(.chain))) +
  geom_line() +
  theme_classic()

## Plots
ggplot(draws_df) +
  geom_point(aes(`mu_disc_i`, `mu_crit_i`), color="red", alpha=0.1) +
  geom_point(aes(`mu_disc_d`, `mu_crit_d`), color="blue", alpha=0.1) +
  geom_rug(aes(`mu_disc_i`, `mu_crit_i`), color="red", size=0.1, alpha=0.1) +
  geom_rug(aes(`mu_disc_d`, `mu_crit_d`), color="blue", size=0.1, alpha=0.1) +
  theme_classic()

The alternative (unsuccessful) parameterisation is:

//
// Alternative parameterisation with multiplicative rescaling of sigma
//

// The input data 
data {
  
  int<lower=1> k_i; // n of participants
  int<lower=0> signal_i; // cases
  int<lower=0> hits_i[k_i]; // true positives
  int<lower=0> noise_i; // non-casees
  int<lower=0> falsealarms_i[k_i]; // false positives
  
}

// The parameters accepted by the model..
parameters {
  
  real<lower=0,upper=1> xic_i;
  real<lower=0,upper=1> xid_i;
  real<lower=0,upper=1> xi_prior;
  vector[k_i] deltac_i;
  vector[k_i] deltad_i;
  real deltaprior;
  
  real mu_crit_i;
  real mu_disc_i;
  real muprior;
  
  real<lower=0> sigma_crit_i_new;
  real<lower=0> sigma_disc_i_new;
  real<lower=0> sigma_prior_new;
  
}

transformed parameters{
  
  vector[k_i] criterion_i;
  vector[k_i] discriminability_i;
  real discriminabilityprior;
  real criterionprior;
  
  real<lower=0> sigma_crit_i;
  real<lower=0> sigma_disc_i;
  real<lower=0> sigmaprior;
  
  real<lower=0,upper=1> theta_h_i[k_i];
  real<lower=0,upper=1> theta_f_i[k_i];
  
  real<lower=0,upper=1> theta_h_prior;
  real<lower=0,upper=1> theta_f_prior;
  
  sigma_crit_i = fabs(xic_i) * sigma_crit_i_new;
  sigma_disc_i = fabs(xid_i) * sigma_disc_i_new;
  sigmaprior = fabs(xi_prior) * sigma_prior_new;
  
  criterion_i = mu_crit_i + xic_i * deltac_i;
  discriminability_i = mu_disc_i + xid_i * deltad_i;
  
  criterionprior = muprior + xi_prior * deltaprior;
  discriminabilityprior = muprior + xi_prior * deltaprior;
  
  
  // Reparameterization Using Equal-Variance Gaussian SDT
  theta_h_prior = Phi(discriminabilityprior / 2 - criterionprior);
  theta_f_prior = Phi(-discriminabilityprior / 2 - criterionprior);
    
  for(i in 1:k_i) {
    theta_h_i[i] = Phi(discriminability_i[i] / 2 - criterion_i[i]);
    theta_f_i[i] = Phi(-discriminability_i[i] / 2 - criterion_i[i]);
  }
  
}

// The model to be estimated. 
model {

  // These Priors over Discriminability and Bias Correspond 
  // to Uniform Priors over the Hit and False Alarm Rates
  mu_crit_i ~ normal(0, 5);
  mu_disc_i ~ normal(0, 5);
  muprior ~ normal(0, 5);
  
  sigma_crit_i_new ~ normal(0, 3);
  sigma_disc_i_new ~ normal(0, 3);
  sigma_prior_new ~ normal(0, 3);
  
  xic_i ~ beta(1, 1);  // can be removed
  xid_i ~ beta(1, 1);  // can be removed
  xi_prior ~ beta(1, 1);  // can be removed
  
  deltac_i ~ normal(0, sigma_crit_i_new);
  deltad_i ~ normal(0, sigma_disc_i_new);
  deltaprior ~ normal(0, sigma_prior_new);
  
  // Observed counts
  hits_i ~ binomial(signal_i, theta_h_i);
  falsealarms_i ~ binomial(noise_i, theta_f_i);
}

Hi,
The fist model works if one moves the parameters that are not used for estimation to the generated quantities. I also restricted some priors that seem excessively weak, if you move from normal space to the probability with Phi(), any number below 3 would essentially be ~1, and any number below -3, will be ~0.
In general, you shouldn’t have parameters that are not informed by data (unless you don’t have data at all). The generated quantities section can help you to generate predictions, I did it below.
Another thing, just in case, be aware that if you don’t use the _lpdf/_lpmf notation you cannot use Bayes Factor with this model.

// Signal detection theory model
//

// The input data
data {

  int<lower=1> k_i; // n of participants
  int<lower=0> signal_i; // cases
  int<lower=0> hits_i[k_i]; // true positives
  int<lower=0> noise_i; // non-casees
  int<lower=0> falsealarms_i[k_i]; // false positives

}

// The parameters accepted by the model.
parameters {

  vector[k_i] criterion_i;

  vector[k_i] discriminability_i;

  real mu_crit_i;
  real mu_disc_i;

  real<lower=0> sigma_crit_i;
  real<lower=0> sigma_disc_i;

}

transformed parameters{
  real<lower=0,upper=1> theta_h_i[k_i];
  real<lower=0,upper=1> theta_f_i[k_i];

    for(i in 1:k_i) {
    theta_h_i[i] = Phi(discriminability_i[i] / 2 - criterion_i[i]);
    theta_f_i[i] = Phi(-discriminability_i[i] / 2 - criterion_i[i]);
  }

}

// The model to be estimated.
model {

  // These Priors over Discriminability and Bias Correspond
  // to Uniform Priors over the Hit and False Alarm Rates
  mu_crit_i ~ normal(0, 2);
  mu_disc_i ~ normal(0, 2);

  sigma_crit_i ~ normal(0, 3);
  sigma_disc_i ~ normal(0, 3);

  discriminability_i ~ normal(mu_disc_i, sigma_disc_i);
  criterion_i ~ normal(mu_crit_i, sigma_crit_i);

  // Observed counts
  hits_i ~ binomial(signal_i, theta_h_i);
  falsealarms_i ~ binomial(noise_i, theta_f_i);

}
generated quantities {

  real muprior;
  real<lower=0> sigmaprior=0;
  real discriminabilityprior;
  real criterionprior;
  real<lower=0,upper=1> theta_h_prior;
  real<lower=0,upper=1> theta_f_prior;

   muprior = normal_rng(0, 5);
  // not-that nice way to sample from a truncated:
   while(sigmaprior <=0 )
     sigmaprior = normal_rng(0, 3);

   discriminabilityprior = normal_rng(muprior, sigmaprior);
   criterionprior = normal_rng(muprior, sigmaprior);
  // Reparameterization Using Equal-Variance Gaussian SDT
  theta_h_prior = Phi(discriminabilityprior / 2 - criterionprior);
  theta_f_prior = Phi(-discriminabilityprior / 2 - criterionprior);



}
3 Likes

thanks! to be clear for the _lpdf and lpmf you mean the binomial function, right?

yes, in this case everything is _lpdf except for binomial_lpmf.