Hi all,
I am relatively new to Stan and hierarchical modeling so forgive me for asking any dumb questions!
I am trying to implement a hierarchical version of a model of associative memory from the 60s (https://pdfs.semanticscholar.org/6d5d/6c726ab8d67dfd721e81aba8ccc5fcf44a67.pdf). Subjects study pairs of words (one in english and one in german) in a particular schedule of study opportunities and have to recall the english word after being cued by the german word at a later test time. The model is essentially a continuous-time HMM where subjects transition between three different memory states that have different probabilities of recall at test time. While the transition dynamics occur throughout the experiment (while the subjects are studying and forgetting the words) the only observable emission of the model (in this version of it) is at the later test time where the subject either recalls the word or not.
The main parameters are a set of 5 probabilities that govern the transition dynamics for each word pair. In this implementation, I also put hierarchical priors over each of these. However, this model is running very slowly (over an hour to get about 80 warmup samples). In the experiment, we use 45 word pairs so there are roughly 45 * 5 effective parameters to do inference over and there are 168 subjects (so T is 168 * 45 = 7560). I have three questions: is this a reasonable amount of time for a model like this, or are there any obvious ways to speed things up? In addition, using LBFGS, I end up with lots of “Error evaluating model log probability: Non-finite gradient”. Does this indicate a problem with the model? Or is it just because this is a kind of mixture model and it’s having trouble? Finally, ADVI seems to actually work reasonably well but I’m not sure whether to trust it. Are there any diagnostics, like Rhat, available for ADVI? Obviously I would like to do full Bayesian inference but for iterating through developing new models on top of this, the speed of ADVI/LBFGS is certainly nice.
I’ve included the code for the model below, and I’d really appreciate any suggestions on how to improve/fix the model.
Thank you for all your help!
David
data {
  int<lower=0> S; // number of states (3)
  int<lower=0> N; // number of subjects
  int<lower=0> T; // number of "trials" (subjects X words)
  int<lower=0> W; // number of words
  int<lower=0> L; // length of protocol (5 study sessions)
  real<lower=0> l[T, L]; // protocol (length of time between study sessions)
  // r is the outcome of a bernoulli observation, so it must be 0 or 1;
  // the upper bound makes Stan reject invalid data at load time instead of
  // failing inside bernoulli() during sampling.
  int<lower=0, upper=1> r[T]; // recalled (0 or 1)
  // sub and w are 1-based array indices; bounding them catches out-of-range
  // values at data load rather than as an indexing error mid-run.
  int<lower=1, upper=N> sub[T]; // subject on "trial" T
  int<lower=1, upper=W> w[T]; // word on "trial" T
  vector<lower=0, upper=1>[S] recall_prob; // probability of recall in each state
}
parameters {
// Population-level hyperparameters. Each beta prior below is parameterized
// by its mean (phi in (0,1)) and concentration (lambda > 0.1); they are
// converted to the usual alpha/beta in transformed parameters. The lower
// bound of 0.1 on each lambda matches the pareto(0.1, 1.5) hyperprior used
// in the model block.
//priors
real<lower=0,upper=1> f_phi;       // population mean of forgetting rate f
real<lower=0.1> f_lambda;          // concentration of the beta prior on f
real<lower=0,upper=1> g_phi;       // population mean of g
real<lower=0.1> g_lambda;          // concentration of the beta prior on g
real<lower=0,upper=1> x_phi;       // population mean of study transition x
real<lower=0.1> x_lambda;          // concentration of the beta prior on x
simplex[S] zy_phi;                 // population mean of the zy simplexes
real<lower=0.1> zy_kappa;          // concentration of the dirichlet prior on zy
// Word-level parameters (one per word pair).
real<lower=0, upper=1> f[W]; //forgetting transition from T(emporary) to U(nknown)
real<lower=0, upper=1> g[W]; //prior prob of being in P(ermanent)
real<lower=0, upper=1> x[W]; //study transition from T(emporary) to P(ermanent)
simplex[S] zy[W];                  // study-transition row out of the U(nknown) state
}
transformed parameters {
  vector<lower=0, upper=1>[T] p; //probability of recall at test
  matrix<lower=0, upper=1>[T, S] p_s; //state probabilities (columns: U, T, P)
  //prior reparameterization: convert mean/concentration (phi, lambda) of each
  //hyperprior into the standard beta(alpha, beta) / dirichlet(alpha) form
  real<lower=0> g_alpha;
  real<lower=0> g_beta;
  real<lower=0> f_alpha;
  real<lower=0> f_beta;
  real<lower=0> x_alpha;
  real<lower=0> x_beta;
  vector[S] zy_alpha;
  g_alpha = g_lambda * g_phi;
  g_beta = g_lambda * (1 - g_phi);
  f_alpha = f_lambda * f_phi;
  f_beta = f_lambda * (1 - f_phi);
  x_alpha = x_lambda * x_phi;
  x_beta = x_lambda * (1 - x_phi);
  zy_alpha = zy_kappa * zy_phi;
  for (t in 1:T) {
    //hoist the word-level parameters out of the session loop: they are
    //re-used 2*L times per trial, and repeated indexing of parameter
    //containers is relatively expensive in Stan
    real f_t = f[w[t]];
    real x_t = x[w[t]];
    vector[S] zy_t = zy[w[t]];
    //initial state: in P(ermanent) with prob g, otherwise U(nknown)
    p_s[t, ] = [1 - g[w[t]], 0, g[w[t]]];
    for (l_i in 1:L) {
      //probability of staying in T(emporary) across an interval of length
      //l[t, l_i]; computed once instead of twice per session
      real stay_T = (1 - f_t) ^ l[t, l_i];
      //study transitions
      //study_trans_mat (A) = [to_row_vector(zy[w[t]]),
      //[0, 1 - x[w[t]], x[w[t]]],
      //[0, 0, 1]];
      //NOTE: update order matters — state 3 uses the old states 1 and 2,
      //state 2 uses the old state 1, so states are updated 3, 2, 1.
      p_s[t, 3] = p_s[t, 3] + (p_s[t, 2] * x_t) + (p_s[t, 1] * zy_t[3]);
      p_s[t, 2] = (p_s[t, 2] * (1 - x_t)) + (p_s[t, 1] * zy_t[2]);
      p_s[t, 1] = p_s[t, 1] * zy_t[1];
      //forgetting transitions (continuous-time decay from T back to U)
      //forget_trans_mat (F) = [[1, 0, 0],
      //[f[w[t]], 1 - f[w[t]], 0],
      //[0, 0, 1]];
      p_s[t, 1] = p_s[t, 1] + p_s[t, 2] * (1 - stay_T);
      p_s[t, 2] = p_s[t, 2] * stay_T;
    }
  }
  //recall probs: mix the per-state recall probabilities by state occupancy
  p = p_s * recall_prob;
}
model {
  // --- Hyperpriors on the population-level parameters ---
  // Uniform priors on the beta-prior means; heavy-tailed pareto priors on
  // the concentrations, following BDA3 chapter 5.
  g_phi ~ beta(1, 1);
  f_phi ~ beta(1, 1);
  x_phi ~ beta(1, 1);
  g_lambda ~ pareto(0.1, 1.5); //BDA3 chapter 5
  f_lambda ~ pareto(0.1, 1.5); //BDA3 chapter 5
  x_lambda ~ pareto(0.1, 1.5); //BDA3 chapter 5
  zy_phi ~ dirichlet(rep_vector(1, S));
  zy_kappa ~ pareto(0.1, 1.5);
  // --- Word-level priors, partially pooled via the hyperparameters ---
  g ~ beta(g_alpha, g_beta);
  f ~ beta(f_alpha, f_beta);
  x ~ beta(x_alpha, x_beta);
  for (w_i in 1:W)
    zy[w_i] ~ dirichlet(zy_alpha);
  // --- Likelihood: recall outcome for each subject-word trial ---
  r ~ bernoulli(p);
}