Hi Mike,

You are right — I ran a Bayesian hierarchical reinforcement learning model. The following is my Stan code (in case you are interested); I employed both **non-centered parameterization** and **vectorized** coding.

```
data {
int<lower=1> N; // number of subjects
int<lower=1> T; // number of trials per subject
int<lower=1, upper=4> cue[N, T]; // condition per trial: 1=goToWin, 2=goToAvoid, 3=nogoToWin, 4=nogoToAvoid (per model-block comments)
int<lower=0, upper=1> action[N, T]; // response per trial: 1 = go, 0 = no-go
real outcome[N, T]; // feedback per trial; the model block compares it to -1/0/1, so it should be stored exactly as those values
}
transformed data {
// Zero starting value for all four cue conditions; used in the model block to
// reset each subject's action weights, Q-values, and stimulus values.
vector[4] initV = rep_vector(0.0, 4);
}
parameters {
// Group-level (hyper-)parameters, in the unconstrained "primitive" space.
// in total 6 parameters of interest:
// 1) xi     - lapse / irreducible noise (see the pGo mixture in the model block)
// 2) ep     - learning rate (Rescorla-Wagner step size)
// 3) b      - additive bias on the go action weight
// 4) pi     - weight on the stimulus value sv (Pavlovian bias)
// 5) rhoRew - reward sensitivity (scales positive feedback)
// 6) rhoPun - punishment sensitivity (scales negative feedback)
vector[6] mu_pr; // declare as vectors for vectorizing
vector<lower=0>[6] sigma;
// Subject-level raw deviations for the non-centered parameterization;
// mapped to the constrained scales in the transformed parameters block.
vector[N] xi_pr; // each element ~ N(0, 1)
vector[N] ep_pr;
vector[N] b_pr;
vector[N] pi_pr;
vector[N] rhoRew_pr;
vector[N] rhoPun_pr;
}
transformed parameters {
// Subject-level parameters on their natural scales, built from the group-level
// location (mu_pr), scale (sigma), and standardized deviations (*_pr) via the
// non-centered parameterization.
vector<lower=0, upper=0.2>[N] xi; // lapse, bounded between [0, 0.2]
vector<lower=0, upper=1>[N] ep; // learning rate, bounded between [0, 1]
vector[N] b; // go bias, unbounded, namely (-Inf, Inf)
vector[N] pi; // Pavlovian weight, unbounded; NOTE: shadows Stan's built-in pi() function
vector<lower=0>[N] rhoRew; // reward sensitivity, bounded as [0, Inf)
vector<lower=0>[N] rhoPun; // punishment sensitivity, bounded as [0, Inf)
// vectorization and non-centered parameterization
xi = Phi_approx(mu_pr[1] + sigma[1] * xi_pr)*0.2;
ep = Phi_approx(mu_pr[2] + sigma[2] * ep_pr);
b = mu_pr[3] + sigma[3] * b_pr;
pi = mu_pr[4] + sigma[4] * pi_pr;
// log1p_exp(x) == log(1 + exp(x)) (softplus), but it does not overflow when
// the argument is large, unlike the hand-written log(1 + exp(x)).
rhoRew = log1p_exp(mu_pr[5] + sigma[5] * rhoRew_pr);
rhoPun = log1p_exp(mu_pr[6] + sigma[6] * rhoPun_pr);
}
model {
// Orthogonalized go/no-go learning model: per trial the go action weight is
// Q(go) + go bias b + pi * stimulus value, the no-go weight is Q(no-go);
// the choice probability is a lapse-contaminated logistic of the weight
// difference, and the stimulus value and the chosen action's Q-value then
// receive Rescorla-Wagner updates scaled by rhoRew (reward) or rhoPun (punishment).
// priors for these 6 hyper-parameters (group-level locations)
mu_pr[1] ~ normal(0, 1.0); // xi
mu_pr[2] ~ normal(0, 1.0); // ep
mu_pr[3] ~ normal(0, 2.0); // b
mu_pr[4] ~ normal(0, 2.0); // pi
mu_pr[5] ~ normal(0, 1.0); // rhoRew
mu_pr[6] ~ normal(0, 1.0); // rhoPun
// priors for the group-level scales (half-normal / half-Cauchy via the <lower=0> bound)
sigma[1] ~ normal(0, 0.2); // xi
sigma[2] ~ normal(0, 0.2); // ep
sigma[3] ~ cauchy(0, 1.0); // b
sigma[4] ~ cauchy(0, 1.0); // pi
sigma[5] ~ normal(0, 0.2); // rhoRew
sigma[6] ~ normal(0, 0.2); // rhoPun
// Matt trick: standard-normal priors on the raw subject-level deviations
xi_pr ~ normal(0, 1.0);
ep_pr ~ normal(0, 1.0);
b_pr ~ normal(0, 1.0);
pi_pr ~ normal(0, 1.0);
rhoRew_pr ~ normal(0, 1.0);
rhoPun_pr ~ normal(0, 1.0);
for (s in 1:N) {
// 1) In total there are 4 training conditions (cues).
// 2) Under each condition subjects have two alternative action choices: go and no-go.
// 3) So, for each action per condition, we track its action weight and q-value.
vector[4] actionWeight_go; // per-cue action weight of "go"
vector[4] actionWeight_nogo; // per-cue action weight of "no-go"
vector[4] qValue_go; // per-cue instrumental value of "go"
vector[4] qValue_nogo; // per-cue instrumental value of "no-go"
vector[4] sv; // per-cue (Pavlovian) stimulus value
vector[4] pGo; // NOTE(review): only the current trial's element is ever read; a scalar would suffice
// reset all learned quantities at the start of each subject
actionWeight_go = initV;
actionWeight_nogo = initV;
qValue_go = initV;
qValue_nogo = initV;
sv = initV;
for (t in 1:T) {
actionWeight_go[cue[s, t]] = qValue_go[cue[s, t]] + b[s] + pi[s] * sv[cue[s, t]];
actionWeight_nogo[cue[s, t]] = qValue_nogo[cue[s, t]];
// the probability of making go responses
pGo[cue[s,t]] = inv_logit(actionWeight_go[cue[s, t]] - actionWeight_nogo[cue[s,t]]);
// lapse mixture: with probability xi the response is uniform (xi/2 to each action);
// the bare braces below are redundant (no local variables are introduced)
{
pGo[cue[s, t]] *= (1 - xi[s]);
pGo[cue[s, t]] += (xi[s]/2);
}
// Likelihood function (bernoulli, not bernoulli_logit, because of the lapse mixture above)
action[s, t] ~ bernoulli(pGo[cue[s, t]]);
// update the stimulus value
// NOTE(review): the four cue branches below are pairwise identical (1==3, 2==4);
// moreover when outcome == 0 the rho term is zero either way, so the whole chain
// collapses to "rhoRew if outcome >= 0 else rhoPun". Also, these are exact float
// comparisons on a real — safe only if outcomes are stored exactly as -1/0/1
// (any other outcome value silently skips the update).
if (cue[s, t] ==1) { // goToWin (0 or 1)
if (outcome[s, t] == 1) {
sv[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - sv[cue[s, t]]);
} else if (outcome[s, t] ==0) {
sv[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - sv[cue[s, t]]);
}
} else if (cue[s, t] == 2) { // goToAvoid (-1 or 0)
if (outcome[s, t] == 0) {
sv[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - sv[cue[s, t]]);
} else if (outcome[s, t] == -1) {
sv[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - sv[cue[s, t]]);
}
} else if (cue[s, t] ==3) { // nogoToWin (0 or 1)
if (outcome[s, t] == 1) {
sv[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - sv[cue[s, t]]);
} else if (outcome[s, t] ==0) {
sv[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - sv[cue[s, t]]);
}
} else if (cue[s, t] == 4) { // nogoToAvoid (-1 or 0)
if (outcome[s, t] == 0) {
sv[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - sv[cue[s, t]]);
} else if (outcome[s, t] == -1) {
sv[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - sv[cue[s, t]]);
}
}
// update the action values
// NOTE(review): same duplication as above — the chosen action's Q-value (go if
// action == 1, no-go if action == 0) gets an RW update, with rhoRew for the
// better outcome and rhoPun for the worse one; the cue branches only differ in
// which outcome values they match.
if (cue[s, t] == 1) { // goToWin (0 or 1)
if (action[s, t] == 1 && outcome[s, t] == 1) {
qValue_go[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 1 && outcome[s, t] == 0) {
qValue_go[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == 1) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoRew[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == 0) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoPun[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
}
} else if (cue[s, t] == 2) { // goToAvoid (-1 or 0)
if (action[s, t] == 1 && outcome[s, t] == 0) {
qValue_go[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 1 && outcome[s, t] == -1) {
qValue_go[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == 0) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoRew[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == -1) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoPun[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
}
} else if (cue[s, t] == 3) { // nogoToWin (0 or 1)
if (action[s, t] == 1 && outcome[s, t] == 1) {
qValue_go[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 1 && outcome[s, t] == 0) {
qValue_go[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == 1) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoRew[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == 0) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoPun[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
}
} else if (cue[s, t] == 4) { // nogoToAvoid (-1 or 0)
if (action[s, t] == 1 && outcome[s, t] == 0) {
qValue_go[cue[s, t]] += ep[s] * (rhoRew[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 1 && outcome[s, t] == -1) {
qValue_go[cue[s, t]] += ep[s] * (rhoPun[s] * outcome[s, t] - qValue_go[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == 0) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoRew[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
} else if (action[s, t] == 0 && outcome[s, t] == -1) {
qValue_nogo[cue[s, t]] += ep[s] * (rhoPun[s] *outcome[s, t] - qValue_nogo[cue[s, t]]);
}
}
} // end of the loop for the trial-level
} // end of the loop for the subject-level
}
generated quantities {
// Group-level means on the model's natural scales: push the group-level
// locations through the same transforms used for the subject-level parameters
// in the transformed parameters block (with the subject deviation set to 0).
real<lower=0, upper=0.2> mu_xi; // group-mean lapse
real<lower=0, upper=1> mu_ep; // group-mean learning rate
real mu_b; // group-mean go bias
real mu_pi; // group-mean Pavlovian weight
real<lower=0> mu_rhoRew; // group-mean reward sensitivity
real<lower=0> mu_rhoPun; // group-mean punishment sensitivity
mu_xi = Phi_approx(mu_pr[1])*0.2;
mu_ep = Phi_approx(mu_pr[2]);
mu_b = mu_pr[3];
mu_pi = mu_pr[4];
// log1p_exp(x) == log(1 + exp(x)) (softplus), numerically stable for large x.
mu_rhoRew = log1p_exp(mu_pr[5]);
mu_rhoPun = log1p_exp(mu_pr[6]);
}
```

To be honest, I haven’t tried the centered parameterization, because I intuitively thought the non-centered parameterization would give me more efficient sampling for the hierarchical model. I will try the centered parameterization soon and let you know what happens. Also, if you have any suggestions for improving/optimizing the code, that would be great.

Best,

Kim