Hi all,

I’m a newcomer to Stan and I’m currently trying to use it for reinforcement-learning modeling. I’ve run into a problem: the fitted inverse temperature (tau) comes out extremely small!

Specifically:

(1) In the softmax choice function, the inverse temperature (tau) represents choice consistency. A higher tau indicates that subjects’ choices rely more on the value.

(2) Based on feedback from participants and GLMM analysis, it is clear that choices are indeed influenced by value. Therefore, I should expect to capture an appropriate tau value from the model.

(3) I have set the range for tau to be [0, 10]. However, the final fitted tau is significantly on the lower side, for example, 0.1. I’m quite puzzled and would like to seek your advice on where the issue might be.

(4) Current findings and attempts: the group-level mean for tau (tau_mu_raw) is estimated to be negative. I tried refitting with “lower=0” on that parameter, but the model then runs into convergence issues (Rhat > 1.1).

(5) Below is my code, kept as simple as possible for readability; it is mainly meant to illustrate the modeling approach. I would greatly appreciate any other methods or references.

Thank you in advance for your assistance!

*(Main Reference: lei-zhang (Dr. Lei Zhang) · GitHub)*

*(Code Reference: Testosterone eliminates strategic prosocial behavior through impacting choice consistency in healthy males | Neuropsychopharmacology )*

**Stan Code:**

```
data {
  int<lower=1> nSubjects; // number of subjects
  int<lower=1> nTrials;   // number of trials per subject
  int<lower=1,upper=99> Choice[nSubjects,nTrials]; // 1 = Left Choice, 2 = Right Choice, 99 = Missing
  real<lower=0> Reward_A[nSubjects,nTrials]; // experiment reward feedback A (RANGE:[1,250])
  real<lower=0> Reward_B[nSubjects,nTrials]; // experiment reward feedback B (RANGE:[1,250])
  // NOTE(review): rewards are kept on their raw [1,250] scale. Because the softmax
  // logit below is tau * EV, and EV is built directly from these rewards, tau must
  // shrink toward ~0 to keep the logits reasonable. Rescaling the rewards (e.g.
  // dividing by 250) would make tau interpretable on its intended [0,10] range.
}
transformed data {
  // All value traces start at zero for both options (index 1 = left, 2 = right).
  vector[2] RAi = rep_vector(0,2); // initial expected values learned from Reward_A
  vector[2] RBi = rep_vector(0,2); // initial expected values learned from Reward_B
  vector[2] EVi = rep_vector(0,2); // initial combined value, where EV = w * RA + (1-w) * RB
}
parameters {
  // Group-level means on the unconstrained (probit) scale; they are mapped to
  // their natural ranges via Phi_approx in the transformed parameters block.
  real alphaRA_mu_raw; // learning rate for Reward_A
  real alphaRB_mu_raw; // learning rate for Reward_B
  real w_mu_raw;       // weighting parameter between Reward_A and Reward_B
  real tau_mu_raw;     // inverse temperature in softmax choice function
  // Group-level between-subject SDs on the raw scale.
  real<lower=0> alphaRA_sd_raw;
  real<lower=0> alphaRB_sd_raw;
  real<lower=0> w_sd_raw;
  real<lower=0> tau_sd_raw;
  // Individual-level offsets in raw space, standard normal a priori
  // (non-centered parameterization).
  vector[nSubjects] alphaRA_raw;
  vector[nSubjects] alphaRB_raw;
  vector[nSubjects] w_raw;
  vector[nSubjects] tau_raw;
}
transformed parameters {
  // Subject-level parameters on their constrained, interpretable scales.
  vector<lower=0,upper=1>[nSubjects] alphaRA;  // learning rate for Reward_A
  vector<lower=0,upper=1>[nSubjects] alphaRB;  // learning rate for Reward_B
  vector<lower=0,upper=1>[nSubjects] w;        // weight on Reward_A vs Reward_B
  vector<lower=0,upper=10>[nSubjects] tau;     // softmax inverse temperature

  // Non-centered parameterization: mu + sd * z, squashed elementwise into (0,1)
  // by the vectorized Phi_approx (tau is additionally stretched to (0,10)).
  alphaRA = Phi_approx(alphaRA_mu_raw + alphaRA_sd_raw * alphaRA_raw);
  alphaRB = Phi_approx(alphaRB_mu_raw + alphaRB_sd_raw * alphaRB_raw);
  w       = Phi_approx(w_mu_raw + w_sd_raw * w_raw);
  tau     = Phi_approx(tau_mu_raw + tau_sd_raw * tau_raw) * 10;
}
model {
  // --- priors ---
  // group-level means on the unconstrained scale
  alphaRA_mu_raw ~ std_normal();
  alphaRB_mu_raw ~ std_normal();
  w_mu_raw ~ std_normal();
  tau_mu_raw ~ std_normal();
  // group-level SDs: half-Cauchy(0,1) via the lower=0 constraint
  alphaRA_sd_raw ~ cauchy(0,1);
  alphaRB_sd_raw ~ cauchy(0,1);
  w_sd_raw ~ cauchy(0,1);
  tau_sd_raw ~ cauchy(0,1);
  // subject-level standardized offsets (non-centered parameterization)
  alphaRA_raw ~ std_normal();
  alphaRB_raw ~ std_normal();
  w_raw ~ std_normal();
  tau_raw ~ std_normal();
  // --- likelihood: per-subject delta-rule learning with softmax choice ---
  for (s in 1:nSubjects){
    vector[2] RA = RAi; // running value estimates learned from Reward_A
    vector[2] RB = RBi; // running value estimates learned from Reward_B
    vector[2] EV = EVi; // combined value that drives the choice
    for (t in 1:nTrials) {
      if (Choice[s,t] != 99) { // missing trials contribute nothing to the likelihood
        // NOTE(review): EV inherits the raw reward scale (rewards go up to 250), so
        // tau[s] * EV can reach the thousands; the sampler compensates by driving tau
        // toward 0 (hence the fitted ~0.1). Rescaling the rewards (e.g. to [0,1])
        // would let tau use its intended [0,10] range.
        Choice[s,t] ~ categorical_logit(tau[s] * EV); // softmax choice function
        // NOTE(review): EV is recomputed from the PRE-update RA/RB, so the value that
        // drives the NEXT choice lags the reward update by one trial — confirm intended.
        EV[Choice[s,t]] = w[s] * RA[Choice[s,t]] + (1-w[s]) * RB[Choice[s,t]]; // weighted combined value
        RA[Choice[s,t]] += alphaRA[s] * (Reward_A[s,t] - RA[Choice[s,t]]); // delta-rule update: Reward_A
        RB[Choice[s,t]] += alphaRB[s] * (Reward_B[s,t] - RB[Choice[s,t]]); // delta-rule update: Reward_B
      } // end of Choice
    } // end of t
  } // end of s
} // end of model
generated quantities {
  // Group-level parameters mapped back to their natural scales.
  real<lower=0,upper=1> alphaRA_mu;
  real<lower=0,upper=1> alphaRB_mu;
  real<lower=0,upper=1> w_mu;
  real<lower=0,upper=10> tau_mu;
  real log_lik[nSubjects]; // per-subject choice log likelihood (e.g. for LOO/WAIC)
  int y_pred[nSubjects, nTrials] = rep_array(99, nSubjects, nTrials); // posterior-predictive choices; 99 marks missing trials
  // convert group-level means to the actual parameter space
  alphaRA_mu = Phi_approx(alphaRA_mu_raw);
  alphaRB_mu = Phi_approx(alphaRB_mu_raw);
  w_mu = Phi_approx(w_mu_raw);
  tau_mu = Phi_approx(tau_mu_raw)*10;
  // Replay the learning process to accumulate log-likelihood and simulate choices.
  // NOTE(review): this loop must stay in lock-step with the model block; any change
  // to the update rule there has to be mirrored here.
  {
    for (s in 1:nSubjects){
      vector[2] RA = RAi; // running value estimates learned from Reward_A
      vector[2] RB = RBi; // running value estimates learned from Reward_B
      vector[2] EV = EVi; // combined value that drives the choice
      log_lik[s]=0;
      for (t in 1:nTrials) {
        if (Choice[s,t] != 99) { // missing trials: skipped, y_pred stays 99
          log_lik[s] += categorical_logit_lpmf(Choice[s,t] | tau[s] * EV); // choice log likelihood
          y_pred[s,t] = categorical_logit_rng(tau[s] * EV); // simulated choice given current values
          EV[Choice[s,t]] = w[s] * RA[Choice[s,t]] + (1-w[s]) * RB[Choice[s,t]]; // weighted combined value
          RA[Choice[s,t]] += alphaRA[s] * (Reward_A[s,t] - RA[Choice[s,t]]); // delta-rule update: Reward_A
          RB[Choice[s,t]] += alphaRB[s] * (Reward_B[s,t] - RB[Choice[s,t]]); // delta-rule update: Reward_B
        } // end of Choice
      } // end of t
    } // end of s
  } // end of local block
} // end of generated quantities
```