I am trying a new reinforcement learning model in which the learning rate changes with the reward the agent received on trial t. The update rule is a(t) = gamma*(reward(t) - sum(v)) + (1 - gamma)*a(t-1), where gamma is a weight factor that determines how much external feedback the agent takes into account in the learning process.
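In the Stan code the same rule is applied to the learning rate lr, except that the prospect-theory utility of the chosen option replaces the raw reward and the feedback term is taken in absolute value (this is my restatement of the update line lr = gamma[s]*fabs(lamda) + (1 - gamma[s])*lr in the model block below):

$$ \mathrm{lr}_t \;=\; \gamma\,\bigl|\,U_{c_t} - \textstyle\sum_k v_k\,\bigr| \;+\; (1-\gamma)\,\mathrm{lr}_{t-1} $$

The full Stan code is below: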
data {
  int<lower=1> nSubjects;
  int<lower=1> nTrials;
  int<lower=1,upper=4> choice[nSubjects, nTrials];            // chosen option (1-4) per subject and trial
  real<lower=-1150, upper=100> reward[nSubjects, nTrials];    // received reward per subject and trial
  real<lower=-1150, upper=100> y_reward[4, nTrials];          // payoff schedule per option, indexed 1-4 in generated quantities
}
transformed data {
  vector[4] initV; // initial values for V
  vector[4] initu;
  real initlr;
  initV = rep_vector(0.0, 4);
  initu = rep_vector(0.0, 4);
  initlr = 0.0;
}
parameters {
  real<lower=0,upper=5> c[nSubjects];              // consistency (softmax sensitivity) parameter
  real<lower=0,upper=5> loss_aversion[nSubjects];  // loss aversion
  real<lower=0,upper=1> A[nSubjects];              // outcome sensitivity (utility shape)
  real<lower=0,upper=1> gamma[nSubjects];          // weight on external feedback in the learning-rate update
}
model {
  for (s in 1:nSubjects) {
    vector[4] v;      // expected value of each option
    vector[4] U;      // subjective utility of the received reward
    real lr;          // trial-wise learning rate
    real pe;          // prediction error
    real theta;       // softmax sensitivity
    real lamda;       // feedback term that drives the learning rate
    v = initV;
    U = initu;
    lr = initlr;
    theta = pow(3, c[s]) - 1;
    for (t in 1:nTrials) {
      // prospect-theory utility of the obtained reward
      if (reward[s,t] >= 0) {
        U[choice[s,t]] = pow(reward[s,t], A[s]);
      } else {
        U[choice[s,t]] = -loss_aversion[s] * pow(-reward[s,t], A[s]);
      }
      // delta-rule value update with the current learning rate
      pe = U[choice[s,t]] - v[choice[s,t]];
      v[choice[s,t]] = v[choice[s,t]] + lr * pe;
      // feedback-dependent learning-rate update
      lamda = U[choice[s,t]] - sum(v);
      lr = gamma[s] * fabs(lamda) + (1 - gamma[s]) * lr;
      // choice likelihood
      choice[s,t] ~ categorical_logit(theta * v);
    }
  }
}
generated quantities {
  int<lower=1,upper=4> y_pred[nSubjects, nTrials];           // posterior-predictive choices
  int<lower=0> dA[nSubjects];                                // counts of predicted choices of each option
  int<lower=0> dB[nSubjects];
  int<lower=0> dC[nSubjects];
  int<lower=0> dD[nSubjects];
  real log_lik[nSubjects];                                   // per-subject log likelihood
  real<lower=-1150,upper=100> p_reward[nSubjects, nTrials];  // rewards assigned to the predicted choices
  vector[4] initev;
  vector[4] initeu;
  real initelr;
  initev = rep_vector(0.0, 4);
  initeu = rep_vector(0.0, 4);
  initelr = 0.0;
  for (s in 1:nSubjects) {
    vector[4] v;
    vector[4] ev;
    vector[4] U;
    vector[4] eu;
    real y_pe;
    real pe;
    real theta;
    real lamda;
    real y_lamda;
    real lr;
    real elr;
    v = initV;
    ev = initev;
    U = initu;
    eu = initeu;
    lr = initlr;
    elr = initelr;
    dA[s] = 0;
    dB[s] = 0;
    dC[s] = 0;
    dD[s] = 0;
    log_lik[s] = 0;
    theta = pow(3, c[s]) - 1;
    for (t in 1:nTrials) {
      // draw a predicted choice and look up its reward in the payoff schedule
      y_pred[s,t] = categorical_logit_rng(theta * ev);
      if (y_pred[s,t] == 1) {
        dA[s] = dA[s] + 1;
        p_reward[s,t] = y_reward[1, dA[s]];
      } else if (y_pred[s,t] == 2) {
        dB[s] = dB[s] + 1;
        p_reward[s,t] = y_reward[2, dB[s]];
      } else if (y_pred[s,t] == 3) {
        dC[s] = dC[s] + 1;
        p_reward[s,t] = y_reward[3, dC[s]];
      } else {
        dD[s] = dD[s] + 1;
        p_reward[s,t] = y_reward[4, dD[s]];
      }
      // same updates as in the model block, driven by the actual choice and reward
      if (reward[s,t] >= 0) {
        U[choice[s,t]] = pow(reward[s,t], A[s]);
      } else {
        U[choice[s,t]] = -loss_aversion[s] * pow(-reward[s,t], A[s]);
      }
      pe = U[choice[s,t]] - v[choice[s,t]];
      v[choice[s,t]] = v[choice[s,t]] + lr * pe;
      lamda = U[choice[s,t]] - sum(v);
      lr = gamma[s] * fabs(lamda) + (1 - gamma[s]) * lr;
      log_lik[s] = log_lik[s] + categorical_logit_lpmf(choice[s,t] | theta * v);
      // parallel updates driven by the predicted choice and its reward
      if (p_reward[s,t] >= 0) {
        eu[y_pred[s,t]] = pow(p_reward[s,t], A[s]);
      } else {
        eu[y_pred[s,t]] = -loss_aversion[s] * pow(-p_reward[s,t], A[s]);
      }
      y_pe = eu[y_pred[s,t]] - ev[y_pred[s,t]];
      ev[y_pred[s,t]] = ev[y_pred[s,t]] + elr * y_pe;
      y_lamda = eu[y_pred[s,t]] - sum(ev);
      elr = gamma[s] * fabs(y_lamda) + (1 - gamma[s]) * elr;
    }
  }
}
But I run into this error when sampling:
Chain 1: Rejecting initial value:
Chain 1: Error evaluating the log probability at the initial value.
Chain 1: Exception: categorical_logit_lpmf: log odds parameter[2] is -inf, but must be finite! (in ‘modele87010bc1312_PVL_Delta_flexible_learning_Model’ at line 50)
The error points to this statement:
choice[s,t]~categorical_logit(theta*v);
I don’t know what is wrong with this expression. Can someone help me figure it out? Thanks!
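In case it helps with diagnosing, one thing I could do is add a print() statement right before the sampling statement in the model block, so the console shows on which trial theta * v stops being finite (this is only a debugging sketch, not part of the model itself):

      // debugging sketch: print the quantities entering the likelihood on every trial
      print("s=", s, " t=", t, " lr=", lr, " theta=", theta, " v=", v);
      choice[s,t] ~ categorical_logit(theta * v);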