Near-zero reward learning rate in a hierarchical RL model

Hello everyone,

I have a hierarchical RL model based on the Rescorla-Wagner (R-W) delta rule. Of all the models I have tested, LOOIC identifies this one as the best fit for my data: two learning rates (one for reward, one for punishment) and one sensitivity parameter.
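
To make the structure explicit, each trial in the code below is modelled as (writing $\rho$ for the sensitivity parameter, $\alpha$ for the learning rates, $o_t$ for the outcome, and $EV$ for `ev`):

$$
P(a_t = k) = \frac{\exp(EV_t[k])}{\exp(EV_t[1]) + \exp(EV_t[2])}, \qquad
\delta_t = \rho \, o_t - EV_t[a_t], \qquad
EV_{t+1}[a_t] = EV_t[a_t] + \alpha \, \delta_t,
$$

with $\alpha = \alpha_{pun}$ when $o_t = -1$ and $\alpha = \alpha_{rew}$ when $o_t \in \{0, 1\}$.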

All the diagnostics for this model look good. However, in one of the conditions the reward learning rate is estimated very close to zero while the sensitivity is still high, which seems strange to me.
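
(For instance, if $\alpha_{rew} \approx 0.01$ and the sensitivity is around 0.9, a rewarded trial moves the chosen option's value by only about 0.01 × 0.9 = 0.009, so the expected values barely leave zero and the predicted choice probabilities stay close to 50/50; those numbers are just made up for illustration.)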

I was wondering if there are any obvious errors in my model, since this is my first time writing RL models.

data {
  int<lower=1> N_subjects;
  int<lower=1> N_trials;
  int<lower=1> Maxtrial; // max number of trials across subjects
  int<lower=1> N_conditions;
  int<lower=1, upper=N_trials> Trialxsubj[N_conditions, N_subjects]; // number of trials per condition and subject
  int<lower=1, upper=2> choice[N_conditions, N_subjects, Maxtrial];  // chosen option (1 or 2)
  int<lower=-1, upper=1> outcome[N_conditions, N_subjects, Maxtrial]; // -1 = punishment, 0 = no reward, 1 = reward
}

transformed data {
  vector[2] initV = rep_vector(0.0, 2); // initial expected values
}

parameters {
  // Group-level (hyper)parameters, on the unconstrained probit scale
  real mu_lr_r[N_conditions]; // mean, reward learning rate
  real mu_lr_p[N_conditions]; // mean, punishment learning rate
  real mu_sen[N_conditions];  // mean, sensitivity

  real<lower=0> sigma_lr_r[N_conditions]; // sd, reward learning rate
  real<lower=0> sigma_lr_p[N_conditions]; // sd, punishment learning rate
  real<lower=0> sigma_sen[N_conditions];  // sd, sensitivity

  // Subject-level raw parameters (non-centered parameterization)
  real lr_rew_pr[N_conditions, N_subjects];      // raw reward learning rate
  real lr_pun_pr[N_conditions, N_subjects];      // raw punishment learning rate
  real sensitivity_pr[N_conditions, N_subjects]; // raw sensitivity
}

transformed parameters {
  // Subject-level parameters, mapped to [0, 1] via Phi_approx (non-centered parameterization)
  real<lower=0, upper=1> lr_rew[N_conditions, N_subjects];
  real<lower=0, upper=1> lr_pun[N_conditions, N_subjects];
  real<lower=0, upper=1> sensitivity[N_conditions, N_subjects];

  for (c in 1:N_conditions) {
    for (i in 1:N_subjects) {
      lr_rew[c, i] = Phi_approx(mu_lr_r[c] + sigma_lr_r[c] * lr_rew_pr[c, i]);
      lr_pun[c, i] = Phi_approx(mu_lr_p[c] + sigma_lr_p[c] * lr_pun_pr[c, i]);
      sensitivity[c, i] = Phi_approx(mu_sen[c] + sigma_sen[c] * sensitivity_pr[c, i]);
    }
  }
}

model {
  // Hyperparameter priors
  mu_lr_r ~ normal(0, 1);
  sigma_lr_r ~ normal(0, 0.2);

  mu_lr_p ~ normal(0, 1);
  sigma_lr_p ~ normal(0, 0.2);

  mu_sen ~ normal(0, 1);
  sigma_sen ~ normal(0, 0.2);

  // Subject-level (raw) parameter priors
  for (c in 1:N_conditions) {
    lr_rew_pr[c] ~ normal(0, 1);
    lr_pun_pr[c] ~ normal(0, 1);
    sensitivity_pr[c] ~ normal(0, 1);
  }

  // Likelihood: delta-rule updating with softmax (categorical logit) choice
  for (c in 1:N_conditions) {
    for (i in 1:N_subjects) {

      vector[2] ev = initV; // expected values
      real PE;              // prediction error

      for (t in 1:Trialxsubj[c, i]) {

        // action probabilities from current expected values
        choice[c, i, t] ~ categorical_logit(ev);

        // prediction error: sensitivity-scaled outcome minus value of the chosen option
        PE = sensitivity[c, i] * outcome[c, i, t] - ev[choice[c, i, t]];

        // value updating: lr_pun on punishment trials (outcome == -1), lr_rew otherwise
        if (outcome[c, i, t] == -1)
          ev[choice[c, i, t]] += lr_pun[c, i] * PE;
        else
          ev[choice[c, i, t]] += lr_rew[c, i] * PE;
      }
    }
  }
}

generated quantities {
  // Per-subject log likelihood, for LOO / LOOIC
  real log_lik[N_conditions, N_subjects];

  {
    for (c in 1:N_conditions) {
      for (i in 1:N_subjects) {

        vector[2] ev = initV; // expected values
        real PE;              // prediction error

        log_lik[c, i] = 0;

        for (t in 1:Trialxsubj[c, i]) {

          // log likelihood of the current trial's choice
          log_lik[c, i] += categorical_logit_lpmf(choice[c, i, t] | ev);

          // prediction error
          PE = sensitivity[c, i] * outcome[c, i, t] - ev[choice[c, i, t]];

          // value updating: lr_pun on punishment trials (outcome == -1), lr_rew otherwise
          if (outcome[c, i, t] == -1)
            ev[choice[c, i, t]] += lr_pun[c, i] * PE;
          else
            ev[choice[c, i, t]] += lr_rew[c, i] * PE;
        }
      }
    }
  }
}
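
In case it is useful for spotting problems, I have also been thinking of adding something like the lines below to generated quantities (declarations at the top of the block, the loop underneath) so I can look at the group-level parameters directly on the [0, 1] scale; the *_unit names are just placeholders I made up:

  // group-level means mapped back to the [0, 1] scale
  real<lower=0, upper=1> mu_lr_r_unit[N_conditions];
  real<lower=0, upper=1> mu_lr_p_unit[N_conditions];
  real<lower=0, upper=1> mu_sen_unit[N_conditions];

  for (c in 1:N_conditions) {
    mu_lr_r_unit[c] = Phi_approx(mu_lr_r[c]);
    mu_lr_p_unit[c] = Phi_approx(mu_lr_p[c]);
    mu_sen_unit[c] = Phi_approx(mu_sen[c]);
  }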