Model has large p_loo but wins model comparison

Hi all,
I have fitted two models in JAGS and used loo_compare for model comparison. The model comparison seems to strongly favor one model, but this model has p_loo > p (number of parameters in the model). This seems to indicate that “the model has very weak predictive capability and may indicate a severe model misspecification”, according to this page. However, this looks counter-intuitive to me that such a model would win model comparison. I would like to know what would be the possible reasons for that? Thanks!

Here is the model comparison result:


Here is the detailed loo information for the winning model:

Can you tell more about the model? Can you show the code you used to get the log_liks and called loo? Can you show the loo output including possible Pareto k warnings (these are not visible in the information you provided)?

Hi, Thank you a lot for your reply! Sorry I didn’t upload the code as it seems a bit long.
Here is the script calculating the log_liks. It is written in JAGS.

data {

	for (s in 1:Nsamps) {
		for (i in 1:N) { # trial level
		
      		loglik[s, i] <- logdensity.wieners(y[i], 1 , ndt[s,i], bias[s,i], w[s,i], alpha.p[s,idxP[i]])
      
      # generate trial-by-trial nDT
        ndt[s,i] <- ndt.p[s,idxP[i]]
        dT[s,i] <- rt[i] - ndt[s,i]

      # generate trial-by-trial Bias
        bias[s,i] <- bias.p[s,idxP[i]]

      # The actual drift rate
        w[s,i] <- li.hat[s,i]

      # rt minus timeHin
        timeIn[s,i] <- abs(time.p[s,idxP[i]])
        timeStep[s,i] <- (dT[s,i] - timeIn[s,i])/dT[s,i]    #time is a weighted variable depending when the health(b1) or taste(b2) comes first
        timeInStep[s,i] <- timeIn[s,i]/dT[s,i]    
        
        smaller[s,i] <- step(timeStep[s,i])             #if smaller==0 either health or taste doesnt get in, the timeIn > (rt+ndt)
        tInsmaller[s,i] <- step(timeInStep[s,i])    

        li.hat_h[s,i] <- tInsmaller[s,i] * timeInStep[s,i]  * vd_h2[s,i] +   ( smaller[s,i] * timeStep[s,i] * vd_h[s,i] )
        li.hat_t[s,i] <- tInsmaller[s,i] * timeInStep[s,i]  * vd_t2[s,i] + ( smaller[s,i] * timeStep[s,i] * vd_t[s,i] )

        li.hat[s,i] <- li.hat_h[s,i] + li.hat_t[s,i]

      # The linear regression of the value taste and value health
        vd_h[s,i] <- b1.p[s,idxP[i]] * wd_S[i]
        vd_h2[s,i] <- b12.p[s,idxP[i]] * wd_S[i]
        vd_t[s,i] <- b2.p[s,idxP[i]] * td_S[i]
        vd_t2[s,i] <- b22.p[s,idxP[i]] * td_S[i]
    
    }
	}
}

model{
  fake <- 0
}

I save the output in Rdata and then call loo in R script. I suppose this should be correct because it was used multiple times for different model comparison:

Nsamps = 6000

conds = c(1)
for (cond in conds){
  
  waic_all <- vector(mode="list", length=length(all_models))
  names(waic_all) <- all_models
  loo_all <- vector(mode="list", length=length(all_models))
  names(loo_all) <- all_models
  ll_alls <- vector(mode="list", length=length(all_models))

  for (i_model in 1:length(all_models)){ 
    
    model <- all_models[i_model]
    folder.results <- file.path(pathToFolder, model)
    subG = 1     # initialize with subject 1 and append
    resultsFileName <- paste("post_ll_",model,"_", cond, "_", subG, ".RData", sep="")
    load(file.path(folder.results, resultsFileName))
    
    ll_alls[[model]] = matrix(post_ll$mcmc[[1]], nrow = Nsamps, ncol = length(post_ll$mcmc[[1]])/Nsamps, byrow = FALSE)
    
    for (subG in 2:n_sub){
      resultsFileName <- paste("post_ll_",model,"_", cond, "_", subG, ".RData", sep="")
      load(file.path(folder.results, resultsFileName))
      
      ll = matrix(post_ll$mcmc[[1]], nrow = Nsamps, ncol = length(post_ll$mcmc[[1]])/Nsamps, byrow = FALSE)
      ll_alls[[model]] <- cbind(ll_alls[[model]],ll)
    }
  }

  # remove and replace inf ----
  inf_row_inds <- vector(mode="list", length=length(all_models))
  for (i_model in 1:length(all_models)){
    model <- all_models[i_model]
    ll_all <- ll_alls[[model]]
    # find rows with Inf or -Inf or ll_all and does not lead to "replacement has length zero" error
    inf_row_inds[[model]] <- which(is.infinite(rowSums(ll_all)))
  }
  # get the union of all inf_row_inds
  inf_row_ind_union <- Reduce(union, inf_row_inds)
  if (length(inf_row_ind_union)%%3 != 0){
    # randomly sample from 1:6000 which doesnt exist in inf_row_ind_union and add to inf_row_ind_union
    inf_row_ind_union <- c(inf_row_ind_union, 
                           sample(setdiff(1:6000, inf_row_ind_union), 
                                  3 - length(inf_row_ind_union)%%3))
  }

  # remove inf_row_ind_union rows from all elements of ll_alls
  for (i_model in 1:length(all_models)){
    model <- all_models[i_model]
    
    ll_all <- ll_alls[[model]]
    if (length(inf_row_ind_union) > 0) {
      ll_all <- ll_all[-inf_row_ind_union, ] # Exclude rows if inf_row_ind_union is not empty
    }
    
    waic_all[[model]] =waic(ll_all)
    chainIDs = c(rep(1:3, each = dim(ll_all)[1]/3))
    loo_all[[model]]  =loo(ll_all, r_eff = relative_eff(ll_all, chain_id = chainIDs))
  }

  ###-----------------------------------------
  comp <- loo_compare(loo_all)
  print(comp)
}

Here is the output of loo:

Warning messages:
1:
204 (4.0%) p_waic estimates greater than 0.4. We recommend trying loo instead.
2: Some Pareto k diagnostic values are too high. See help(‘pareto-k-diagnostic’) for details.

3:
398 (7.8%) p_waic estimates greater than 0.4. We recommend trying loo instead.
4: Some Pareto k diagnostic values are too high. See help(‘pareto-k-diagnostic’) for details.
[The second one is the winning model]

I hope these help!