Hi all,
I’m trying to build a model in stan for a task similar to the one in this paper - there are tiles on a 2D grid that are each worth a certain number of points. The point values are spatially correlated on the grid according to a gaussian process, and work has shown that people engage in a GP-like generalization to infer values of hidden tiles. The model has two steps: 1) based on the current and past observed values for chosen tiles on the current grid, estimate the value of each tile (including unseen/unchosen ones) using a GP, and 2) based on the values inferred from the GP and a softmax choice rule, make choices that will maximize point values. I can find examples of GP predictions after model fitting but was having trouble figuring out how to do this appropriately within the model itself. I have a version of the model that runs but it is running very slowly, even with simulated data (hours when it should take a few minutes).
Below is my stan code, R code to run the model in rstan, and attached simulated data.
sim_calc_gp_softmax_sim_beta10_LS2_s20.Rdata (3.0 MB)
First of all, is my implementation of the GP (including the translations between 2D and 1D data) correct? And if so, are there ways to speed this up? Thanks in advance for your help.
Model code:
data {
int<lower=1> nS; //number of participants
int<lower=1> nB; //number of blocks per participant
int<lower=1> nT; //total possible trials per block
int<lower=1> nL; //number of tiles in grid row/column (assume square)
int<lower=1> b_length [nS,nB]; //trials per block per participant
int selected_x [nS,nB,nT];
int selected_y [nS,nB,nT];
int<lower=0,upper=nL*nL> selected_row [nS,nB,nT]; //X+Y coord of selected tile
//calculate as (X-1)*nL+Y
real points [nS,nB,nT]; //outcome of each choice
int all_x[nL*nL]; //X coordinates of all tiles
int all_y[nL*nL]; //Y coordinates of all tiles
}
transformed data {
real delta = 1e-9;
real sigma = 0.01; //assume variance
//transformed coordinates
row_vector[2] all_coords[nL*nL];
for (ll in 1:(nL*nL)) {
all_coords[ll,1] = all_x[ll];
all_coords[ll,2] = all_y[ll];
}
}
parameters {
real<lower=0> length_scale; //aka rho or generalization parameter
// real<lower=0> sigma; //variance
real<lower=0> beta; //inverse temperature for choice function
}
model {
//set up variables
vector[nL*nL] pred_f_mu;
matrix[nL*nL,nL*nL] cov_pred_f;
matrix[nL*nL,nL*nL] diag_delta;
vector[nL*nL] pred_f;
//priors
length_scale ~ inv_gamma(5,5);
// sigma ~ std_normal();
beta ~ normal(0,10);
//loop through subjects, blocks, & trials
for (s in 1:nS) {
for (b in 1:nB) {
row_vector[2] selected_coords[nT];
for (t in 1:(b_length[s,b])) {
matrix [t,t] K; //cov kernel for selected tiles
matrix [t,t] L; //cholesky transformed cov kernel for selected tiles
vector [t] f; //means of selected tiles
vector [t] curr_points; //temp vector for type agreement
vector[t] K_div_y1;
vector[t] K_div_y1_2;
matrix[t,nL*nL] k_x1_x2;
matrix[t,nL*nL] v_pred;
vector[t] mu = rep_vector(0,t);
selected_coords[t,1] = selected_x[s,b,t];
selected_coords[t,2] = selected_y[s,b,t];
//predict values from GP
K = cov_exp_quad(selected_coords[1:t,],1.0,length_scale);
for (tt in 1:(t)) {
K[tt, tt] = K[tt, tt] + square(sigma);
}
L = cholesky_decompose(K);
//fit GP to observed outcomes
curr_points=to_vector(points[s,b,1:t]);
curr_points ~ multi_normal_cholesky(mu,L);
//predict all tile values from GP
K_div_y1 = mdivide_left_tri_low(L,to_vector(points[s,b,1:t]));
K_div_y1_2 = mdivide_right_tri_low(K_div_y1',L)';
k_x1_x2 = cov_exp_quad(selected_coords[1:t,],all_coords,1.0,length_scale);
pred_f_mu = (k_x1_x2' * K_div_y1_2);
//softmax predicting next tile chosen
if (t<b_length[s,b]) {
selected_row[s,b,t+1] ~ categorical_logit(beta*pred_f_mu);
}
}
}
}
}
r code to run model in rstan:
in_data=load(data_path)
#set up data
nS=length(unique(in_data$ID))
nB=max(in_data$block_num)
nT=max(in_data$trial_block)
nL=11
all_coords=expand.grid(1:nL,1:nL)
all_x=all_coords[,2]
all_y=all_coords[,1]
b_length=array(data=0,dim=c(nS,nB))
block_type=array(data=0,dim=c(nS,nB))
selected_x=array(data=0,dim=c(nS,nB,nT))
selected_y=array(data=0,dim=c(nS,nB,nT))
selected_row=array(data=0,dim=c(nS,nB,nT))
points=array(data=0,dim=c(nS,nB,nT))
in_data$norm_points=(in_data$points-50)/100
for (s in 1:nS) {
subj_data=in_data[in_data$ID==unique(in_data$ID)[s],]
for (b in 1:nB) {
block_data=subj_data[subj_data$block_num==b,]
b_length[s,b]=dim(block_data)[1]
block_type[s,b]=block_data$block_risky[1]
for (t in 1:b_length[s,b]) {
selected_x[s,b,t]=block_data$choice_x[t]+1
selected_y[s,b,t]=block_data$choice_y[t]+1
selected_row[s,b,t]=nL*(block_data$choice_x[t])+block_data$choice_y[t]+1
points[s,b,t]=block_data$norm_points[t]
}
}
}
#run stan model
data_toest=list(nS=nS,nB=nB,nT=nT,nL=nL,b_length=b_length,block_type=block_type,
selected_row=selected_row,points=points,all_x=all_x,all_y=all_y,selected_x=
selected_x,selected_y=selected_y)
model_out=stan(file=model_name,data=data_toest,verbose=FALSE,save_warmup=FALSE,
iter=1000,chains=1) #few iters and 1 chain for testing