How to use your own Stan model -- full tutorial

Hello,

I have a problem running my own Stan file through the Python/PyStan interface. I started with the tutorials (Getting Started — pystan 3.7.0 documentation, https://people.duke.edu/~ccc14/sta-663/PyStan.html, … many others do exactly the same), but they do not work for me. My model is similar to the models in the hBayesDM package (e.g.,

from hbayesdm.models import bandit2arm_delta

).
That is, I would like to use commands like the following:

output = bandit2arm_delta(data=data_path, niter=2000, nwarmup=1000, nchain=4, ncore=4, nthin=1, inits="random", ind_pars="mean", inc_postpred=False, adapt_delta=0.95, stepsize=1, max_treedepth=10)

My code is as follows:

from hbayesdm import rhat, print_fit
from hbayesdm.models import bandit2arm_delta
import os, sys
import pystan
import pickle
from hashlib import md5

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec  # for unequal plot boxes
import seaborn as sns

def StanModel_cache(model_code, model_name=None, **kwargs):
    """Use just as you would `stan`
    https://pystan.readthedocs.io/en/latest/avoiding_recompilation.html"""
    code_hash = md5(model_code.encode('ascii')).hexdigest()
    if model_name is None:
        cache_fn = 'cached-model-{}.pkl'.format(code_hash)
    else:
        cache_fn = 'cached-{}-{}.pkl'.format(model_name, code_hash)
    try:
        sm = pickle.load(open(cache_fn, 'rb'))
    except:
        sm = pystan.StanModel(model_code=model_code)
        with open(cache_fn, 'wb') as f:
            pickle.dump(sm, f)
    else:
        print("Using cached StanModel")
    return sm
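
# For completeness, the caching helper above would be used roughly like this.
# This snippet is purely illustrative and is not part of the run below; it
# assumes model_path points at the .stan file.
with open(model_path) as f:
    model_code = f.read()
sm = StanModel_cache(model_code=model_code, model_name='bandit_Karm_single_LR')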

# model
sm = pystan.StanModel(file=model_path)

# Run the model and store results in "output"
output = sm.sampling(data=data_path, niter=2000, nwarmup=1000, nchain=4, ncore=4, nthin=1, inits="random",ind_pars="mean", inc_postpred=False, adapt_delta=0.95, stepsize=1, max_treedepth=10)

What I am getting is the following:

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_29fef7469510d8c5b4085b6c92003df1 NOW.

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-...> in <module>
     14 # Run the model and store results in "output"
     15 output_70 = sm.sampling(data=data_path, niter=2000, nwarmup=1000, nchain=4, ncore=4, nthin=1, inits="random",
---> 16                         ind_pars="mean", inc_postpred=False, adapt_delta=0.95, stepsize=1, max_treedepth=10)
     17 
     18 # Visually check convergence of the sampling chains (should look like "hairy caterpillars")

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\model.py in sampling(self, data, pars, chains, iter, warmup, thin, seed, init, sample_file, diagnostic_file, verbose, algorithm, control, n_jobs, **kwargs)
    756                 raise ValueError("Warmup samples must be greater than 0 when adaptation is enabled (`adapt_engaged=True`)")
    757         seed = pystan.misc._check_seed(seed)
--> 758         fit = self.fit_class(data, seed)
    759 
    760         m_pars = fit._get_param_names()

stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_2730466145547416660.pyx in stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_2730466145547416660.StanFit4Model.__cinit__()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\misc.py in _split_data(data)
    403     # map<string, pair<vector<int>, vector<size_t>>> so prepare
    404     # them accordingly.
--> 405     for k, v in data.items():
    406         if np.issubdtype(np.asarray(v).dtype, np.integer):
    407             data_i.update({k.encode('utf-8'): np.asarray(v, dtype=int)})

AttributeError: 'str' object has no attribute 'items'

I have seen this done in R, but I'm looking for a way to do it in Python. Please find the Stan file I am using attached.

In general, I am looking for a way to run any Stan model the same way I would run the models in the hBayesDM package.

Thanks,
Jan

bandit_Karm_single_LR.stan (3.3 KB)

Hi,

You have a minor bug in your code; otherwise it looks good.

The data input needs to be a dictionary, not a file path.

output = sm.sampling(data=data_path,...

What format is the file at data_path? Is it .data.R or .json?

For .data.R, do

stan_data = pystan.read_rdump(data_path)

For .json, do

import json
with open(data_path) as f:
    stan_data = json.load(f)

Then just pass the dictionary to sampling:

output = sm.sampling(data=stan_data,...
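
For example, for a model whose data block declares int N; vector[N] y; the dictionary would look something like this (the names here are just illustrative):

import numpy as np

# illustrative only; the keys must exactly match the variable names
# declared in your model's data block
stan_data = {
    "N": 100,
    "y": np.random.normal(size=100),
}

output = sm.sampling(data=stan_data)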

Edit: oh yes, see the additional information from @nhuurre below.

In addition to what @ahartikainen said, the signature of StanModel.sampling() isn't quite the same as bandit2arm_delta()'s.
Instead of niter, nwarmup, nthin and ncore, those arguments are called iter, warmup, thin and n_jobs.
There's no ind_pars or inc_postpred; you'll need to compute summary statistics separately.
max_treedepth, stepsize and adapt_delta all go into a dictionary that is passed to the sampling call as the control argument, so it's

sm.sampling(..., control=dict(max_treedepth=10, stepsize=1, adapt_delta=0.95))

Since everything is at its default value, you can just call sm.sampling(data=stan_data).

Do help(StanModel.sampling) for more information.
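
Put together, a call roughly equivalent to your bandit2arm_delta() settings would look something like this (with stan_data being the data dictionary, not a file path):

fit = sm.sampling(
    data=stan_data,      # dictionary, not a file path
    iter=2000,           # was niter
    warmup=1000,         # was nwarmup
    chains=4,            # was nchain
    thin=1,              # was nthin
    n_jobs=4,            # was ncore
    init="random",       # was inits
    control=dict(adapt_delta=0.95, stepsize=1, max_treedepth=10),
)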


First, thank you both.
My data path is set up like this:

data_folder = "C:\\Users\\user\\Documents\\Python_models\\data\\RS_learning"
model_folder = "C:\\Users\\user\\Documents\\Python_models"
file = "RS_3_negative.txt"
model = 'bandit_Karm_single_LR.stan'

data_path = os.path.join(data_folder, file)
model_path = os.path.join(model_folder, model)

It is a simple .txt file; I tried it the same way I did in R and with the previous hBayesDM models (e.g. the 2-arm bandit).

@nhuurre
I tried that as well, sorry for not mentioning it.

output_70 = pystan.stan(file=model_path, data=data_path, chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, control = {adapt_delta:0.95, stepsize:1, max_treedepth:10}, n_jobs=-1)

and I got

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-68d98f56a702> in <module>
----> 1 output_70 = pystan.stan(file=model_path, data=data_path, chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, control = {adapt_delta:0.95, stepsize:1, max_treedepth:10}, n_jobs=-1)

NameError: name 'adapt_delta' is not defined

I got a similar error for the other parameters as well (e.g. stepsize).


Note: I will be unavailable now and will come back to reply in a week, so please forgive my late answer. Thanks.

Keys need to be strings:

adapt_delta --> "adapt_delta"

What does "simple .txt file" mean? CSV? Based on the hBayesDM documentation, I think you can load the data like this:

import pandas as pd
pd_data = pd.read_csv(data_path, sep='\t')
stan_data = pd_data.to_dict()

If you use curly-brace syntax to build the control dictionary, the keys must be quoted, like so:

control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}

Yes, it is a 3-column CSV in the same format as the hBayesDM documentation, i.e.

"subjID": A unique identifier for each subject in the data set.
"choice": Integer value representing the option chosen on the given trial: 1 or 2.
"outcome": Integer value representing the outcome of the given trial (reward == 1, loss == -1).
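
So the first few rows of the file look roughly like this (the values are made up for illustration):

subjID	choice	outcome
1	1	1
1	2	-1
1	1	1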

I’m enclosing my file.
RS_3_negative.txt (10.9 KB)

Thank you for the note about keys being strings, @ahartikainen and @nhuurre. I tried it; the model compiled but gave a similar problem as before:

output_70 = pystan.stan(file=model_path, data=data_path, 
                        chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, 
                        control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

output:

WARNING:pystan:DeprecationWarning: pystan.stan was deprecated in version 2.17 and will be removed in version 3.0. Compile and use a Stan program in separate steps.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_29fef7469510d8c5b4085b6c92003df1 NOW.
INFO:pystan:OS: win32, Python: 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)], Cython 0.29.14

Compiling C:\Users\user\AppData\Local\Temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.pyx because it changed.
[1/1] Cythonizing C:\Users\user\AppData\Local\Temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.pyx
building 'stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384' extension
C:\Users\user\AppData\Local\Continuum\anaconda3\Library\mingw-w64\bin\gcc.exe -mdll -O -Wall -DMS_WIN64 -DBOOST_RESULT_OF_USE_TR1 -DBOOST_NO_DECLTYPE -DBOOST_DISABLE_ASSERTS -IC:\Users\JANKA~1.WIS\AppData\Local\Temp\pystan_x5aorp_q -IC:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan -IC:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\stan\src -IC:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\stan\lib\stan_math -IC:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\stan\lib\stan_math\lib\eigen_3.3.3 -IC:\Usersuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\stan\lib\stan_math\lib\boost_1.69.0 -IC:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\stan\lib\stan_math\lib\sundials_4.1.0\include -IC:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\include -IC:\Users\user\AppData\Local\Continuum\anaconda3\include -IC:\Users\user\AppData\Local\Continuum\anaconda3\include -c C:\Users\user\AppData\Local\Temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.cpp -o c:\users\user\appdata\local\temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.o -O2 -ftemplate-depth-256 -Wno-unused-function -Wno-uninitialized -std=c++1y -D_hypot=hypot -pthread -fexceptions
writing c:\users\user\appdata\local\temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.cp37-win_amd64.def
C:\Users\user\AppData\Local\Continuum\anaconda3\Library\mingw-w64\bin\g++.exe -shared -s c:\users\janka~1.wis\appdata\local\temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.o c:\users\user\appdata\local\temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.cp37-win_amd64.def -LC:\Users\janka.WISMAIN\AppData\Local\Continuum\anaconda3\libs -LC:\Users\user\AppData\Local\Continuum\anaconda3\PCbuild\amd64 -lpython37 -lmsvcr140 -o C:\Users\user\AppData\Local\Temp\pystan_x5aorp_q\stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.cp37-win_amd64.pyd

With error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-3-6de36af1e349> in <module>
      1 output_70 = pystan.stan(file=model_path, data=data_path, 
      2                         chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True,
----> 3                         control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\api.py in stan(file, model_name, model_code, fit, data, pars, chains, iter, warmup, thin, init, seed, algorithm, control, sample_file, diagnostic_file, verbose, boost_lib, eigen_lib, include_paths, n_jobs, allow_undefined, **kwargs)
    445                      sample_file=sample_file, diagnostic_file=diagnostic_file,
    446                      verbose=verbose, algorithm=algorithm, control=control,
--> 447                      n_jobs=n_jobs, **kwargs)
    448     return fit

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\model.py in sampling(self, data, pars, chains, iter, warmup, thin, seed, init, sample_file, diagnostic_file, verbose, algorithm, control, n_jobs, **kwargs)
    756                 raise ValueError("Warmup samples must be greater than 0 when adaptation is enabled (`adapt_engaged=True`)")
    757         seed = pystan.misc._check_seed(seed)
--> 758         fit = self.fit_class(data, seed)
    759 
    760         m_pars = fit._get_param_names()

stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.pyx in stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_6165544069353003384.StanFit4Model.__cinit__()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\misc.py in _split_data(data)
    403     # map<string, pair<vector<int>, vector<size_t>>> so prepare
    404     # them accordingly.
--> 405     for k, v in data.items():
    406         if np.issubdtype(np.asarray(v).dtype, np.integer):
    407             data_i.update({k.encode('utf-8'): np.asarray(v, dtype=int)})

AttributeError: 'str' object has no attribute 'items'

If I do what @nhuurre suggested with pandas, my data look like this:

stan_data

{'subjID': {0: 1,
  1: 1,
  2: 1,
  3: 1,
  4: 1,
  5: 1,
  6: 1,

  998: 1,
  999: 0,
  ...}}

and I get this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-ae5321c79846> in <module>
      1 output_70 = pystan.stan(file=model_path, data=stan_data, 
      2                         chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True,
----> 3                         control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\api.py in stan(file, model_name, model_code, fit, data, pars, chains, iter, warmup, thin, init, seed, algorithm, control, sample_file, diagnostic_file, verbose, boost_lib, eigen_lib, include_paths, n_jobs, allow_undefined, **kwargs)
    445                      sample_file=sample_file, diagnostic_file=diagnostic_file,
    446                      verbose=verbose, algorithm=algorithm, control=control,
--> 447                      n_jobs=n_jobs, **kwargs)
    448     return fit

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\model.py in sampling(self, data, pars, chains, iter, warmup, thin, seed, init, sample_file, diagnostic_file, verbose, algorithm, control, n_jobs, **kwargs)
    756                 raise ValueError("Warmup samples must be greater than 0 when adaptation is enabled (`adapt_engaged=True`)")
    757         seed = pystan.misc._check_seed(seed)
--> 758         fit = self.fit_class(data, seed)
    759 
    760         m_pars = fit._get_param_names()

stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_3019044705743485172.pyx in stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_3019044705743485172.StanFit4Model.__cinit__()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\misc.py in _split_data(data)
    410         else:
    411             msg = "Variable {} is neither int nor float nor list/array thereof"
--> 412             raise ValueError(msg.format(k))
    413     return data_r, data_i
    414 

ValueError: Variable subjID is neither int nor float nor list/array thereof

Hi, almost there.

I think you can do the following

stan_data = {key : pd_data[key].values for key in pd_data.columns}
# then add rest of the values needed by Stan
# example:
stan_data['N'] = len(pd_data)

Looking at the PyStan code, I think there's a chance it can handle a pandas DataFrame directly, so simply

stan_data = pd.read_csv(data_path, sep='\t')
fit = model.sampling(data=stan_data, ...)

may work.

stan_data = pd.read_csv(data_path, sep='\t')
model = pystan.StanModel(file=model_path)
fit = model.sampling(data=stan_data,
                        chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, 
                        control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

gives

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-12-c600ad4904a2> in <module>
      3 fit = model.sampling(data=stan_data,
      4                         chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True,
----> 5                         control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\model.py in sampling(self, data, pars, chains, iter, warmup, thin, seed, init, sample_file, diagnostic_file, verbose, algorithm, control, n_jobs, **kwargs)
    756                 raise ValueError("Warmup samples must be greater than 0 when adaptation is enabled (`adapt_engaged=True`)")
    757         seed = pystan.misc._check_seed(seed)
--> 758         fit = self.fit_class(data, seed)
    759 
    760         m_pars = fit._get_param_names()

stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_4577727963002664165.pyx in stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_4577727963002664165.StanFit4Model.__cinit__()

RuntimeError: Exception: variable does not exist; processing stage=data initialization; variable name=N; base type=int  (in 'bandit_Karm_single_LR.stan' at line 2)

Hi,

I'm sorry, how exactly do you mean to do that? Should I write out the whole dictionary myself?

I started doing this:

import pandas as pd

data = pd.read_csv(data_path, delimiter="\t", dtype={"subjID": str, "choice": int, "outcome": int})
# this is really not nice, but it counts the number of trials for each subject ID (1..n_subj)
a = [sorted(data['subjID'], key=int).count(str(item)) for item in range(1, data['subjID'].nunique()+1)]

data_list = {
             'N': data['subjID'].nunique(),
             'T': data['subjID'].value_counts().max(),
             'K': 2,  # data.choice.max(),
             'Tsubj': a,
             'choice': data['choice'],
             'outcome': data['outcome'],
             'numPars': 2
            }

# Tsubj number of trials for each subject

Yeah.

Write a dictionary containing the data variables your model declares in its data block. Both 1D row and column vectors work as a 1D ndarray.
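
For example, something like this (a minimal sketch; the names and sizes have to match your model's data block):

import numpy as np

# minimal sketch; keys must match the model's data block
stan_data = {
    'N': 3,                                  # scalar int
    'Tsubj': np.array([50, 48, 47]),         # 1D ndarray works for row or column vectors
    'choice': np.zeros((3, 50), dtype=int),  # 2D array for an N x T integer array
}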

An additional note on the pandas DataFrame structure:

I also tried to specify the types explicitly:

import pandas as pd
pd_data = pd.read_csv(data_path, sep='\t', dtype={"subjID": str, "choice": int, "outcome": int})
stan_data2 = pd_data.to_dict()
#data = pd.read_csv(data_path, delimiter="\t")

output_70 = pystan.stan(file=model_path, data=stan_data2, 
                        chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, 
                        control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

However, I am still getting

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-22-43912f62efde> in <module>
      6 output_70 = pystan.stan(file=model_path, data=stan_data2, 
      7                         chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True,
----> 8                         control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\api.py in stan(file, model_name, model_code, fit, data, pars, chains, iter, warmup, thin, init, seed, algorithm, control, sample_file, diagnostic_file, verbose, boost_lib, eigen_lib, include_paths, n_jobs, allow_undefined, **kwargs)
    445                      sample_file=sample_file, diagnostic_file=diagnostic_file,
    446                      verbose=verbose, algorithm=algorithm, control=control,
--> 447                      n_jobs=n_jobs, **kwargs)
    448     return fit

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\model.py in sampling(self, data, pars, chains, iter, warmup, thin, seed, init, sample_file, diagnostic_file, verbose, algorithm, control, n_jobs, **kwargs)
    756                 raise ValueError("Warmup samples must be greater than 0 when adaptation is enabled (`adapt_engaged=True`)")
    757         seed = pystan.misc._check_seed(seed)
--> 758         fit = self.fit_class(data, seed)
    759 
    760         m_pars = fit._get_param_names()

stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_7831310574925026557.pyx in stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_7831310574925026557.StanFit4Model.__cinit__()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\misc.py in _split_data(data)
    410         else:
    411             msg = "Variable {} is neither int nor float nor list/array thereof"
--> 412             raise ValueError(msg.format(k))
    413     return data_r, data_i
    414 

ValueError: Variable subjID is neither int nor float nor list/array thereof

Hi @ahartikainen,

I'm probably missing a step somewhere. I'm getting a more informative error now (shape problems) but don't know how to fix it. Would you have a tip, please?

import pandas as pd
data = pd.read_csv(data_path, delimiter="\t", dtype={"subjID": str, "choice": int, "outcome": int})
a = [sorted(data['subjID'],key=int).count(str(item)) for item in range(1,data['subjID'].nunique()+1)]

data_list = {
             'N': data['subjID'].nunique(),
             'T': data['subjID'].value_counts().max(),
             'K': 2, #data.choice.max(),
             'Tsubj': a,
             'choice': data['choice'],
             'outcome': data['outcome'],
             'numPars': 2
            }

# Tsubj number of trials for each subject

output_70 = pystan.stan(file=model_path, data=data_list, 
                        chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, 
                        control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

for which I get:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-...> in <module>
      1 output_70 = pystan.stan(file=model_path, data=data_list,
      2                         chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True,
----> 3                         control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\api.py in stan(file, model_name, model_code, fit, data, pars, chains, iter, warmup, thin, init, seed, algorithm, control, sample_file, diagnostic_file, verbose, boost_lib, eigen_lib, include_paths, n_jobs, allow_undefined, **kwargs)
    445                      sample_file=sample_file, diagnostic_file=diagnostic_file,
    446                      verbose=verbose, algorithm=algorithm, control=control,
--> 447                      n_jobs=n_jobs, **kwargs)
    448     return fit

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pystan\model.py in sampling(self, data, pars, chains, iter, warmup, thin, seed, init, sample_file, diagnostic_file, verbose, algorithm, control, n_jobs, **kwargs)
    756                 raise ValueError("Warmup samples must be greater than 0 when adaptation is enabled (`adapt_engaged=True`)")
    757         seed = pystan.misc._check_seed(seed)
--> 758         fit = self.fit_class(data, seed)
    759 
    760         m_pars = fit._get_param_names()

stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_5365742667283611043.pyx in stanfit4anon_model_29fef7469510d8c5b4085b6c92003df1_5365742667283611043.StanFit4Model.__cinit__()

RuntimeError: Exception: mismatch in number dimensions declared and found in context; processing stage=data initialization; variable name=choice; dims declared=(26,50); dims found=(1285)  (in 'bandit_Karm_single_LR.stan' at line 6)

Thank you

Yeah, you need to preprocess the data to be in the same shape as given in the data block.

You could follow what hBayesDM is doing

OK, thanks again. In case someone needs it, here is my final solution. It is not pretty and could be simplified, but I kept it close to what hBayesDM does.

from hbayesdm import rhat, print_fit
import os, sys
import pystan

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple, List, Sequence, Dict, Union, Callable, Any

def _prepare_general_info(raw_data: pd.DataFrame) -> Dict:
        """Prepare general infos about the raw data.
        Parameters
        ----------
        raw_data
            The raw behavioral data as a Pandas DataFrame.
        Returns
        -------
        general_info : Dict
            'grouped_data': data grouped by subjs (& blocks) - Pandas GroupBy,
            'subjs': list of all subjects in data,
            'n_subj': total number of subjects in data,
            'b_subjs': number of blocks per subj,
            'b_max': max number of blocks across all subjs,
            't_subjs': number of trials (per block) per subj,
            't_max': max number of trials across all subjs (& blocks).
        """

        grouped_data = raw_data.groupby('subjid', sort=False)
        trials_per_subj = grouped_data.size()
        subjs = list(trials_per_subj.index)
        n_subj = len(subjs)
        t_subjs = list(trials_per_subj)
        t_max = max(t_subjs)
        b_subjs, b_max = None, None
        
        return {'grouped_data': grouped_data,
                'subjs': subjs, 'n_subj': n_subj,
                'b_subjs': b_subjs, 'b_max': b_max,
                't_subjs': t_subjs, 't_max': t_max}


# adapted from https://github.com/CCS-Lab/hBayesDM/blob/master/Python/hbayesdm/preprocess_funcs.py

def banditKarm_preprocess_func(raw_data, general_info, number_arms = 2):
    # Iterate through grouped_data
    subj_group = iter(general_info['grouped_data'])
    # iter and next work together, https://www.programiz.com/python-programming/methods/built-in/iter

    # Use general_info(s) about raw_data
    n_subj = general_info['n_subj']
    t_subjs = general_info['t_subjs']
    t_max = general_info['t_max']

    # Initialize (model-specific) data arrays
    choice = np.full((n_subj, t_max), -1, dtype=int)
    outcome = np.full((n_subj, t_max), 0, dtype=float)

    # Write from subj_data to the data arrays
    for s in range(n_subj):
        _, subj_data = next(subj_group)
        t = t_subjs[s]
        choice[s][:t] = subj_data['choice']
        outcome[s][:t] = subj_data['outcome']

    # Wrap into a dict for pystan
    data_dict = {
        'N': n_subj,
        'T': t_max,
        'K': number_arms,
        'Tsubj': t_subjs,
        'choice': choice,
        'outcome': outcome,
    }

    # Returned data_dict will directly be passed to pystan
    return data_dict

And the commands:

# Data
data_folder = "C:\\Users\\folder"
model_folder = "C:\\Users\\models"
file = "input.txt"

#prepare model
model = 'bandit.stan'
model_path = os.path.join(model_folder, model)

# Prepare data
data_path = os.path.join(data_folder, file)
## load as a pandas DataFrame and rename the subject column
data = pd.read_csv(data_path, delimiter="\t")
data = data.rename(columns={'subjID': 'subjid'})

# Prepare the proper format from the pandas DataFrame with all requirements; creates "general_info"
prep_data = _prepare_general_info(raw_data=data)
data_bandit = banditKarm_preprocess_func(data, prep_data)

output = pystan.stan(file=model_path, data=data_bandit, 
                        chains=4, iter=2000, warmup=1000, thin=1, init='random', verbose=True, 
                        control = {"adapt_delta":0.95, "stepsize":1, "max_treedepth":10}, n_jobs=-1)

