After working through some errors, I tried the Pathfinder method from GitHub - SteveBronder/pathfinder_testing (a collection of stan-dev repo branches needed to run Pathfinder), and the results are very inaccurate.
Example code:
library(dplyr)
library(brms)
library(cmdstanr)
set_cmdstan_path('/home/rstudio/user_projects/pathfinder_testing/cmdstan')
# generate some data
N <- 10000
data0 <-
  tibble(
    x1 = rnorm(N, 0, 1),
    x2 = rnorm(N, 0, 1),
    x3 = rnorm(N, 0, 1),
    x4 = rnorm(N, 0, 1),
    x5 = rnorm(N, 0, 1),
    y_star = rlogis(N, 1 * x1 + 0.7 * x2 + 0.5 * x3 + 0.3 * x4 + 0.1 * x5, 1),
    y = ifelse(y_star > 0, 1, 0)
  )
summary(glm(y ~ x1 + x2 + x3 + x4 + x5, data = data0, family = binomial(link = "logit")))
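Since this glm fit is the frequentist reference point, I also keep its coefficients around for the comparisons further down (glm_fit and glm_coefs are just convenience names I added for this post):
# store the maximum-likelihood estimates for later comparison with the Bayesian fits
glm_fit <- glm(y ~ x1 + x2 + x3 + x4 + x5, data = data0,
               family = binomial(link = "logit"))
glm_coefs <- coef(glm_fit)
glm_coefs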
# define model using brm
bf0 <- bf(y ~ x1 + x2 + x3 + x4 + x5) + bernoulli(link = "logit")
# make native stan code from brm objects and save the code as stan object
nthreads = 12
my_stan_code <- make_stancode(
  formula = bf0,
  data = data0,
  #prior = prior0,
  autocor = NULL,
  data2 = NULL,
  cov_ranef = NULL,
  sparse = NULL,
  sample_prior = "no",
  stanvars = NULL,
  stan_funs = NULL,
  knots = NULL,
  #threads = threading(threads = nthreads),
  normalize = getOption("brms.normalize", TRUE),
  save_model = '/home/rstudio/user_projects/Mantas/pathfinder_example/psc_20220203.stan'
)
# make a native stan data structure from brm objects
my_stan_data <- make_standata(
  formula = bf0,
  data = data0,
  #family = gaussian(),
  #prior = prior0,
  autocor = NULL,
  data2 = NULL,
  cov_ranef = NULL,
  sample_prior = "no",
  stanvars = NULL,
  #threads = threading(threads = nthreads),
  knots = NULL
)
# create cmdstan model object
ex_mod = cmdstan_model(
  stan_file = '/home/rstudio/user_projects/Mantas/pathfinder_example/psc_20220203.stan',
  compile = FALSE
)
# compile
ex_mod$compile(
  quiet = FALSE,
  dir = NULL,
  pedantic = TRUE,
  include_paths = NULL,
  cpp_options = list(stan_threads = TRUE),
  stanc_options = list(),
  force_recompile = TRUE
)
# NUTS (HMC)
ex_fit_sample = ex_mod$sample(
  data = my_stan_data,
  chains = 3,
  parallel_chains = 3,
  threads_per_chain = 4,
  iter_sampling = 1000,
  output_dir = '/home/rstudio/user_projects/Mantas/pathfinder_example/'
)
# meanfield variational inference
ex_fit_vb <- ex_mod$variational(
  data = my_stan_data,
  seed = NULL,
  refresh = 5,
  init = NULL,
  save_latent_dynamics = FALSE,
  output_dir = '/home/rstudio/user_projects/Mantas/pathfinder_example/',
  output_basename = 'psc_variational',
  sig_figs = NULL,
  threads = nthreads,
  opencl_ids = NULL,
  algorithm = NULL,
  iter = 1000,
  grad_samples = NULL,
  elbo_samples = NULL,
  eta = NULL,
  adapt_engaged = NULL,
  adapt_iter = NULL,
  tol_rel_obj = 0.0000001,
  eval_elbo = NULL,
  output_samples = 10000
)
# lbfgs optimization
ex_optimize = ex_mod$optimize(
  data = my_stan_data,
  refresh = 5,
  algorithm = "lbfgs",
  threads = nthreads,
  iter = 1000,
  history_size = 12,
  init_alpha = 0.0000001,
  init = 3,
  output_dir = '/home/rstudio/user_projects/Mantas/pathfinder_example/',
  output_basename = 'psc_lbfgs_opt'
) #, tol_obj = 0, tol_grad = 0, tol_param = 0, tol_rel_grad = 0, tol_rel_obj = 0)
# Run single-path Pathfinder
# FIXME: not sure why, but num_draws needs to be one more than the number of draws you actually want
ex_fit_single = ex_mod$pathfinder(
  algorithm = "single",
  data = my_stan_data,
  refresh = 5,
  threads = nthreads,
  iter = 1000,
  num_elbo_draws = 100,
  history_size = 6,
  init_alpha = 0.0000001,
  num_draws = 10001,
  init = 0,
  output_dir = '/home/rstudio/user_projects/Mantas/pathfinder_example/',
  output_basename = 'psc_single'
) #, tol_obj = 0, tol_grad = 0, tol_param = 0, tol_rel_grad = 0, tol_rel_obj = 0)
# Run multi-path Pathfinder
ex_fit_multi = ex_mod$pathfinder(
  algorithm = "multi",
  data = my_stan_data,
  refresh = 10,
  threads = nthreads,
  num_paths = 16,
  psis_draws = 10000,
  iter = 1000,
  num_elbo_draws = 1000,
  history_size = 4,
  init_alpha = 0.0000001,
  num_draws = 10000,
  init = 3,
  output_dir = '/home/rstudio/user_projects/Mantas/pathfinder_example/',
  output_basename = 'psc_multi'
) #, tol_obj = 0, tol_grad = 0, tol_param = 0, tol_rel_grad = 0, tol_rel_obj = 0)
ex_fit_sample$summary()
ex_fit_vb$summary()
ex_optimize$summary()
# problems with optimization or convergence
ex_fit_single$summary()
ex_fit_multi$summary()
Thus, the summary results should be close to b[1] = 1, b[2] = 0.7, b[3] = 0.5, b[4] = 0.3, b[5] = 0.1.
HMC, meanfield VI, and the "optimize" method all give similar results, but Pathfinder's results are very imprecise (see the comparison sketch after the output below), e.g.:
ex_fit_single$summary()
# A tibble: 8 × 7
  variable          mean    median        sd       mad        q5      q95
1 b[1]             0.813    0.965       37.5      37.0     -62.5     62.4
2 b[2]             0.354    0.501       38.5      38.4     -62.8     63.8
3 b[3]             0.354    0.203       43.7      43.3     -72.0     72.4
4 b[4]             0.648    0.637       41.1      40.8     -67.2     67.9
5 b[5]             0.0516  -0.00359     40.2      39.5     -66.3     66.3
6 Intercept        0.0816   0.256       40.9      41.0     -68.1     66.3
7 b_Intercept      0.0738   0.269       40.9      40.9     -68.0     66.3
8 lp__        377497.   361349      146589.   146837.   166286.  641275.
ex_fit_multi$summary()
# A tibble: 8 × 7
  variable          mean    median        sd       mad        q5       q95
1 b[1]             0.642     1.36       41.2      40.2     -68.7      67.4
2 b[2]            -8.49     -8.51       47.7      46.2     -87.8      69.6
3 b[3]             9.79      7.28       49.1      45.4     -65.4      93.6
4 b[4]            -6.45     -4.90       50.3      48.4     -93.0      73.9
5 b[5]             0.876     1.12       57.6      46.6     -95.8      96.0
6 Intercept        1.19     -2.30       90.3      55.3    -148.      210.
7 b_Intercept      1.18     -2.22       90.3      55.4    -148.      210.
8 lp__        533303.   446146.     311956.   236710.   185127.  1215174.
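For concreteness, here is a rough way to put the estimates side by side. This is just a sketch I added for the post: it only assumes that each fit's $summary() returns the variable/mean columns shown above, and glm_coefs comes from the small convenience snippet after the glm call.
# compare posterior means of b[1]..b[5] with the true simulation values and the glm estimates
true_b <- c(1, 0.7, 0.5, 0.3, 0.1)
fits <- list(hmc = ex_fit_sample, meanfield = ex_fit_vb,
             pf_single = ex_fit_single, pf_multi = ex_fit_multi)
b_means <- sapply(fits, function(fit) {
  s <- fit$summary()
  s$mean[grepl("^b\\[", s$variable)]  # keep only the b[...] rows
})
round(cbind(true = true_b, glm = unname(glm_coefs[-1]), b_means), 3)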
Is it normal for Pathfinder to be this inaccurate, or did I simply pick the parameters wrong? Which parameters should be tuned?
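For reference, this is the kind of sensitivity sweep I could try, reusing only the arguments from the single-path call above; the history_size values are arbitrary and the loop is just a sketch, not something I have validated:
# hypothetical sweep over history_size for the single-path run,
# keeping every other argument as in the call above
for (hs in c(6, 12, 25)) {
  fit_hs <- ex_mod$pathfinder(
    algorithm = "single",
    data = my_stan_data,
    refresh = 5,
    threads = nthreads,
    iter = 1000,
    num_elbo_draws = 100,
    history_size = hs,
    init_alpha = 0.0000001,
    num_draws = 10001,
    init = 0,
    output_dir = '/home/rstudio/user_projects/Mantas/pathfinder_example/',
    output_basename = paste0('psc_single_hs', hs)
  )
  print(fit_hs$summary())
}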