Hi all,
I am trying to do within-chain parallelization in brms using rstan as a backend, which, if I understand the latest brms release notes correctly, should work with Stan versions >= 2.25.
My system is:
- Operating System: Ubuntu 20.04
- R: Version: 4.0.4
- brms Version: 2.15.0
- rstan version: 2.26.1
- StanHeaders: 2.26.1
I am trying to run the model:
m1 <- brm(weight ~ Time, chains = 4, cores = 4, threads = threading(2), backend = "rstan", data = ChickWeight)
which fails to compile with the following output:
> m1 <- brm(weight ~ Time, chains = 4, cores = 4, threads = threading(2), backend = "rstan", data = ChickWeight)
Compiling Stan program...
During startup - Warning messages:
1: Setting LC_CTYPE failed, using "C"
2: Setting LC_COLLATE failed, using "C"
3: Setting LC_TIME failed, using "C"
4: Setting LC_MESSAGES failed, using "C"
5: Setting LC_MONETARY failed, using "C"
6: Setting LC_PAPER failed, using "C"
7: Setting LC_MEASUREMENT failed, using "C"
make cmd is
make -f '/usr/lib/R/etc/Makeconf' -f '/usr/share/R/share/make/shlib.mk' -f '/home/julian/.R/Makevars' CXX='$(CXX14) $(CXX14STD)' CXXFLAGS='$(CXX14FLAGS)' CXXPICFLAGS='$(CXX14PICFLAGS)' SHLIB_LDFLAGS='$(SHLIB_CXX14LDFLAGS)' SHLIB_LD='$(SHLIB_CXX14LD)' SHLIB='file29064b2c005a.so' OBJECTS='file29064b2c005a.o'
make would use
"/bin/g++" -std=gnu++14 -I"/usr/share/R/include" -DNDEBUG -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/Rcpp/include/" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/RcppEigen/include/" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/RcppEigen/include/unsupported" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/BH/include" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/include/src/" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/include/" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/RcppParallel/include/" -I"/home/julian/R/x86_64-pc-linux-gnu-library/4.0/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DUSE_STANC3 -DSTRICT_R_HEADERS -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION -DBOOST_NO_AUTO_PTR -include '/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/include/stan/math/prim/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -fpic -O3 -march=native -mtune=native -fPIC -c file29064b2c005a.cpp -o file29064b2c005a.o
if test "zfile29064b2c005a.o" != "z"; then \
echo "/bin/g++" -std=gnu++14 -shared -L"/usr/lib/R/lib" -Wl,-Bsymbolic-functions -Wl,-z,relro -o file29064b2c005a.so file29064b2c005a.o '/home/julian/R/x86_64-pc-linux-gnu-library/4.0/rstan/lib//libStanServices.a' -L'/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/lib/' -lStanHeaders -L'/home/julian/R/x86_64-pc-linux-gnu-library/4.0/RcppParallel/lib/' -ltbb -L"/usr/lib/R/lib" -lR; \
"/bin/g++" -std=gnu++14 -shared -L"/usr/lib/R/lib" -Wl,-Bsymbolic-functions -Wl,-z,relro -o file29064b2c005a.so file29064b2c005a.o '/home/julian/R/x86_64-pc-linux-gnu-library/4.0/rstan/lib//libStanServices.a' -L'/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/lib/' -lStanHeaders -L'/home/julian/R/x86_64-pc-linux-gnu-library/4.0/RcppParallel/lib/' -ltbb -L"/usr/lib/R/lib" -lR; \
fi
Error in compileCode(f, code, language = language, verbose = verbose) :
/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/include/tbb/parallel_reduce.h:270:44: required from 'void tbb::interface9::internal::start_deterministic_reduce<Range, Body, Partitioner>::run_body(Range&) [with Range = tbb::blocked_range<long unsigned int>; Body = stan::math::internal::reduce_sum_impl<model29061cad3d7c__namespace::partial_log_lik_lpmf_rsfunctor__<false>, void, stan::math::var_value<double>, const std::vector<int>&, const Eigen::Matrix<double, -1, 1, 0, -1, 1>&, const Eigen::Matrix<double, -1, -1, 0, -1, -1>&, Eigen::Matrix<stan::math::var_value<double, void>, -1, 1, 0, -1, 1>&, stan::math::var_value<double, void>&, stan::math::var_value<double, void>&>::recursive_reducer; Partitioner = const tbb::simple_partitioner]'/home/julian/R/x86_64-pc-linux-gnu-library/4.0/StanHeaders/include/tbb/partitioner.h:507:9: required from 'void tbb::interface9::internal::simple_partition_type::execute(StartType&, Range&) [with StartType = tbb::interface9::internal
Error in sink(type = "output") : invalid connection
The Stan code of the model looks as follows:
> make_stancode(weight ~ Time, chains = 4, cores = 4, threads = threading(2), data = ChickWeight)
// generated with brms 2.15.0
functions {
/* Build the integer sequence start, start + 1, ..., end.
* Args:
* start: first integer of the sequence
* end: last integer of the sequence (inclusive)
* Returns:
* an integer array with end - start + 1 elements running from start to end
*/
int[] sequence(int start, int end) {
int seq[end - start + 1];
for (n in 1:num_elements(seq)) {
// Stan arrays are 1-indexed, so position n holds start + (n - 1)
seq[n] = n + start - 1;
}
return seq;
}
// compute partial sums of the log-likelihood:
// returns the log-likelihood contribution of observations start..end only.
// Passed as the first argument to reduce_sum in the model block.
// Args:
//   seq: slice of the index array handed over by reduce_sum (not read in the body)
//   start, end: first and last observation index covered by this call
//   Y, Xc: full response vector and centered design matrix (sliced below)
//   b, Intercept, sigma: model parameters forwarded unchanged to the GLM density
// Returns:
//   the normal_id_glm_lpdf value for rows start..end
real partial_log_lik_lpmf(int[] seq, int start, int end, vector Y, matrix Xc, vector b, real Intercept, real sigma) {
real ptarget = 0;
// number of observations in this slice; not used further in this function
int N = end - start + 1;
// only rows start..end of Y and Xc enter the likelihood for this call
ptarget += normal_id_glm_lpdf(Y[start:end] | Xc[start:end], Intercept, b, sigma);
return ptarget;
}
}
// inputs supplied to the model from outside the Stan program
data {
int<lower=1> N; // total number of observations
vector[N] Y; // response variable
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
int grainsize; // grainsize for threading (passed to reduce_sum in the model block)
int prior_only; // should the likelihood be ignored? (nonzero skips the likelihood term)
}
// quantities derived once from the data: centered predictors and the index array for reduce_sum
transformed data {
int Kc = K - 1; // number of columns of X that get centered (all but the first)
matrix[N, Kc] Xc; // centered version of X without an intercept
vector[Kc] means_X; // column means of X before centering
int seq[N] = sequence(1, N); // indices 1..N; the array that reduce_sum slices over
// start at column 2: the first column of X is left out of Xc
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}
// sampled parameters; b has one entry per column of the centered design matrix Xc
parameters {
vector[Kc] b; // population-level effects
real Intercept; // temporary intercept for centered predictors (converted to b_Intercept in generated quantities)
real<lower=0> sigma; // residual SD
}
// intentionally empty: this model defines no transformed parameters
transformed parameters {
}
// log-density: threaded likelihood (unless prior_only is set) plus the priors
model {
// likelihood including constants
if (!prior_only) {
// reduce_sum evaluates partial_log_lik_lpmf on slices of seq (slice size steered by grainsize)
// and sums the partial results; this is the call that enables within-chain threading
target += reduce_sum(partial_log_lik_lpmf, seq, grainsize, Y, Xc, b, Intercept, sigma);
}
// priors including constants
target += student_t_lpdf(Intercept | 3, 103, 69.7);
// sigma is declared with lower=0, so the student-t density is renormalized by
// subtracting the log-probability mass above 0 (student_t_lccdf evaluated at 0)
target += student_t_lpdf(sigma | 3, 0, 69.7)
- 1 * student_t_lccdf(0 | 3, 0, 69.7);
// note: no prior statement for b appears in this block
}
// derived outputs computed from the parameters
generated quantities {
// actual population-level intercept:
// Intercept refers to the centered predictors, so the centering is undone
// by subtracting the column means weighted by the coefficients
real b_Intercept = Intercept - dot_product(means_X, b);
}
The models
m2 <- brm(weight ~ Time, chains = 4, cores = 4, backend = "rstan", data = ChickWeight)
and
m3 <- brm(weight ~ Time, chains = 4, cores = 4, threads = threading(2), backend = "cmdstanr", data = ChickWeight)
both work. The main reason I want to use rstan is that it works with loo_moment_match, which, AFAIK, is not possible with cmdstanr at the moment (?).
Does anyone know why the model with rstan + threading fails to compile?
Thanks in advance!
Julian