Is it impossible to compute multiple models in parallel using brms with `backend = "cmdstanr"`?

I am trying to compute multiple models in parallel on my computer with 16 logical cores. To do so, I use brms::brm() inside furrr::future_map(), and using four workers to run furrr::future_map() by setting plan(tweak(multisession, workers = 4)). In addition, I use brms::brm() with its backend set to cmdstanr (i.e. brms::brm(..., backend = "cmdstanr")), since cmdstanr’s computation is fast. Moreover, I set the number of cores to use when executing the chains in parallel as four (i.e. brms::brm(..., cores = 4)), as I have four chains per model.

However, when these settings are implemented, as shown below, and multiple models are computed in parallel using furrr::future_map(), the error collect2.exe: error: ld returned 1 exit status occurs and the execution halts. The problem is resolved when I use purrr::map() instead of furrr::future_map() but purrr::map() does not parallelise the computation of multiple models.

Does the problem happen because I am trying to do nested parallelism where four workers compute models (by setting plan(tweak(multisession, workers = 4))) and each worker deals with additional four cores set by brms::brm(..., cores = 4)? Then, is it impossible to compute multiple models in parallel using brms as shown in here or @matti 's post on RPubs when backend = "cmdstanr" is set?

bayes_samples_Nsim <- samples_Nsim |>
  group_by(sample_id) |>
  nest(samples = samples) |>
  mutate(
    Bayesian_model = future_map(
      .options = furrr_options(
        scheduling = 0,
        seed = 7,
       prefix = "prefix"
      ),
      samples,
      ~ brm(
        samples ~ 1,
        data = .,
        family = gaussian(),
        # The prior can be more constrained
        # then the following weak prior:
        # prior = prior(normal(10, 5), class = Intercept),
        prior = prior(normal(10, 1), class = Intercept),
        chains = 4,
        cores = 4,
        control = list(adapt_delta = 0.95),
        backend = "cmdstanr",
        iter = 2000,
        warmup = 1000
      )
    ),
    estimates =
      map_dfr(
        .x = Bayesian_model,
        ~ fixef(.x) |>
          as_tibble()
      ),
    Bayesian_estimate = estimates$Estimate,
    Bayesian_std_error = estimates$`Est.Error`,
    Bayesian_lower = estimates$`Q2.5`,
    Bayesian_upper = estimates$`Q97.5`
  )

Error message

C
:\rtools42/x86_64-w64-mingw32.static.posix/bin/ld.exe: 
src/cmdstan/main.o:main.cpp:(.text$_ZN4stan2io15stan_csv_reader13read_metadataERSiRNS0_17stan_csv_metadataEPSo[_ZN4stan2io15stan_csv_reader13read_metadataERSiRNS0_17stan_csv_metadataEPSo]+0xe3): undefined reference to `std::istream::seekg(std::fpos<int>)'
C:\rtools42/x86_64-w64-mingw32.static.posix/bin/ld.exe:
s
rc/cmdstan/main.o:main.cpp:(.text$_ZN4stan2io15stan_csv_reader15read_adaptationERSiRNS0_19stan_csv_adaptationEPSo[_ZN4stan2io15stan_csv_reader15read_adaptationERSiRNS0_19stan_csv_adaptationEPSo]+0x164): undefined reference to `std::istream::seekg(std::fpos<int>)'
C:\rtools42/x86_64-w64-mingw32.static.posix/bin/ld.exe:
s
rc/cmdstan/main.o:main.cpp:(.text$_ZN4stan2io15stan_csv_reader12read_samplesERSiRN5Eigen6MatrixIdLin1ELin1ELi0ELin1ELin1EEERNS0_15stan_csv_timingEPSo[_ZN4stan2io15stan_csv_reader12read_samplesERSiRN5Eigen6MatrixIdLin1ELin1ELi0ELin1ELin1EEERNS0_15stan_csv_timingEPSo]+0x74d): undefined reference to `std::istream::seekg(std::fpos<int>)'       

collect2.exe: error: ld returned 1 exit status

mingw32-make.exe: *** [make/program:59: C:\Users\my_username\AppData\Local\Temp\RtmpyKRff8\model-2d24605a3f2e.exe] Error 1

Error in `mutate()`:
! Problem while computing `Bayesian_model = future_map(...)`.
ℹ The error occurred in group 1: sample_id = "1".
Caused by error:
! An error occured during compilation! See the message above for more information.
Run `rlang::last_error()` to see where the error occurred.

MWE

library("tidyverse")
library("brms")
library("cmdstanr")
library("future")
library("furrr")
plan(tweak(multisession, workers = 4))

alpha <- 0.05
n <- 100
Nsim <- 10
mu <- 10
sigma <- 5

set.seed(20220718)

samples_Nsim <- 1:Nsim |>
  map(
    ~ rnorm(
      n, mu, sigma
    )
  ) |>
  map_dfr(
    .id = "sample_id",
    ~ {
      sample_mean <- mean(x = .x)
      sample_sd <- sd(x = .x)
      half_interval <- sample_sd / sqrt(n) * qt(1 - alpha * 0.5, df = n - 1)
      upper <- sample_mean + half_interval
      lower <- sample_mean - half_interval
      tibble(
        samples = .x,
        sample_mean = sample_mean,
        sample_sd = sample_sd,
        upper = upper,
        lower = lower,
        mu_out_of_CI = upper < mu | lower > mu
      )
    }
  )

tictoc::tic()

# 245.58 sec = 4 min 5 sec elapsed to calculate 10 model with iter = 2000 using purrr:map
# 2309.72 sec = 38 min 29 sec elapsed to calculate 100 model with iter = 2000 using purrr:map

bayes_samples_Nsim <- samples_Nsim |>
  group_by(sample_id) |>
  nest(samples = samples) |>
  mutate(
    Bayesian_model = future_map(
      .options = furrr_options(
        scheduling = 0,
        seed = 7,
       prefix = "prefix"
      ),
      samples,
      ~ brm(
        samples ~ 1,
        data = .,
        family = gaussian(),
        # The prior can be more constrained
        # then the following weak prior:
        # prior = prior(normal(10, 5), class = Intercept),
        prior = prior(normal(10, 1), class = Intercept),
        chains = 4,
        cores = 4,
        control = list(adapt_delta = 0.95),
        backend = "cmdstanr",
        iter = 2000,
        warmup = 1000
      )
    ),
    estimates =
      map_dfr(
        .x = Bayesian_model,
        ~ fixef(.x) |>
          as_tibble()
      ),
    Bayesian_estimate = estimates$Estimate,
    Bayesian_std_error = estimates$`Est.Error`,
    Bayesian_lower = estimates$`Q2.5`,
    Bayesian_upper = estimates$`Q97.5`
  )

tictoc::toc()

Session Info

Operating System

  • Windows 10 Pro (x86_64-w64-mingw32)
  • R version 4.2.0 (2022-04-22 ucrt)

package version

  • brms 2.17.0
  • cmdstanr 0.5.2
  • future 1.25.0
  • furrr 0.3.0
  • tidyverse 1.3.1

It’s possible to do this with {rstan} (you might try it with the up to date experimental branch) and {future} but as I understand it because {cmdstanr} is just a light-weight wrapper that calls cmdstan behind the scenes, parallelizing models via {future} and a cmdstanr backend is complicated if not impossible and is not currently supported by {brms}. IMO, since you have 8 physical cores you’d probably have more success fitting the models sequentially using four chains and 2 threads per chain via brms’ threads argument.

1 Like

@ajnafa

Thank you for your reply.

parallelizing models via {future} and a cmdstanr backend is complicated if not impossible and is not currently supported by {brms}

Would you please let me know how to fit ‘raw’ cmdstanr models parallelly using {future}, or would you tell me any reference that might explain the method to achieve that? I modified the model fitting process so that not brm(..., backend = "cmdstanr") but cmdstan_model(...)$sample(...) actually fit the models. Do you think that anything more could be added to this modification to enable parallel processing using {future}?

bayes_samples_Nsim_without_brm <- samples_Nsim |>
  group_by(sample_id) |>
  nest(samples = samples) |>
  mutate(
    ## TODO: create data to pass Stan
    ## and actually pass them to fitting process
    ##
    ## stan_data = map(
    ##   samples,
    ##   ~ make_standata(
    ##     samples ~ 1,
    ##     data = .x,
    ##     family = gaussian(),
    ##     prior = prior(normal(10, 1), class = Intercept)
    ##   )
    ## ),
    Bayesian_model = map( # future_map(
      # .options = furrr_options(
      #  scheduling = 0,
      #  seed = 7,
      #  prefix = "prefix"
      # ),
      samples,
      ~ make_stancode(
        samples ~ 1,
        data = .x,
        family = gaussian(),
        prior = prior(normal(10, 1), class = Intercept),
        chains = 4,
        cores = 4,
        control = list(adapt_delta = 0.95),
        backend = "cmdstanr",
      )
    ) |>
      map(
        ~ write_stan_file(.x)
      ) |>
      map(
        ~ cmdstan_model(
          stan_file = .x,
          # force_recompile = TRUE,
          cpp_options = list(stan_threads = TRUE)
        )$sample(
          ## Manual data preparation using `list()` should be automated
          ## by supplying `stan_data` above to this argument.
          ## Currently this cannot be done successfully
          data = list(
            N = n,
            Y = samples |>
              unlist(),
            prior_only = 0
          ),
          seed = 123,
          iter_sampling = 4000,
          iter_warmup = 10000,
          chains = 4,
          threads_per_chain = 3,
          parallel_chains = 4
        )
      ),
    estimates =
      map(
        .x = Bayesian_model,
        ~ posterior::summarise_draws(
          .x,
          mean,
          sd,
          q2.5 = ~ quantile(.x, probs = c(0.025)),
          q97.5 = ~ quantile(.x, probs = c(0.975))
        ) |>
          filter(variable == "Intercept")
      ),
    Bayesian_estimate = map_dbl(
      .x = estimates,
      ~ .x |>
        select(mean) |>
        pull()
    ),
    Bayesian_std_error = map_dbl(
      .x = estimates,
      ~ .x |>
        select(sd) |>
        pull()
    ),
    Bayesian_lower = map_dbl(
      .x = estimates,
      ~ .x |>
        select(`2.5%`) |>
        pull()
    ),
    Bayesian_upper = map_dbl(
      .x = estimates,
      ~ .x |>
        select(`97.5%`) |>
        pull()
    )
  )

… since you have 8 physical cores you’d probably have more success fitting the models sequentially using four chains and 2 threads per chain via brms’ threads argument.

In fact, I found that there are only 12 logical cores (thus 6 physical cores) in my environment… Apologies for the confusion. In the above modification, I set chains = 4, threads_per_chain = 3, parallel_chains = 4 in cmdstan_model(...)$sample(...) so that 4 chains are computed in parallel (parallel_chains = 4) and each chain is computed with 3 threads, and all 12 logical cores should be used. Is this understanding correct?

I am investigating the cmdstanr issue, but I am almost certain that it is stemming from compiling the model. I have been able to run sampling with future, but have never used it for the compiling part as well.

2 Likes

@rok_cesnovar

Thank you for your comment.

I am almost certain that it is stemming from compiling the model. I have been able to run sampling with future , but have never used it for the compiling part as well

Do you think it would work if I separate the compiling part and sampling part and pass the compiling part (e.g. cmdstan_model(...)$compile()) to purrr::map() (sequential computation) and the sampling part (e.g. cmdstan_model(...)$sample()) to furrr::future_map() (parallel computation)? I will try to implement this later but I would like to know whether this is a promising way.

I actually se no reason to parallelize compilation. Just compile the model once and then use the same executable for all $sample() calls.

2 Likes

@rok_cesnovar

Thank you for your suggestion and apologies for my late reply.

I finally managed to write more efficient code, separating the compilation part and sampling part, as shown below. Since I fit one model to multiple simulation data, I put the compilation procedure outside of future_map, as you suggested. In the compilation part, I use brms::make_stancode() to write Stan commands automatically and accurately, and pass the commands to cmdstanr’s write_stan_file()and cmdstan_model() for the actual compilation.

The sampling part is executed using furrr::future_map() to iterate CmdStanModel$sample(..., chains = 4, threads_per_chain = 3, parallel_chains = 4) using all 12 logical cores I have. I confirmed by observing Task manager’s performance tab that all cores were used in this sampling procedure.

It took only 151.69 seconds to perform 100 simulations of 6000 iterations (2000 warm-up + 4000 post-warm-up) per simulation. Since it had taken approximately 40 min to do the simulation when using purrr::map(), we can enjoy drastic and dramatic advancement in speed.

I really appreciate your supportive comments, @rok_cesnovar and @ajnafa!

model <- make_stancode(
  samples ~ 1,
  ## no matter which data is specified for this argument,
  ## the same model (Stan code) is built,
  ## if common priors are specified across the simulations.
  data = samples_Nsim |>
    filter(sample_id == 1) |>
    dplyr::select(samples),
  family = gaussian(),
  prior = prior(normal(10, 1), class = Intercept) +
    prior(normal(0, 5), class = sigma),
  chains = 4,
  cores = 4,
  control = list(adapt_delta = 0.95),
  backend = "cmdstanr"
) |>
  write_stan_file() |>
  cmdstan_model(
    # force_recompile = TRUE,
    cpp_options = list(stan_threads = TRUE)
  )

bayes_samples_Nsim_without_brm <- samples_Nsim |>
  group_by(sample_id) |>
  nest(samples = samples) |>
  ungroup() |>
  mutate(
    stan_data = map(
      samples,
      ~ make_standata(
        samples ~ 1,
        data = .x,
        family = gaussian(),
        prior = prior(normal(10, 1), class = Intercept)
      )
    ),
    Bayesian_model = # map(
      future_map(
        .x = stan_data,
        .options = furrr_options(
          scheduling = 1,
          seed = TRUE,
          prefix = "prefix"
        ),
        ~ model$sample(
          data = .x,
          seed = 123,
          iter_sampling = 4000,
          iter_warmup = 2000,
          chains = 4,
          threads_per_chain = 3,
          parallel_chains = 4
        )
      ),
    estimates =
      map(
        .x = Bayesian_model,
        ~ posterior::summarise_draws(
          .x,
          mean,
          sd,
          q2.5 = ~ quantile(.x, probs = c(0.025)),
          q97.5 = ~ quantile(.x, probs = c(0.975))
        ) |>
          filter(variable == "Intercept")
      ),
    Bayesian_estimate = map_dbl(
      .x = estimates,
      ~ .x |>
        select(mean) |>
        pull()
    ),
    Bayesian_std_error = map_dbl(
      .x = estimates,
      ~ .x |>
        select(sd) |>
        pull()
    ),
    Bayesian_lower = map_dbl(
      .x = estimates,
      ~ .x |>
        select(`2.5%`) |>
        pull()
    ),
    Bayesian_upper = map_dbl(
      .x = estimates,
      ~ .x |>
        select(`97.5%`) |>
        pull()
    )
  )
4 Likes

Great news!

3 Likes