Call Stan-math gradient() from separate threads => segfault

tl;dr Calling a C shared library, which uses stan::math::gradient, from separate Julia threads segfaults and I don’t understand why, while a similar function (called the same way) without stan::math::gradient runs fine. Does anyone have any ideas why spawning separate threads, each of which calls stan::math::gradient, doesn’t work?

Julia has a nice interface for calling C functions compiled as shared libraries. One can successfully write a C shared library which calls a C++ shared library which itself uses Stan-math. Here’s a minimal reproducible example containing two C functions: logistic, which segfaults when called from Julia on separate threads, and logisticm, which runs fine when called from Julia on separate threads.

C, C++, and Makefile files

C

#include "models.h"

extern void logistic(double* _fx, double* _grad_fx,
                     double* _theta, double* _X, double* _y, int _N, int _K) {
  _logistic(_fx, _grad_fx, _theta, _X, _y, _N, _K);
}

extern void logisticm(double* _fx, double* _grad_fx,
                      double* _theta, double* _X, double* _y, int _N, int _K) {
  _logisticm(_fx, _grad_fx, _theta, _X, _y, _N, _K);
}

and header file

#ifndef MODELS_H
#define MODELS_H

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

  void logistic(double* _fx, double* _grad_fx,
                double* _theta, double* _X, double* _y, int _N, int _K);
  void _logistic(double* _fx, double* _grad_fx,
                 double* _theta, double* _X, double* _y, int _N, int _K);

  void logisticm(double* _fx, double* _grad_fx,
                 double* _theta, double* _X, double* _y, int _N, int _K);
  void _logisticm(double* _fx, double* _grad_fx,
                  double* _theta, double* _X, double* _y, int _N, int _K);

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* MODELS_H */

C++

#include "models.h"
#include <exception>
#include <cmath>
#include <stan/math.hpp>

extern "C" {

  void _logistic(double* _fx,
                 double* _grad_fx,
                 double* _theta,
                 double* _X,
                 double* _y,
                 int _N,
                 int _K) {
    Eigen::VectorXd grad_fx;

    int D = _K + 1;  // intercept plus K slopes
    // Map the raw C arrays as Eigen objects without copying.
    const Eigen::Map<Eigen::MatrixXd> X(_X, _N, _K);
    const Eigen::Map<Eigen::VectorXd> y(_y, _N);
    const Eigen::Map<Eigen::VectorXd> theta(_theta, D);

    // Reverse-mode autodiff: the functor returns the negative log posterior
    // of a logistic regression with standard normal priors (up to constants).
    stan::math::gradient([&X, &y, &_K](auto q) {

                           double lp(0.0);
                           stan::math::accumulator<stan::math::var_value<double> > lp_accum;

                           try {

                             lp_accum.add(stan::math::normal_lpdf<true>(q, 0, 1));
                             auto a = stan::math::add(q(0), stan::math::multiply(X, q.tail(_K).eval()));
                             lp_accum.add(stan::math::bernoulli_logit_lpmf<true>(y, a));

                           } catch (const std::exception& e) {}  // swallow errors; keep what was accumulated

                           lp_accum.add(lp);
                           return -lp_accum.sum();

                         }, theta, *_fx, grad_fx);

    // copy out
    // could avoid this copy with new gradient method
    for (int d = 0; d < D; d++) {
      _grad_fx[d] = grad_fx(d);
    }
  }

  void _logisticm(double* _fx,
                  double* _grad_fx,
                  double* _theta,
                  double* _X,
                  double* _y,
                  int _N,
                  int _K) {

    int D = _K + 1;
    const Eigen::Map<Eigen::MatrixXd> X(_X, _N, _K);
    const Eigen::Map<Eigen::VectorXd> y(_y, _N);
    const Eigen::Map<Eigen::VectorXd> theta(_theta, D);

    // Standard normal prior on the intercept.
    *_fx = -0.5 * std::pow(theta(0), 2);
    _grad_fx[0] = -theta(0);

    Eigen::VectorXd Xb = X * theta.tail(_K);

    for (int n = 0; n < _N; n++) {
      auto tmp = theta(0) + Xb(n);    // linear predictor eta_n
      auto s = 2.0 * y(n) - 1.0;      // map y in {0,1} to s in {-1,+1}
      auto nsa = std::exp(-s * tmp);  // exp(-s_n * eta_n)
      *_fx += -std::log1p(nsa);       // log-likelihood term -log(1 + exp(-s_n * eta_n))
      Xb(n) = s * nsa / (1 + nsa);    // reuse Xb to hold d(loglik_n)/d(eta_n)
      _grad_fx[0] += Xb(n);
    }

    // Standard normal priors on the slopes plus the chain-rule terms.
    for (int k = 0; k < _K; k++) {
      *_fx += -0.5 * std::pow(theta(k + 1), 2);
      _grad_fx[k + 1] = -theta(k + 1) + X.col(k).dot(Xb);
    }
  }
}

Makefile

Edit the paths appropriately.

%.so: logistic.cpp
	clang++ -shared -fPIC -std=c++1y -O3 -D_REENTRANT -DBOOST_DISABLE_ASSERTS \
	  -Wno-unknown-warning-option -Wno-tautological-compare -Wno-sign-compare \
	  -Wno-ignored-attributes \
	  -I /Users/ez/math/ \
	  -I /Users/ez/math/lib/tbb_2020.3/include \
	  -I /Users/ez/math/lib/eigen_3.3.9 \
	  -I /Users/ez/math/lib/boost_1.75.0 \
	  -I /Users/ez/math/lib/sundials_6.1.1/include \
	  -I /Users/ez/math/lib/sundials_6.1.1/src/sundials \
	  -Wl,-L,"/Users/ez/math/lib/tbb" -Wl,-rpath,"/Users/ez/math/lib/tbb" \
	  logistic.cpp \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_nvecserial.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_cvodes.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_idas.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_kinsol.a \
	  /Users/ez/math/lib/tbb/libtbb.dylib \
	  /Users/ez/math/lib/tbb/libtbbmalloc.dylib \
	  /Users/ez/math/lib/tbb/libtbbmalloc_proxy.dylib \
	  -o logistic_lib.so

make: models.c logistic_lib.so
	clang -o models.o -c models.c -O3
	clang -shared -o stan_models.so models.o logistic_lib.so -lm -fPIC -O3

clean:
	rm models.o stan_models.so logistic_lib.so

The function logistic uses stan::math::gradient. The function logisticm calculates the same thing manually (up to the sign). The fact that logisticm works as intended makes me think this is a Stan-math issue, not a Julia issue.
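
For concreteness, here is what both functions compute, read directly off the code above (so this is just a restatement, not anything new). With $s_n = 2y_n - 1 \in \{-1,+1\}$ and $\eta_n = \theta_0 + x_n^\top \theta_{1:K}$,

$$
\mathrm{lp}(\theta) = -\tfrac{1}{2}\sum_{d=0}^{K}\theta_d^2 - \sum_{n=1}^{N}\log\left(1 + e^{-s_n\eta_n}\right),
$$

$$
\frac{\partial\,\mathrm{lp}}{\partial\theta_0} = -\theta_0 + \sum_{n=1}^{N} s_n\,\frac{e^{-s_n\eta_n}}{1 + e^{-s_n\eta_n}},
\qquad
\frac{\partial\,\mathrm{lp}}{\partial\theta_k} = -\theta_k + \sum_{n=1}^{N} x_{nk}\,s_n\,\frac{e^{-s_n\eta_n}}{1 + e^{-s_n\eta_n}},
\quad k = 1,\dots,K.
$$

logisticm fills in lp and this gradient directly, while logistic returns $-\mathrm{lp}$ and its gradient, with additive constants already dropped by normal_lpdf<true> and bernoulli_logit_lpmf<true>.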

Here’s a minimal reproducible example Julia file which works with the above files, so long as everything is in the same directory.

Julia file
stanlib = joinpath(@__DIR__, "stan_models.so")

df = [-1.25 -1.23 0.0; -0.45 2.24 1.0];
q = randn(3);
N = size(df, 1);
K = size(df, 2) - 1;
y = df[:, K+1];
X = df[:, 1:K];
fx = zeros(1);
grad_fx = zeros(K+1);


# change to :logistic; this will still work
ccall((:logisticm, stanlib),
      Cvoid,
      (Ref{Cdouble},
       Ref{Cdouble},
       Ref{Cdouble},
       Ref{Cdouble},
       Ref{Cdouble},
       Cint,
       Cint),
      fx, grad_fx, q, X, y, N, K)


C = 10;
# start Julia with more than 1 thread. $ julia -t 2
nt = Threads.nthreads();
fxs = [zeros(1) for _ in 1:C];
grad_fxs = [zeros(K+1) for _ in 1:C];

@sync for it in 1:nt
    Threads.@spawn for c in it:nt:C
        ccall((:logisticm, stanlib), # change to :logistic; this borks
              Cvoid,
              (Ref{Cdouble},
               Ref{Cdouble},
               Ref{Cdouble},
               Ref{Cdouble},
               Ref{Cdouble},
               Cint,
               Cint),
              fxs[c], grad_fxs[c], q, X, y, N, K)
    end
end

My understanding is that the @spawn-ed threads from Julia are independent, despite sharing a few read-only variables. My conceptual model, which I believed I had confirmed with the C function logisticm, is that calling this shared library from independent threads should work fine. That model breaks down, however, as soon as I call stan::math::gradient, so my conceptual model or my understanding must be off somewhere.

Any comments/suggestions/tips are much appreciated. Thanks in advance.

System info:
Stan-math v4.3.2
macOS 12.2
Julia Version 1.7.2
Apple clang version 11.0.3

I believe the memory arena used by Stan’s autodiff is stored in a global variable which is not thread-local. This means that gradient calculations cannot be run in threaded programs.

It’s difficult to find exact information on this (and it probably shouldn’t be), but the comments on an old blog post mention this and point to the Stan autodiff paper. That paper details the global arena and includes this snippet in the comparison section:

Stan can be run in thread safe mode at roughly a 20% penalty in performance by using thread-local instances of the stack representing the expression graph.

I think this “can” is purely theoretical, i.e. there is not a flag you can set which will just make it do so.

It appears I spoke too soon: according to the math docs, you should be able to define -DSTAN_THREADS to make the autodiff stack thread-local.
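
Concretely, the flag switches the AD tape’s storage from one process-wide pointer to one pointer per thread, along these lines (a paraphrased sketch of stan/math/rev/core/autodiffstackstorage.hpp, not the verbatim source):

// Sketch only: with STAN_THREADS the AD tape storage becomes thread_local,
// so each thread records and chains its own expression graph.
struct AutodiffStackStorage;  // holds the arena and expression-graph nodes

#ifdef STAN_THREADS
static thread_local AutodiffStackStorage* instance_ = nullptr;  // one tape per thread
#else
static AutodiffStackStorage* instance_ = nullptr;  // one tape shared by all threads
#endif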


Well… things are NOT thread-local by default. You have to define STAN_THREADS to get a proper thread-local autodiff tape, which is required for multi-threaded use. However, note that Stan math assumes that the function you want to autodiff must not itself use multiple threads! Some care is needed to make that work; this care is coded into reduce_sum and map_rect, which do span multiple threads.


I tried defining STAN_THREADS as

%.so: logistic.cpp
	clang++ -DSTAN_THREADS -shared -fPIC -std=c++1y -O3 -D_REENTRANT -DBOOST_DISABLE_ASSERTS \
	  -Wno-unknown-warning-option -Wno-tautological-compare -Wno-sign-compare \
	  -Wno-ignored-attributes \
	  -I /Users/ez/math/ \
	  -I /Users/ez/math/lib/tbb_2020.3/include \
	  -I /Users/ez/math/lib/eigen_3.3.9 \
	  -I /Users/ez/math/lib/boost_1.75.0 \
	  -I /Users/ez/math/lib/sundials_6.1.1/include \
	  -I /Users/ez/math/lib/sundials_6.1.1/src/sundials \
	  -Wl,-L,"/Users/ez/math/lib/tbb" -Wl,-rpath,"/Users/ez/math/lib/tbb" \
	  logistic.cpp \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_nvecserial.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_cvodes.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_idas.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_kinsol.a \
	  /Users/ez/math/lib/tbb/libtbb.dylib \
	  /Users/ez/math/lib/tbb/libtbbmalloc.dylib \
	  /Users/ez/math/lib/tbb/libtbbmalloc_proxy.dylib \
	  -o logistic_lib.so

make: models.c logistic_lib.so
	clang -o models.o -c models.c -O3
	clang -shared -o stan_models.so models.o logistic_lib.so -lm -fPIC -O3

but unfortunately, this did not work. The error I’m getting is

Segmentation fault: 11
in expression starting at none:1
_ZN4stan4math19nested_rev_autodiff

where the crash appears to originate in stan::math::nested_rev_autodiff.

It’s my understanding that the function I’m trying to autodiff does not use multiple threads.


    stan::math::gradient([&X, &y, &_K](auto q) {

                           double lp(0.0);
                           stan::math::accumulator<stan::math::var_value<double> > lp_accum;

                           try {

                             lp_accum.add(stan::math::normal_lpdf<true>(q, 0, 1));
                             auto a = stan::math::add(q(0), stan::math::multiply(X, q.tail(_K).eval()));
                             lp_accum.add(stan::math::bernoulli_logit_lpmf<true>(y, a));

                           } catch (const std::exception& e) {}

                           lp_accum.add(lp);
                           return -lp_accum.sum();

                         }, theta, *_fx, grad_fx);

@wds15 are there other constraints necessary to make the thread-local autodiff tape work?

You need to link in this file: stan/math/rev/core/init_chainablestack.hpp

This uses Intel TBB tricks to initialise the autodiff tape in any TBB thread. In case the thread your code runs in is not a TBB thread, you need to do the initialisation yourself: for each thread, an instance of ChainableStack must be created once. This object holds the thread-local resources for the AD tape. If you use Intel TBB threads and link the above in, things should magically just work, but static linker weirdness can sometimes mess this up.
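
To spell that out, here is a minimal sketch of the manual initialisation for a thread the TBB scheduler does not manage (assuming the library was built with -DSTAN_THREADS; worker_entry is a made-up name for whatever function first runs on the foreign thread):

#include <stan/math.hpp>

void worker_entry() {
  // Create this thread's AD tape resources once; the ChainableStack
  // constructor registers the thread-local instance that
  // stan::math::gradient and friends use on this thread.
  stan::math::ChainableStack thread_instance;

  // ... from here on, stan::math::gradient(...) is safe on this thread ...
}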


@wds15 That’s it! Thanks so much.

So that there’s a record of a working solution, here are the only changed files.

Makefile: add -DSTAN_THREADS
C++: add #include <stan/math/rev/core/init_chainablestack.hpp> and declare stan::math::ChainableStack thread_instance; before the call to stan::math::gradient.

Makefile
%.so: logistic.cpp
	clang++ -DSTAN_THREADS -shared -fPIC -std=c++1y -O3 -D_REENTRANT -DBOOST_DISABLE_ASSERTS \
	  -Wno-unknown-warning-option -Wno-tautological-compare -Wno-sign-compare \
	  -Wno-ignored-attributes \
	  -I /Users/ez/math/ \
	  -I /Users/ez/math/lib/tbb_2020.3/include \
	  -I /Users/ez/math/lib/eigen_3.3.9 \
	  -I /Users/ez/math/lib/boost_1.75.0 \
	  -I /Users/ez/math/lib/sundials_6.1.1/include \
	  -I /Users/ez/math/lib/sundials_6.1.1/src/sundials \
	  -Wl,-L,"/Users/ez/math/lib/tbb" -Wl,-rpath,"/Users/ez/math/lib/tbb" \
	  logistic.cpp \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_nvecserial.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_cvodes.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_idas.a \
	  /Users/ez/math/lib/sundials_6.1.1/lib/libsundials_kinsol.a \
	  /Users/ez/math/lib/tbb/libtbb.dylib \
	  /Users/ez/math/lib/tbb/libtbbmalloc.dylib \
	  /Users/ez/math/lib/tbb/libtbbmalloc_proxy.dylib \
	  -o logistic_lib.so

make: models.c logistic_lib.so
	clang -o models.o -c models.c -O3
	clang -shared -o stan_models.so models.o logistic_lib.so -lm -fPIC -O3

clean:
	rm models.o stan_models.so logistic_lib.so

C++
#include "models.h"
#include <exception>
#include <cmath>
#include <stan/math/rev/core/init_chainablestack.hpp> // new include
#include <stan/math.hpp>

extern "C" {

  void _logistic(double* _fx,
                double* _grad_fx,
                double* _theta,
                double* _X,
                double* _y,
                int _N,
                int _K) {
    Eigen::VectorXd grad_fx;

    int D = _K + 1;
    const Eigen::Map<Eigen::MatrixXd> X(_X, _N, _K);
    const Eigen::Map<Eigen::VectorXd> y(_y, _N);
    const Eigen::Map<Eigen::VectorXd> theta(_theta, D);
    stan::math::ChainableStack thread_instance; // initialize ChainableStack => magic

    stan::math::gradient([&X, &y, &_K](auto q) {
                           double lp(0.0);
                           stan::math::accumulator<stan::math::var_value<double> > lp_accum;

                           try {

                             lp_accum.add(stan::math::normal_lpdf<true>(q, 0, 1));
                             auto a = stan::math::add(q(0), stan::math::multiply(X, q.tail(_K).eval()));
                             lp_accum.add(stan::math::bernoulli_logit_lpmf<true>(y, a));

                           } catch (const std::exception& e) {}

                           lp_accum.add(lp);
                           return -lp_accum.sum();

                         }, theta, *_fx, grad_fx);

    // copy out
    // could avoid this copy with new gradient method
    for (int d = 0; d < D; d++) {
      _grad_fx[d] = grad_fx(d);
    }
  }

  void _logisticm(double* _fx,
                  double* _grad_fx,
                  double* _theta,
                  double* _X,
                  double* _y,
                  int _N,
                  int _K) {

    int D = _K + 1;
    const Eigen::Map<Eigen::MatrixXd> X(_X, _N, _K);
    const Eigen::Map<Eigen::VectorXd> y(_y, _N);
    const Eigen::Map<Eigen::VectorXd> theta(_theta, D);

    *_fx = -0.5 * std::pow(theta(0), 2);
    _grad_fx[0] = -theta(0);

    Eigen::VectorXd Xb = X * theta.tail(_K);

    for (int n = 0; n < _N; n++) {
      auto tmp = theta(0) + Xb(n);
      auto s = 2.0 * y(n) - 1.0;
      auto nsa = std::exp(-s * tmp);
      *_fx += -std::log1p(nsa);
      Xb(n) = s * nsa / (1 + nsa);
      _grad_fx[0] += Xb(n);
    }

    for (int k = 0; k < _K; k++) {
      *_fx += -0.5 * std::pow(theta(k + 1), 2);
      _grad_fx[k + 1] = -theta(k + 1) + X.col(k).dot(Xb);
    }
  }
}

It’s better to keep the ChainableStack object around. This is beneficial if you do repeated gradient calls, since then the memory arena is allocated only once, for the first call.

So maybe make the ChainableStack a static variable.

Maybe I don’t fully understand your latest suggestion. I added static as

...
static stan::math::ChainableStack thread_instance;

stan::math::gradient(...);
...

and curiously this segfaults. Did you have something else in mind?

Sounds like this is a threaded program? If so, then either keep what you have, which recreates the AD tape whenever needed, or make sure that the ChainableStack is created once per thread.
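
Presumably the plain static segfaults because a function-local static is constructed only once process-wide, on whichever thread happens to call first, leaving every other thread without an initialised tape. For the record, a minimal sketch of the once-per-thread variant (my assumption that C++11 thread_local fits here; it was not tested in this thread):

void _logistic(double* _fx, double* _grad_fx,
               double* _theta, double* _X, double* _y, int _N, int _K) {
  // One ChainableStack per calling thread, constructed lazily on that
  // thread's first call and reused by its later calls, so the memory
  // arena is allocated once per thread rather than once per gradient call.
  static thread_local stan::math::ChainableStack ad_tape;

  // ... build the Eigen maps and call stan::math::gradient as before ...
}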

Yep, a threaded program. Sorry if I didn’t make that clear. Thanks again for all your help.