Stan SIMD & Performance

Compiling the following function:

// Multivariate-normal log-density (up to the -N*P/2*log(2*pi) constant) with
// covariance given by its lower-Cholesky factor L, differentiated with
// Stan-math reverse-mode autodiff.
//
// Inputs  (column-major):
//   Y  : N x P observation matrix (stride N)
//   mu : length-P mean vector
//   L  : P x P matrix whose lower triangle holds the Cholesky factor (stride P)
// Outputs (same layouts): dY, dmu, dL receive d(lp)/d(input); only the lower
// triangle of dL is written. Returns lp.
double normal_lpdf_cholesky2(double* dY, double* dmu, double* dL, double* Y, double* mu, double* L, long N, long P){
  Eigen::Matrix<stan::math::var,Eigen::Dynamic,Eigen::Dynamic> vY(N,P), vL(P,P);
  Eigen::Matrix<stan::math::var,1,Eigen::Dynamic> vmu(P);
  for (long p = 0; p < P; p++){
    vmu(p) = mu[p];
    // Only the lower triangle of L is meaningful; the strict upper triangle of
    // vL is left untouched and never read (triangularView below ignores it).
    for (long pr = p; pr < P; pr++){
      vL(pr,p) = L[pr + p*P];
    }
    for (long n = 0; n < N; n++){
      vY(n,p) = Y[n + p*N];
    }
  }

  Eigen::Matrix<stan::math::var,Eigen::Dynamic,Eigen::Dynamic> vDelta = vY.rowwise() - vmu;

  // Delta := Delta * L^{-T}, so row-wise squaredNorm gives (y-mu) Sigma^{-1} (y-mu)'.
  vL.triangularView<Eigen::Lower>().adjoint().solveInPlace<Eigen::OnTheRight>(vDelta);

  // log|Sigma|^{-N/2} contributes -N * sum(log(diag(L))).
  stan::math::var lp = -0.5*(vDelta.squaredNorm()) - N * (vL.diagonal().array().log().sum());

  lp.grad();

  for (long p = 0; p < P; p++){
    dmu[p] = vmu(p).adj();
    for (long pr = p; pr < P; pr++){
      dL[pr + p*P] = vL(pr,p).adj();
    }
    for (long n = 0; n < N; n++){
      dY[n + p*N] = vY(n,p).adj();
    }
  }

  // Read the value BEFORE recovering memory: recover_memory() frees the
  // arena-allocated vari that lp points into.
  double lp_val = lp.val();
  // BUG FIX: Stan-math's autodiff tape lives on a global arena that is NOT
  // freed when the vars go out of scope. Without this call every invocation
  // appends to the same ever-growing stack, so repeated calls (e.g. a
  // benchmark loop) both leak memory and sweep a progressively larger tape in
  // grad() — a major source of the observed slowdown.
  // NOTE(review): if any code above can throw, prefer the RAII guard
  // stan::math::recover_memory() provides via try/catch or
  // stan::math::nested_rev_autodiff — confirm against the Stan-math version in use.
  stan::math::recover_memory();
  return lp_val;
}

and using the same data layout for Y as my Julia version, I now get:

julia> function stan_multinormal_cholesky2!(∂Y, ∂μ, ∂L, Y, μ, L)
           N, P = size(Y)
           ccall(
               (:normal_lpdf_cholesky2, STANLIB0), Cdouble,
               (Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Clong,Clong),
               ∂Y, ∂μ, ∂L.data, Y, μ, L.data, N, P
           )
       end
stan_multinormal_cholesky2! (generic function with 1 method)

julia> Ya = Array(Y);

julia> ∂Ya = similar(Ya);

julia> ∂μa = similar(μa);

julia> ∂La = similar(La);

julia> lp, ∂Y, ∂μ, ∂L = normal_chol(SPTR, Y, μ, L); lp
-15591.837040912393

julia> stan_multinormal_cholesky2!(∂Ya, ∂μa, ∂La, Ya, μa, La)
-15591.837040912395

julia> approx_equal(∂Y, ∂Ya)
true

julia> approx_equal(∂μ, ∂μa)
true

julia> approx_equal(∂L, ∂La)
true

julia> @benchmark normal_chol(SPTR, $Y, $μ, $L)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     7.955 μs (0.00% GC)
  median time:      8.004 μs (0.00% GC)
  mean time:        8.101 μs (0.00% GC)
  maximum time:     15.369 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     4

julia> @benchmark stan_multinormal_cholesky2!($∂Ya, $∂μa, $∂La, $Ya, $μa, $La)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     172.438 ms (0.00% GC)
  median time:      193.080 ms (0.00% GC)
  mean time:        191.596 ms (0.00% GC)
  maximum time:     214.988 ms (0.00% GC)
  --------------
  samples:          27
  evals/sample:     1

Or about a 20,000-fold difference in performance to compute the same thing.
For reference, I’m on the latest development version of Stan-math. I compiled the code using:

g++ -O3 -march=native -shared -fPIC -DNDEBUG -DEIGEN_NO_DEBUG -DADEPT_STACK_THREAD_UNSAFE -fno-signed-zeros -fno-trapping-math -fassociative-math -pipe -feliminate-unused-debug-types -I$STAN_MATH -I$STAN_MATH/lib/eigen_3.3.3 -I$STAN_MATH/lib/boost_1.69.0 -I$STAN_MATH/lib/sundials_4.1.0/include stan_normal_test.cpp -o libstan_normal_chol.so

The macro definitions were just copy and pasted from the Stan-math paper. The rest was generic optimization flags or include dirs (I installed Stan math and the dependencies via git cloning CmdStan).

I’m on the latest g++:

> g++ --version
g++ (Clear Linux OS for Intel Architecture) 9.2.1 20190828 gcc-9-branch@274989
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

EDIT:
For the sake of it, here is a version that computes the analytical gradients directly with Eigen. (Note that the Eigen code uses dynamically sized arrays, while the Julia code uses statically sized ones; additionally, the Eigen code copies a little excessively, but I didn’t want to look too heavily into interop for dynamically sized arrays.)

julia> function stan_multinormal_cholesky3!(∂Y, ∂μ, ∂L, Y, μ, L)
           N, P = size(Y)
           ccall(
               (:normal_lpdf_cholesky3, STANLIB0), Cdouble,
               (Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Clong,Clong),
               ∂Y, ∂μ, ∂L.data, Y, μ, L.data, N, P
           )
       end
stan_multinormal_cholesky3! (generic function with 1 method)

julia> lp, ∂Y, ∂μ, ∂L = normal_chol(SPTR, Y, μ, L); lp
-14785.977219901622

julia> stan_multinormal_cholesky3!(∂Ya, ∂μa, ∂La, Ya, μa, La)
-14785.97721990162

julia> approx_equal(∂Y, ∂Ya)
true

julia> approx_equal(∂μ, ∂μa)
true

julia> approx_equal(∂L, ∂La)
true

julia> @benchmark normal_chol(SPTR, $Y, $μ, $L)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     8.098 μs (0.00% GC)
  median time:      8.360 μs (0.00% GC)
  mean time:        8.414 μs (0.00% GC)
  maximum time:     13.498 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     4

julia> @benchmark stan_multinormal_cholesky3!($∂Ya, $∂μa, $∂La, $Ya, $μa, $La)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     57.995 μs (0.00% GC)
  median time:      60.663 μs (0.00% GC)
  mean time:        60.579 μs (0.00% GC)
  maximum time:     181.221 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

Over 7 times slower, but the analytical gradient is definitely much better here.
C++ code:


// Multivariate-normal log-density with lower-Cholesky covariance factor L,
// with hand-derived (analytical) gradients instead of autodiff.
//
// Inputs  (column-major):
//   Y  : N x P observation matrix (stride N)
//   mu : length-P mean vector
//   L  : P x P matrix whose lower triangle holds the Cholesky factor (stride P)
// Outputs (same layouts): dY, dmu, dL receive d(lp)/d(input); only the lower
// triangle of dL is written. Returns lp.
// NOTE(review): lp omits the -N*P/2*log(2*pi) constant — presumably matched by
// the Julia reference implementation; confirm when comparing against other code.
double normal_lpdf_cholesky3(double* dY, double* dmu, double* dL, double* Y, double* mu, double* L, long N, long P){
  Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> vY(N,P), vL(P,P);
  Eigen::Matrix<double,1,Eigen::Dynamic> vmu(P);
  //  std::cout << "N: " << N << " ; P: " << P << std::endl;
  // dmu is accumulated into below, so it must start at zero.
  for (long p = 0; p < P; p++){
    vmu(p) = mu[p];
    dmu[p] = 0;
  }
  // Copy inputs; only the lower triangle of vL is filled (and only the lower
  // triangle is ever read, via triangularView).
  for (long p = 0; p < P; p++){
    for (long pr = p; pr < P; pr++){
      vL(pr,p) = L[pr + p*P];
    }
    for (long n = 0; n < N; n++){
      vY(n,p) = Y[n + p*N];
    }
  }

  // Delta = Y - mu (mu broadcast across rows).
  Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> vDelta = vY.rowwise() - vmu;

  // Delta := Delta * L^{-T}; each row is now whitened, so squaredNorm() below
  // equals sum_n (y_n - mu) Sigma^{-1} (y_n - mu)'.
  vL.triangularView<Eigen::Lower>().adjoint().solveInPlace<Eigen::OnTheRight>(vDelta);

  // -N * sum(log(diag(L))) is the log-determinant term.
  double lp = -0.5*(vDelta.squaredNorm()) - N * (vL.diagonal().array().log().sum());
  // dlpdy = Delta * L^{-1} = (Y - mu) * Sigma^{-1}; this is d(lp)/d(mu) per row
  // and the negation of d(lp)/d(Y).
  Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> dlpdy = vL.triangularView<Eigen::Lower>().solve<Eigen::OnTheRight>(vDelta);
  // dlpdl (P x P): gradient of the quadratic-form term with respect to L; the
  // diagonal still needs the log-determinant correction applied below.
  Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> dlpdl = dlpdy.adjoint() * vDelta;

  for (long p = 0; p < P; p++){
    for (long n = 0; n < N; n++){
      // d(lp)/d(mu_p) = column sum of dlpdy; d(lp)/d(Y) = -dlpdy.
      dmu[p] += dlpdy(n,p);
      dY[n + p*N] = -dlpdy(n,p);
    }
    // Diagonal picks up -N/L_pp from d(-N*log(L_pp))/dL_pp.
    dL[p + p*P] = dlpdl(p,p) - N / L[p + p*P];
    // Strict lower triangle: quadratic-form contribution only. The strict
    // upper triangle of dL is intentionally left unwritten.
    for (long pr = p+1; pr < P; pr++){
      dL[pr + p*P] = dlpdl(pr,p);
    }
  }
  return lp;
}

I compiled with the same flags as before.

2 Likes