Compiling the following function:
// Log density (up to the additive constant -0.5*N*P*log(2*pi)) of N i.i.d.
// P-variate normal rows, covariance given by its lower Cholesky factor L,
// computed with stan::math reverse-mode autodiff.
//   dY, dmu, dL : outputs -- gradients w.r.t. Y (N*P, column-major), mu (P),
//                 and the lower triangle of L (dense column-major P*P buffer).
//   Y, mu, L    : inputs with the same layouts as their gradient buffers.
// Returns the value of the log density.
double normal_lpdf_cholesky2(double* dY, double* dmu, double* dL, double* Y, double* mu, double* L, long N, long P){
// Autodiff copies of the inputs: every entry becomes a node on the AD tape
// (N*P + P + P*(P+1)/2 vars), which is what makes this version so slow.
Eigen::Matrix<stan::math::var,Eigen::Dynamic,Eigen::Dynamic> vY(N,P), vL(P,P);
Eigen::Matrix<stan::math::var,1,Eigen::Dynamic> vmu(P);
// std::cout << "N: " << N << " ; P: " << P << std::endl;
// Only the lower triangle of vL is ever written; the strict upper triangle
// stays uninitialized, but nothing below reads it (triangularView/diagonal only).
for (long p = 0; p < P; p++){
vmu(p) = mu[p];
for (long pr = p; pr < P; pr++){
vL(pr,p) = L[pr + p*P];
}
for (long n = 0; n < N; n++){
vY(n,p) = Y[n + p*N];
}
}
// vDelta(n,:) = y_n' - mu'
Eigen::Matrix<stan::math::var,Eigen::Dynamic,Eigen::Dynamic> vDelta = vY.rowwise() - vmu;
// Right-solve against L' (upper triangular): each row becomes (L^{-1}(y_n - mu))'.
vL.triangularView<Eigen::Lower>().adjoint().solveInPlace<Eigen::OnTheRight>(vDelta);
// lp = -0.5 * sum_n ||L^{-1}(y_n - mu)||^2 - N * log|det L|
// (the -0.5*N*P*log(2*pi) normalizing constant is dropped).
stan::math::var lp = -0.5*(vDelta.squaredNorm()) - N * (vL.diagonal().array().log().sum());
// Reverse pass: propagates d(lp)/d(node) into every var's adjoint.
lp.grad();
// Copy adjoints back out into the caller's gradient buffers,
// mirroring the input layouts above.
for (long p = 0; p < P; p++){
dmu[p] = vmu(p).adj();
for (long pr = p; pr < P; pr++){
dL[pr + p*P] = vL(pr,p).adj();
}
for (long n = 0; n < N; n++){
dY[n + p*N] = vY(n,p).adj();
}
}
// NOTE(review): stan::math::recover_memory() is never called, so the AD arena
// keeps growing across repeated calls (e.g. under @benchmark) -- TODO confirm intended.
return lp.val();
}
and using the same data layout for Y as my Julia version, I now get:
julia> function stan_multinormal_cholesky2!(∂Y, ∂μ, ∂L, Y, μ, L)
# Thin ccall wrapper over the compiled reverse-mode Stan-math version above.
# Gradients are written in place into ∂Y, ∂μ, ∂L; returns the log density.
N, P = size(Y)
ccall(
(:normal_lpdf_cholesky2, STANLIB0), Cdouble,
(Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Clong,Clong),
# .data unwraps the triangular-matrix wrapper around L / ∂L — presumably a
# dense P×P column-major buffer as the C++ side expects; confirm.
∂Y, ∂μ, ∂L.data, Y, μ, L.data, N, P
)
end
stan_multinormal_cholesky2! (generic function with 1 method)
julia> Ya = Array(Y);
julia> ∂Ya = similar(Ya);
julia> ∂μa = similar(μa);
julia> ∂La = similar(La);
julia> lp, ∂Y, ∂μ, ∂L = normal_chol(SPTR, Y, μ, L); lp
-15591.837040912393
julia> stan_multinormal_cholesky2!(∂Ya, ∂μa, ∂La, Ya, μa, La)
-15591.837040912395
julia> approx_equal(∂Y, ∂Ya)
true
julia> approx_equal(∂μ, ∂μa)
true
julia> approx_equal(∂L, ∂La)
true
julia> @benchmark normal_chol(SPTR, $Y, $μ, $L)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 7.955 μs (0.00% GC)
median time: 8.004 μs (0.00% GC)
mean time: 8.101 μs (0.00% GC)
maximum time: 15.369 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 4
julia> @benchmark stan_multinormal_cholesky2!($∂Ya, $∂μa, $∂La, $Ya, $μa, $La)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 172.438 ms (0.00% GC)
median time: 193.080 ms (0.00% GC)
mean time: 191.596 ms (0.00% GC)
maximum time: 214.988 ms (0.00% GC)
--------------
samples: 27
evals/sample: 1
Or about a 20,000-fold difference in performance to compute the same thing.
For reference, I’m on the latest development version of Stan-math. I compiled the code using:
g++ -O3 -march=native -shared -fPIC -DNDEBUG -DEIGEN_NO_DEBUG -DADEPT_STACK_THREAD_UNSAFE -fno-signed-zeros -fno-trapping-math -fassociative-math -pipe -feliminate-unused-debug-types -I$STAN_MATH -I$STAN_MATH/lib/eigen_3.3.3 -I$STAN_MATH/lib/boost_1.69.0 -I$STAN_MATH/lib/sundials_4.1.0/include stan_normal_test.cpp -o libstan_normal_chol.so
The macro definitions were just copy and pasted from the Stan-math paper. The rest was generic optimization flags or include dirs (I installed Stan math and the dependencies via git cloning CmdStan).
I’m on the latest g++:
> g++ --version
g++ (Clear Linux OS for Intel Architecture) 9.2.1 20190828 gcc-9-branch@274989
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
EDIT:
For the sake of it, adding the analytical gradients using Eigen (note that the Eigen code is using dynamically sized arrays, while the Julia code is using statically sized; additionally, the Eigen code is copying a little excessively, but I didn’t want to look too heavily into interop for dynamically sized arrays):
julia> function stan_multinormal_cholesky3!(∂Y, ∂μ, ∂L, Y, μ, L)
# Same wrapper shape as stan_multinormal_cholesky2!, but calling the
# analytical-gradient C++ implementation. Gradients are written in place
# into ∂Y, ∂μ, ∂L; returns the log density.
N, P = size(Y)
ccall(
(:normal_lpdf_cholesky3, STANLIB0), Cdouble,
(Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Ptr{Cdouble},Clong,Clong),
# .data unwraps the triangular-matrix wrapper around L / ∂L — presumably a
# dense P×P column-major buffer as the C++ side expects; confirm.
∂Y, ∂μ, ∂L.data, Y, μ, L.data, N, P
)
end
stan_multinormal_cholesky3! (generic function with 1 method)
julia> lp, ∂Y, ∂μ, ∂L = normal_chol(SPTR, Y, μ, L); lp
-14785.977219901622
julia> stan_multinormal_cholesky3!(∂Ya, ∂μa, ∂La, Ya, μa, La)
-14785.97721990162
julia> approx_equal(∂Y, ∂Ya)
true
julia> approx_equal(∂μ, ∂μa)
true
julia> approx_equal(∂L, ∂La)
true
julia> @benchmark normal_chol(SPTR, $Y, $μ, $L)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 8.098 μs (0.00% GC)
median time: 8.360 μs (0.00% GC)
mean time: 8.414 μs (0.00% GC)
maximum time: 13.498 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 4
julia> @benchmark stan_multinormal_cholesky3!($∂Ya, $∂μa, $∂La, $Ya, $μa, $La)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 57.995 μs (0.00% GC)
median time: 60.663 μs (0.00% GC)
mean time: 60.579 μs (0.00% GC)
maximum time: 181.221 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
Over 7 times slower, but the analytical gradient is definitely much better here.
C++ code:
// Same log density as normal_lpdf_cholesky2, but with hand-derived analytical
// gradients computed in plain double arithmetic (no autodiff tape).
// Writing Sigma = L*L', the gradients used below are:
//   d lp / d y_n = -Sigma^{-1} (y_n - mu)
//   d lp / d mu  =  sum_n Sigma^{-1} (y_n - mu)
//   d lp / d L   =  lower-tri of (Sigma^{-1} Delta' Delta L^{-T}) - N*diag(1/L_pp)
double normal_lpdf_cholesky3(double* dY, double* dmu, double* dL, double* Y, double* mu, double* L, long N, long P){
Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> vY(N,P), vL(P,P);
Eigen::Matrix<double,1,Eigen::Dynamic> vmu(P);
// std::cout << "N: " << N << " ; P: " << P << std::endl;
for (long p = 0; p < P; p++){
vmu(p) = mu[p];
// dmu accumulates over rows in the output loop below, so zero it up front.
dmu[p] = 0;
}
// Only the lower triangle of vL is written; nothing below reads the upper part.
for (long p = 0; p < P; p++){
for (long pr = p; pr < P; pr++){
vL(pr,p) = L[pr + p*P];
}
for (long n = 0; n < N; n++){
vY(n,p) = Y[n + p*N];
}
}
// vDelta(n,:) = y_n' - mu'
Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> vDelta = vY.rowwise() - vmu;
// Right-solve against L': each row becomes (L^{-1}(y_n - mu))'.
vL.triangularView<Eigen::Lower>().adjoint().solveInPlace<Eigen::OnTheRight>(vDelta);
// lp = -0.5 * sum_n ||L^{-1}(y_n - mu)||^2 - N * log|det L| (2*pi constant dropped).
double lp = -0.5*(vDelta.squaredNorm()) - N * (vL.diagonal().array().log().sum());
// Second right-solve (X * L = vDelta): dlpdy(n,:) = (y_n - mu)' * Sigma^{-1}.
Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> dlpdl = dlpdy.adjoint() * vDelta;
// ^ NOTE: dlpdl = Sigma^{-1} Delta' Delta L^{-T}; its lower triangle is the
// quadratic-form part of d lp / d L.
for (long p = 0; p < P; p++){
for (long n = 0; n < N; n++){
dmu[p] += dlpdy(n,p);
dY[n + p*N] = -dlpdy(n,p);
}
// Diagonal picks up the extra -N/L_pp from the -N * sum(log L_pp) term.
dL[p + p*P] = dlpdl(p,p) - N / L[p + p*P];
for (long pr = p+1; pr < P; pr++){
dL[pr + p*P] = dlpdl(pr,p);
}
}
return lp;
}
I compiled with the same flags as before.