I’m working on adding sparse matrices to Stan. In math there are two ways to go about this.
- Reformat the library so a lot of the general functions can take both dense and sparse matrices.
- Add duplicate code that specializes on sparse matrices.
There has been a lot of discussion on the refactor for (1) in the past (see this Google groups post and this issue). Dan points out that Eigen coefficient access doesn’t work well with MatrixBase
. Though Ben points out that A.coeff()
for access instead of operator()
would be sufficient to handle the coefficient access error Dan saw. There’s a few ways to generalize the code which I’ll cover below
TLDR; Using Matrix, MatrixBase, EigenBase, or auto almost always gives the same assembly output. We should just use the one that gives us the nicest looking code. If we are careful with auto
we can and should use it for function signatures because it performs better on clang 6.
Eigen with -O3
The below will be the output of creating a returning a value from a function that calls A + B * A * B
We will use the following signatures
// Helper for return type
template <typename Derived>
using eigen_plain_t = typename Derived::PlainObject;
// Helper for Eigen Dynamic
template <typename T>
using eigen_mat_dyn = Eigen::Matrix<T, -1, -1>;
// With Standard Matrices
template <typename T1, typename T2>
inline eigen_mat_dyn<T1> add_mat(
const eigen_mat_dyn<T1>& A,
const eigen_mat_dyn<T2>& B) {
eigen_mat_dyn<T1> C = A + B * A * B;
return C;
}
// With Matrix Base
template <class Derived>
inline eigen_plain_t<Derived> add_mat_base(
const Eigen::MatrixBase<Derived>& A,
const Eigen::MatrixBase<Derived>& B) {
auto C = A + B * A * B;
return C;
}
// With Eigen Base
template <class Derived>
inline eigen_plain_t<Derived> add_base(
const Eigen::EigenBase<Derived>& A_,
const Eigen::EigenBase<Derived>& B_) {
// We have to pull out the Eigen derivied types
Derived const& A = A_.derived();
Derived const& B = B_.derived();
Derived C = A + B * A * B;
return C;
}
// With auto
template <typename T1, typename T2>
inline auto add_decl_ret(const T1& A, const T2& B) {
auto C = A + B * A * B;
return C;
}
Eigen Funcs on GCC 4.9 and -O3
TLDR; All the above calls generate the same asm
Starting with the Eigen::CommaInitializer
Every method uses pretty similar asm. Call .L1149
, .L1150
, .L1151
, and then calling the internal scalar_sum_op templated by Eigen::PlainObjectBase
## Matrix
mov rax, QWORD PTR [rsp+48]
cmp rax, QWORD PTR [rsp+8]
jne .L1149
mov rdx, QWORD PTR [rsp+16]
cmp rdx, QWORD PTR [rsp+40]
jne .L1150
cmp rax, rdx
mov QWORD PTR [rsp+240], rsp
mov QWORD PTR [rsp+248], rbx
mov QWORD PTR [rsp+256], rsp
mov QWORD PTR [rsp+264], rbx
jne .L1151
lea rsi, [rsp+240]
lea rdi, [rsp+64]
call Eigen::PlainObjectBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> >::PlainObjectBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> >(Eigen::DenseBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> > const&)
## EigenBase
mov rdx, QWORD PTR [rsp+48]
cmp rdx, QWORD PTR [rsp+8]
jne .L1149
mov rax, QWORD PTR [rsp+16]
cmp rax, QWORD PTR [rsp+40]
jne .L1150
cmp rdx, rax
mov QWORD PTR [rsp+288], rsp
mov QWORD PTR [rsp+296], rbx
mov QWORD PTR [rsp+304], rsp
mov QWORD PTR [rsp+312], rbx
jne .L1151
lea rsi, [rsp+288]
lea rdi, [rsp+96]
call Eigen::PlainObjectBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> >::PlainObjectBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> >(Eigen::DenseBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> > const&)
## MatrixBase
mov rdx, QWORD PTR [rsp+48]
cmp rdx, QWORD PTR [rsp+8]
jne .L1149
mov rax, QWORD PTR [rsp+16]
cmp rax, QWORD PTR [rsp+40]
jne .L1150
cmp rdx, rax
mov QWORD PTR [rsp+336], rsp
mov QWORD PTR [rsp+344], rbx
mov QWORD PTR [rsp+352], rsp
mov QWORD PTR [rsp+360], rbx
jne .L1151
lea rsi, [rsp+336]
lea rdi, [rsp+128]
call Eigen::PlainObjectBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> >::PlainObjectBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> >(Eigen::DenseBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> > const&)
## Auto
mov rax, QWORD PTR [rsp+48]
cmp rax, QWORD PTR [rsp+8]
jne .L1149
mov rdx, QWORD PTR [rsp+16]
cmp rdx, QWORD PTR [rsp+40]
jne .L1150
cmp rax, rdx
mov QWORD PTR [rsp+192], rsp
mov QWORD PTR [rsp+200], rbx
mov QWORD PTR [rsp+208], rsp
mov QWORD PTR [rsp+216], rbx
jne .L1151
lea rsi, [rsp+192]
lea rdi, [rsp+160]
call Eigen::PlainObjectBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> >::PlainObjectBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> >(Eigen::DenseBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const> > const&)
## Product With Addition
.L1151:
mov ecx, OFFSET FLAT:Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, -1, -1, 0, -1, -1> const, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const>::CwiseBinaryOp(Eigen::Matrix<double, -1, -1, 0, -1, -1> const&, Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const&, Eigen::internal::scalar_sum_op<double, double> const&)::__PRETTY_FUNCTION__
mov edx, 110
mov esi, OFFSET FLAT:.LC45
mov edi, OFFSET FLAT:.LC46
call __assert_fail
## Product Product
.L1150:
mov ecx, OFFSET FLAT:Eigen::Product<Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0>::Product(Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const&, Eigen::Matrix<double, -1, -1, 0, -1, -1> const&)::__PRETTY_FUNCTION__
mov edx, 97
mov esi, OFFSET FLAT:.LC4
mov edi, OFFSET FLAT:.LC5
call __assert_fail
## Product from a temporary
.L1149:
call Eigen::Product<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1>, 0> const Eigen::MatrixBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> >::operator*<Eigen::Matrix<double, -1, -1, 0, -1, -1> >(Eigen::MatrixBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&) const [clone .part.30]
mov rdi, QWORD PTR [rsp+160]
mov rbx, rax
call free
So, essentially we can pick whatever we want as the signature here (excluding the instances where Eigen shows that auto can segfault) and for 4.9.3 we will get the same stuff.
On a side note, using a newer version of GCC gives the same asm for everything except for Matrix
, which does requires two extra moves and an unpack. The other calls all give the same asm.
Eigen Funcs on Clang 6 and -O3
TLDR: All the asm is the same except for auto
which is actually inlined and a bit more optmized
This is waaaaayy more complicated. But essentially everything is also the same but clang 6 loves to duplicate code. The only interesting thing below is that the auto
version actually inlines while the others are forced to make a call.
## Matrix
Eigen::Matrix<double, -1, -1, 0, -1, -1> add_mat<double, double>(Eigen::Matrix<double, -1, -1, 0, -1, -1> const&, Eigen::Matrix<double, -1, -1, 0, -1, -1> const&): # @Eigen::Matrix<double, -1, -1, 0, -1, -1> add_mat<double, double>(Eigen::Matrix<double, -1, -1, 0, -1, -1> const&, Eigen::Matrix<double, -1, -1, 0, -1, -1> const&)
push r14
push rbx
sub rsp, 56
mov rax, rsi
mov rbx, rdi
mov rsi, qword ptr [rdx + 16]
cmp rsi, qword ptr [rax + 8]
## .LBB*_10 are the same product
jne .LBB1_10
mov rcx, qword ptr [rax + 16]
cmp rcx, qword ptr [rdx + 8]
## .LBB*_11 are all the same product
jne .LBB1_11
mov qword ptr [rsp + 16], rax
mov qword ptr [rsp + 24], rdx
mov qword ptr [rsp + 32], rax
mov qword ptr [rsp + 40], rdx
cmp rsi, rcx
## .LBB*_12 are all the same addition
jne .LBB1_12
xorps xmm0, xmm0
movups xmmword ptr [rbx], xmm0
mov qword ptr [rbx + 16], 0
test rsi, rsi
## .LBB*_7 are all the same product
je .LBB1_7
movabs rax, 9223372036854775807
xor edx, edx
idiv rsi
cmp rax, rsi
jl .LBB1_5
## EigenBase
Eigen::Matrix<double, -1, -1, 0, -1, -1>::PlainObject add_base<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1> >(Eigen::EigenBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&, Eigen::EigenBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&): # @Eigen::Matrix<double, -1, -1, 0, -1, -1>::PlainObject add_base<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1> >(Eigen::EigenBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&, Eigen::EigenBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&)
push r14
push rbx
sub rsp, 56
mov rax, rsi
mov rbx, rdi
mov rsi, qword ptr [rdx + 16]
cmp rsi, qword ptr [rax + 8]
## .LBB*_10 are the same product
jne .LBB2_10
mov rcx, qword ptr [rax + 16]
cmp rcx, qword ptr [rdx + 8]
## .LBB*_11 are all the same product
jne .LBB2_11
mov qword ptr [rsp + 16], rax
mov qword ptr [rsp + 24], rdx
mov qword ptr [rsp + 32], rax
mov qword ptr [rsp + 40], rdx
cmp rsi, rcx
## .LBB*_12 are all the same addition
jne .LBB2_12
xorps xmm0, xmm0
movups xmmword ptr [rbx], xmm0
mov qword ptr [rbx + 16], 0
test rsi, rsi
## .LBB*_7 are all the same product
je .LBB2_7
movabs rax, 9223372036854775807
xor edx, edx
idiv rsi
cmp rax, rsi
jl .LBB2_5
## MatrixBase
Eigen::Matrix<double, -1, -1, 0, -1, -1>::PlainObject add_mat_base<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1> >(Eigen::MatrixBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&, Eigen::MatrixBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&): # @Eigen::Matrix<double, -1, -1, 0, -1, -1>::PlainObject add_mat_base<Eigen::Matrix<double, -1, -1, 0, -1, -1>, Eigen::Matrix<double, -1, -1, 0, -1, -1> >(Eigen::MatrixBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&, Eigen::MatrixBase<Eigen::Matrix<double, -1, -1, 0, -1, -1> > const&)
push r14
push rbx
sub rsp, 56
mov rax, rsi
mov rbx, rdi
mov rsi, qword ptr [rdx + 16]
cmp rsi, qword ptr [rax + 8]
## .LBB*_10 are the same product
jne .LBB3_10
mov rcx, qword ptr [rax + 16]
cmp rcx, qword ptr [rdx + 8]
## .LBB*_11 are all the same product
jne .LBB3_11
mov qword ptr [rsp + 16], rax
mov qword ptr [rsp + 24], rdx
mov qword ptr [rsp + 32], rax
mov qword ptr [rsp + 40], rdx
cmp rsi, rcx
## .LBB*_12 are all the same addition
jne .LBB3_12
xorps xmm0, xmm0
movups xmmword ptr [rbx], xmm0
mov qword ptr [rbx + 16], 0
test rsi, rsi
## .LBB*_7 are all the same product
je .LBB3_7
movabs rax, 9223372036854775807
xor edx, edx
idiv rsi
cmp rax, rsi
jl .LBB3_5
## Auto is the only one that's inlined in main!?!?
mov rsi, qword ptr [rsp + 32]
cmp rsi, qword ptr [rsp + 56]
# Product
jne .LBB0_76
mov rax, qword ptr [rsp + 64]
cmp rax, qword ptr [rsp + 24]
# Product
jne .LBB0_77
cmp rsi, rax
# Addition
jne .LBB0_78
mov qword ptr [rsp + 184], rbx
mov qword ptr [rsp + 192], r14
mov qword ptr [rsp + 200], rbx
mov qword ptr [rsp + 208], r14
xorps xmm0, xmm0
movaps xmmword ptr [rsp + 80], xmm0
mov qword ptr [rsp + 96], 0
test rsi, rsi
# Product
je .LBB0_46
Conclusions
I think auto and generic templates are a good idea for a lot of the functions in the math library where the functions are simple. I.E. using
template <typename T1, typename T2, typename = enable_if_all_eigen<T1, T2>>
auto add(const T1&, const T2&) {
return A + B;
}
is fine. For the simpler things like addition, subtraction, and multiplication this will let those work for sparse and dense matrices. This will give us the same performance on gcc with slightly better performance on clang. Though we should probs not use auto
for variable declarations.