Part III: Arena-Based Memory and Functional Application
What’s still wrong
Allocating that stack of chainables with malloc and then freeing them when the stack is cleared is going to be very slow, with a heap allocation and deallocation for each subexpression being evaluated.
The fix: arena-based memory
Our third version of this program is going to clean up a bunch of things at once. It’s going to give us:

- configurable sizes for indexes (32- or 64-bit being the only logical choices) and floating-point scalars (32-bit and 64-bit being the most obvious choices); the code below uses 32-bit indexes and 64-bit reals
- arena-based memory management with a custom operator new for the chainable base class (and its descendants)
- a functional for evaluating gradients like the one in stan::math::gradient
- more operators, including += and *=, plus double-var versions of binary ops
The code still isn’t very well organized, documented, or broken down into functions. So I’ll step through it all like in the last posts.
#include <limits>
#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib>     // malloc
#include <functional>  // std::function
namespace agrad {

// CONFIGURABLE SIZES
typedef unsigned int index_t;       // determines max expression size
typedef double real_t;              // basic scalar type
typedef std::vector<real_t> vec_t;  // vector of scalars

// SIMPLE MEMORY ARENA
std::size_t MEM_SIZE = 1024 * 1024 * 1024;           // 1 GB
char* ARENA = static_cast<char*>(malloc(MEM_SIZE));  // ** global **
char* ARENA_NEXT = ARENA;                            // ** global **

inline void clear_arena() {
  ARENA_NEXT = ARENA;
}

inline char* arena_malloc(std::size_t nbytes) {
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += nbytes;
  return bytes;
}

template <typename T>
inline T* arena_array_malloc(std::size_t n) {
  return reinterpret_cast<T*>(arena_malloc(sizeof(T) * n));
}
// STACK OF CHAINABLES FOR REVERSE PASS IN ARENA
struct chainable {
  virtual void operator()(vec_t& adj) const { }
  static inline void* operator new(std::size_t nbytes) {
    std::cout << "arena allocating nbytes = " << nbytes << std::endl;
    return arena_malloc(nbytes);
  }
};

std::vector<chainable*> STACK;  // ** global **

inline void clear_stack() {
  STACK.clear();  // leaves capacity unchanged
}

inline void push_chainable(chainable* c) {
  STACK.push_back(c);
}

index_t NEXT_ID = 0;  // ** global **

inline index_t next_id() {
  return NEXT_ID++;
}

// CLEAR MEMORY (RESETS, BUT DOES NOT FREE)
inline void clear_memory() {
  NEXT_ID = 0;
  clear_stack();
  clear_arena();
}
// AUTODIFF VARIABLE
struct var {
  real_t val_;
  index_t id_;
  var(const var& v) : val_(v.val_), id_(v.id_) { }
  var(real_t val, index_t id) : val_(val), id_(id) { }
  var(real_t val) : val_(val), id_(next_id()) { }
  var() : val_(std::numeric_limits<real_t>::quiet_NaN()), id_(next_id()) { }
  var& operator+=(const var& x);
  var& operator+=(double x);
};
// REVERSE PASS GRADIENT CALCULATIONS (PURE CONTINUATIONS)
void grad(const var& y, const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  std::vector<real_t> adj(y.id_ + 1, 0.0);
  adj[y.id_] = 1;
  for (auto chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}

/**
 * Set the value y = f(x) and derivatives dy_dx = d/dx f(x).
 */
void gradient(const std::function<var(const std::vector<var>&)>& f,
              const std::vector<real_t>& x,
              real_t& y, std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);
  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);
}
// ADDITION
// allocated in arena
struct add_vv : public chainable {
  index_t x1, x2, y;
  add_vv(index_t x1, index_t x2, index_t y) : x1(x1), x2(x2), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
    adj[x2] += adj[y];
  }
};

struct add_v : public chainable {
  index_t x1, y;
  add_v(index_t x1, index_t y) : x1(x1), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
  }
};

inline var operator+(const var& x1, const var& x2) {
  var y(x1.val_ + x2.val_);
  push_chainable(new add_vv(x1.id_, x2.id_, y.id_));
  return y;
}

inline var operator+(const var& x1, double x2) {
  var y(x1.val_ + x2);
  push_chainable(new add_v(x1.id_, y.id_));
  return y;
}

inline var operator+(double x1, const var& x2) {
  return x2 + x1;
}

// COMPOUND ASSIGN-ADD
inline var& var::operator+=(const var& x) {
  val_ += x.val_;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_vv(old_id, x.id_, id_));
  return *this;
}

inline var& var::operator+=(double x) {
  val_ += x;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_v(old_id, id_));
  return *this;
}
// MULTIPLICATION
// allocated in arena
struct multiply_vv : public chainable {
  index_t x1, x2, y;
  real_t x1v, x2v;
  multiply_vv(index_t x1, index_t x2, index_t y, real_t x1v, real_t x2v)
      : x1(x1), x2(x2), y(y), x1v(x1v), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
    adj[x2] += x1v * adj[y];
  }
};

struct multiply_v : public chainable {
  index_t x1, y;
  real_t x2v;
  multiply_v(index_t x1, index_t y, real_t x2v) : x1(x1), y(y), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
  }
};

inline var operator*(const var& x1, const var& x2) {
  var y(x1.val_ * x2.val_);
  push_chainable(new multiply_vv(x1.id_, x2.id_, y.id_, x1.val_, x2.val_));
  return y;
}

inline var operator*(const var& x1, double x2) {
  var y(x1.val_ * x2);
  push_chainable(new multiply_v(x1.id_, y.id_, x2));
  return y;
}

inline var operator*(double x1, const var& x2) {
  return x2 * x1;
}

// EXPONENTIATION
struct exp_v : public chainable {
  index_t x1, y;
  real_t yv;
  exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
  void operator()(vec_t& adj) const {
    adj[x1] += yv * adj[y];
  }
};

inline var exp(const var& x) {
  var y(std::exp(x.val_));
  push_chainable(new exp_v(x.id_, y.id_, y.val_));
  return y;
}
}  // namespace agrad

// EXAMPLE
struct my_fun {
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    return exp(x[0]) * x[1] * x[1];
  }
};

int main() {
  using agrad::var;
  using std::vector;
  using agrad::vec_t;
  vec_t x = { 5.2, -3.9 };
  vec_t dy_dx;
  double y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;
  return 0;
}
Configurable Sizes
typedef unsigned int index_t; // determines max expression size
typedef double real_t; // basic scalar type
typedef std::vector<real_t> vec_t; // vector of scalars
Using smaller integers means closures will be smaller, and thus programs may be more memory local. It will limit the number of subexpressions to around 4 billion. I’m still using double-precision floating point for real scalars and also vectors (the vec_t type is replacing adj_t, as I’ll need it in multiple places).
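If roughly four billion subexpressions ever proved too few, only the typedef would need to change. A minimal sketch of the 64-bit alternative (not used in this post; it requires <cstdint>):

#include <cstdint>
typedef std::uint64_t index_t;  // far larger expressions, but bigger closures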
Simple arena-based memory
WARNING: Do not try this at home.
This will segfault in the face of a program that’s too large. To get around that issue, the memory arena either needs to be carefully sized or it needs to be dynamically expandable. The memory arena in Stan provides the same interface with a slightly slower (due to branching and pointer chasing) but much safer implementation.
std::size_t MEM_SIZE = 1024 * 1024 * 1024;           // 1 GB
char* ARENA = static_cast<char*>(malloc(MEM_SIZE));  // ** global **
char* ARENA_NEXT = ARENA;                            // ** global **

inline void clear_arena() {
  ARENA_NEXT = ARENA;
}

inline char* arena_malloc(std::size_t nbytes) {
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += nbytes;
  return bytes;
}

template <typename T>
inline T* arena_array_malloc(std::size_t n) {
  return reinterpret_cast<T*>(arena_malloc(sizeof(T) * n));
}
The memory size here is fixed at 1GB. That’s it. Go above that and segfault. You have now been warned, and I’ll assume the appropriate waiver has been signed before going on.
The arena itself is just a bunch of bytes (char in C++). The pointer to the next available memory location is ARENA_NEXT, and it starts at the beginning of the arena.
The arena_malloc function has nearly the same signature as the built-in malloc function (it returns char* rather than void*). It just returns a pointer to the next position in the arena and then advances the pointer. The arena_array_malloc function is just a convenience for arena-allocating arrays.
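As the warning above says, this version just runs off the end of the arena if a program is too big. Here’s a minimal sketch of a bounds-checked variant that grows by chaining on new blocks, loosely in the spirit of Stan’s arena (the ARENA_BLOCKS and ARENA_END globals are made up here, and clear_arena would also need to learn about the extra blocks):

std::vector<char*> ARENA_BLOCKS;     // hypothetical: extra blocks, freed later
char* ARENA_END = ARENA + MEM_SIZE;  // hypothetical: end of current block

inline char* arena_malloc_checked(std::size_t nbytes) {
  if (ARENA_NEXT + nbytes > ARENA_END) {
    // current block exhausted; chain on a fresh one at least as big as needed
    std::size_t block_size = nbytes > MEM_SIZE ? nbytes : MEM_SIZE;
    char* block = static_cast<char*>(malloc(block_size));
    ARENA_BLOCKS.push_back(block);
    ARENA_NEXT = block;
    ARENA_END = block + block_size;
  }
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += nbytes;
  return bytes;
}

The branch and the extra pointer chasing here are exactly the costs the safer Stan implementation pays.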
Closures allocated in the arena
This won’t do anything for us on its own. C++ has a really nifty feature where you can overload the operator new implementation for a class and all of its subclasses. Here’s what that looks like for our new chainable class:
struct chainable {
  virtual void operator()(vec_t& adj) const { }
  static inline void* operator new(std::size_t nbytes) {
    std::cout << "arena allocating nbytes = " << nbytes << std::endl;
    return arena_malloc(nbytes);
  }
};
Same functor signature for performing continuations. But now there’s an operator new definition that calls the arena (and prints out how many bytes were allocated). Now whenever new is called on a subclass of chainable, its memory will be drawn from the arena.
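One caveat worth flagging: overloading operator new without a matching operator delete means a stray delete expression (or cleanup after a constructor throws) would hand arena memory to the global deallocator. Since arena memory is only reclaimed wholesale by clear_arena(), a no-op operator delete is the natural companion. A sketch of the member one might add inside chainable (not in the code above):

static inline void operator delete(void* /* ptr */) {
  // nothing to free: the arena is reclaimed all at once by clear_arena()
}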
The arena winds up being 8-byte aligned because we only ever allocate whole objects, and with clang++ and g++ at least, the sizes of these objects are multiples of 8 bytes, which keeps the bump pointer aligned. That’s good, because we need that alignment to operate on the objects efficiently.
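Rather than relying on object sizes happening to be multiples of 8, a defensive arena_malloc could round each request up explicitly. A minimal sketch (not in the original code):

inline char* arena_malloc_aligned(std::size_t nbytes) {
  // round the request up to the next multiple of 8 bytes
  std::size_t padded = (nbytes + 7) & ~std::size_t(7);
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += padded;
  return bytes;
}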
Stack of continuations
These closures are used as continuations to propagate the chain rule in reverse mode. We hold them in a stack just like before, with a couple of convenience functions.
std::vector<chainable*> STACK;  // ** global **

inline void clear_stack() {
  STACK.clear();  // leaves capacity unchanged
}

inline void push_chainable(chainable* c) {
  STACK.push_back(c);
}
Note again that clear() does not affect capacity. So the stack will keep growing until it doesn’t need to grow any more, and then it stays at that size until its memory is freed directly.
Autodiff variables and the ids behave exactly as before
index_t NEXT_ID = 0;  // ** global **

inline index_t next_id() {
  return NEXT_ID++;
}

struct var {
  real_t val_;
  index_t id_;
  var(const var& v) : val_(v.val_), id_(v.id_) { }
  var(real_t val, index_t id) : val_(val), id_(id) { }
  var(real_t val) : val_(val), id_(next_id()) { }
  var() : val_(std::numeric_limits<real_t>::quiet_NaN()), id_(next_id()) { }
  var& operator+=(const var& x);
  var& operator+=(double x);
};
We’ve added operator+= declarations for var that will be defined outside the class. The class is still incomplete compared to a full arithmetic class.
Defining functions and continuations
Function definitions are just like in the last instance—the arena memory management remains implicit.
struct exp_v : public chainable {
  index_t x1, y;
  real_t yv;
  exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
  void operator()(vec_t& adj) const {
    adj[x1] += yv * adj[y];
  }
};

inline var exp(const var& x) {
  var y(std::exp(x.val_));
  push_chainable(new exp_v(x.id_, y.id_, y.val_));
  return y;
}
The compound add-assign operators are defined using the existing add_vv closure type,
inline var& var::operator+=(const var& x) {
  val_ += x.val_;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_vv(old_id, x.id_, id_));
  return *this;
}
The argument for the object being operated on is implicit. Thus val_ refers to the value of that object, whereas x.val_ picks out the value of the argument. The first step updates the object’s value by incrementing it. Then the old index is saved and a new one is generated. The old and new indexes are used to construct the add_vv instance, which is pushed onto the stack of continuations. Then a reference to the object is returned by return *this in order to match the return type.
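As a concrete trace of the bookkeeping (values hypothetical, assuming a fresh clear_memory()):

var a = 2.0;  // a.id_ == 0
var b = 3.0;  // b.id_ == 1
a += b;       // a.val_ == 5.0; a.id_ becomes 2; add_vv(0, 1, 2) pushed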
The one for double values is similar, but uses add_v.
inline var& var::operator+=(double x) {
  val_ += x;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_v(old_id, id_));
  return *this;
}
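The feature list at the top also promised *=. It isn’t shown in the code above, but a sketch following exactly the same pattern would reuse the multiply closures; the definitions would go after those closures, and the declarations var& operator*=(const var&); and var& operator*=(double); would have to be added to var:

inline var& var::operator*=(const var& x) {
  real_t old_val = val_;
  val_ *= x.val_;
  index_t old_id = id_;
  id_ = next_id();
  // d y / d old = x.val_ and d y / d x = old_val, matching multiply_vv
  push_chainable(new multiply_vv(old_id, x.id_, id_, old_val, x.val_));
  return *this;
}

inline var& var::operator*=(double x) {
  val_ *= x;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new multiply_v(old_id, id_, x));
  return *this;
}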
Gradient calculation functional
The basic gradient calculation is just like last time,
void grad(const var& y, const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  std::vector<real_t> adj(y.id_ + 1, 0.0);
  adj[y.id_] = 1;
  for (auto chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}
But now we will define the interface that Stan actually uses. This takes a std::function argument because the storage cost isn’t high and it avoids templating on the functor type (i.e., the type of f).
void gradient(const std::function<var(const std::vector<var>&)>& f,
              const std::vector<real_t>& x,
              real_t& y, std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);
  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);
}
This takes an argument function f that operates on a vector<var> and returns a var. Here’s the example of such a function that we use in the evaluation:
struct my_fun {
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    return exp(x[0]) * x[1] * x[1];
  }
};
An instance of my_fun will be assignable to std::function<var(const std::vector<var>&)>. Note that the user here just writes my_fun and never has to refer to autodiff types—everything’s just implicit in the template parameter T.
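For contrast, the fully templated alternative that the std::function signature avoids would look something like this (a sketch; the name gradient_t is made up here):

template <typename F>
void gradient_t(const F& f, const std::vector<real_t>& x,
                real_t& y, std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv(x.begin(), x.end());  // var(real_t) converts each input
  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);
}

That skips std::function’s type erasure on each call, at the cost of generating code per functor type.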
Going back to the gradient function, it takes the functor as an argument, along with a real-valued argument vector, a mutable real value for the result, and a mutable vector of real values for the gradient.
The first step is to clear the memory. The clear function is simple, clearing the stack and the arena and setting the next identifier back to zero.
inline void clear_memory() {
  NEXT_ID = 0;
  clear_stack();
  clear_arena();
}
The rest of the function first creates var instances from the argument scalars.
std::vector<var> xv;
xv.reserve(x.size());
for (auto&& xi : x) xv.emplace_back(xi);
By using emplace_back, the new var is constructed in place in the vector’s memory rather than being copied.
Then it applies the function, extracts the value, and calls the other gradient function to calculate the gradient.
var yv = f(xv);
y = yv.val_;
grad(yv, xv, dy_dx);
Running it all
From a user’s perspective, it’s just a matter of defining a functor to differentiate, which I’ll repeat here:
struct my_fun {
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    return exp(x[0]) * x[1] * x[1];
  }
};
Then we can call it by setting up the argument vectors and result types, instantiating the functor f, then calling the gradient functional.
int main() {
  using agrad::var;
  using std::vector;
  using agrad::vec_t;
  vec_t x = { 5.2, -3.9 };
  vec_t dy_dx;
  double y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;
  return 0;
}
That’s it. I believe this will be faster than how we do autodiff now; it’s smaller and more memory local for most operations.
Next up: matrix types
The real target is matrix types. I haven’t written this part yet, but the idea is to have a matrix_var that stores a MatrixXd value and an array of ids (int[]).
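Purely as a sketch of that idea (nothing here is written or tested yet, and the member names are invented):

#include <Eigen/Dense>

struct matrix_var {
  Eigen::MatrixXd val_;       // matrix of values
  std::vector<index_t> ids_;  // one id per coefficient
};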