A new continuation-based autodiff by refactoring

Part III: Arena-Based Memory and Functional Application

What’s still wrong

Allocating that stack of chainables with malloc and then freeing them when the stack is cleared is going to be very slow, with a memory-affecting constructor and destructor for each subexpression being evaluated.

The fix: arena-based memory

Our third version of this program is going to clean up a bunch of things at once. It’s going to give us

  • configurable sizes for indexes (32 or 64-bit being the only logical choices) and floating point scalars (32 bit and 64 bit being most obvious choices)—we’ll just go with the machine size of 64 and 64

  • arena-based memory management with custom operator new for the chainable base class (and descendants)

  • a functional for evaluating gradients like the one in stan::math::gradient

  • more operators including += and *= and double-var versions of binary ops

The code still isn’t very well organized, documented, or broken down into functions. So I’ll step through it all like in the last posts.

#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <limits>
#include <vector>

namespace agrad {

// CONFIGURABLE SIZES

// Type aliases for the configurable sizes; `using` (C++11) rather than
// typedef, consistent with the C++11 features used elsewhere in the file.
using index_t = unsigned int;          // determines max expression size (~4 billion nodes)
using real_t = double;                 // basic scalar type
using vec_t = std::vector<real_t>;     // vector of scalars (values or adjoints)


// SIMPLE MEMORY ARENA

// Total arena capacity.  WARNING: fixed size with no bounds check --
// exceeding it walks off the end of the buffer (see discussion in the text).
std::size_t MEM_SIZE = 1024 * 1024 * 1024;  // 1 GB

char* ARENA = static_cast<char*>(malloc(MEM_SIZE));      // ** global **

char* ARENA_NEXT = ARENA;                                // ** global **

// Reset the arena cursor; does not free or zero the underlying memory.
inline void clear_arena() {
  ARENA_NEXT = ARENA;
}

// Return a pointer to nbytes of arena storage and advance the cursor.
// The request is rounded up to an 8-byte boundary so that EVERY
// allocation keeps the arena aligned for doubles and pointers -- the
// original advanced by the raw size, so one odd-sized request would
// misalign all subsequent allocations.
inline char* arena_malloc(std::size_t nbytes) {
  static const std::size_t ALIGN = 8;  // sufficient for real_t and vptrs
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += (nbytes + ALIGN - 1) & ~(ALIGN - 1);
  return bytes;
}

// Convenience: arena-allocate an uninitialized array of n elements of T.
// The cast must go through void*; static_cast from char* directly to T*
// is ill-formed and would fail to compile on first instantiation.
template <typename T>
inline T* arena_array_malloc(std::size_t n) {
  return static_cast<T*>(static_cast<void*>(arena_malloc(sizeof(T) * n)));
}


// STACK OF CHAINABLES FOR REVERSE PASS IN ARENA

// Base class for reverse-pass continuations.  Derived closures are
// allocated in the arena via the overloaded operator new below; they are
// never individually deleted -- clear_arena() reclaims their storage
// wholesale, so destructors are never run.
struct chainable {
  // Virtual destructor: polymorphic base, even though the arena design
  // never deletes through a base pointer.
  virtual ~chainable() = default;
  // Propagate adjoints from result slot to operand slots (no-op in base).
  virtual void operator()(vec_t& adj) const { }
  // Draw storage from the global arena (and report the request size).
  static inline void* operator new(std::size_t nbytes) {
    std::cout << "arena allocating nbytes = " << nbytes << std::endl;
    return arena_malloc(nbytes);
  }
  // Matching operator delete is required: the compiler calls it to clean
  // up if a derived constructor throws during `new`.  Arena memory is
  // never individually freed, so this is a deliberate no-op.
  static inline void operator delete(void*) noexcept { }
};

// Global stack of continuations, executed in reverse order during the
// backward pass to propagate the chain rule.
std::vector<chainable*> STACK;                          // ** global **

// Empty the stack.  std::vector::clear() does not shrink capacity, so
// the backing array is reused by subsequent gradient evaluations.
inline void clear_stack() {
  STACK.clear();  // leaves capacity unchanged
}
// Record a continuation for later execution in the reverse pass.
inline void push_chainable(chainable* c) {
  STACK.push_back(c);
}

// Monotone counter handing out variable ids; reset by clear_memory().
index_t NEXT_ID = 0;                                    // ** global **

// Return a fresh id, which doubles as the variable's slot in the
// adjoint vector built during the reverse pass.
inline index_t next_id() {
  return NEXT_ID++;
}


// CLEAR MEMORY (RESETS, BUT DOES NOT FREE)

// Reset all global autodiff state: the continuation stack, the memory
// arena, and the id counter.  Resets only; nothing is freed.
inline void clear_memory() {
  clear_stack();
  clear_arena();
  NEXT_ID = 0;
}


// AUTODIFF VARIABLE

struct var {
  real_t val_;
  int id_;
  var(const var& v) : val_(v.val_), id_(v.id_) { }
  var(real_t val, int id) : val_(val), id_(id) { }
  var(real_t val) : val_(val), id_(next_id()) { }
  var() : val_(std::numeric_limits<real_t>::quiet_NaN()), id_(next_id()) { }
  var& operator+=(const var& x);
  var& operator+=(double x);
};


// REVERSE PASS GRADIENT CALCULATIONS (PURE CONTINUATIONS)

void grad(const var& y, const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  std::vector<real_t> adj(y.id_ + 1, 0.0);
  adj[y.id_] = 1;
  for (auto&& chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}

/**
 * Set the value y = f(x) and derivatives dy_dx = d/dx f(x).
 *
 * Clears all global autodiff state, promotes the double inputs to
 * autodiff variables, applies f, and runs the reverse pass.
 */
void gradient(const std::function<var(const std::vector<var>&)>& f,
              const std::vector<real_t>& x,
              real_t& y, std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    xv.emplace_back(x[i]);  // construct in place; assigns ids 0..n-1
  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);
}


// ADDITION

// allocated in arena
struct add_vv : public chainable {
  index_t x1, x2, y;
  add_vv(index_t x1, index_t x2, index_t y) : x1(x1), x2(x2), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
    adj[x2] += adj[y];
  }
};
struct add_v : public chainable {
  index_t x1, y;
  add_v(index_t x1, index_t y) : x1(x1), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
  }
};
// var + var: record an add_vv continuation for the reverse pass.
inline var operator+(const var& x1, const var& x2) {
  var sum(x1.val_ + x2.val_);
  push_chainable(new add_vv(x1.id_, x2.id_, sum.id_));
  return sum;
}
// var + double: the constant contributes no adjoint, so only x1 is tracked.
inline var operator+(const var& x1, double x2) {
  var sum(x1.val_ + x2);
  push_chainable(new add_v(x1.id_, sum.id_));
  return sum;
}
// double + var: addition commutes, so delegate to the overload above.
inline var operator+(double x1, const var& x2) {
  return x2 + x1;
}


// COMPOUND ASSIGN-ADD

// x += y is treated as x = x + y: the variable takes a fresh id and an
// add continuation links the old slot (and y's) to the new one.
inline var& var::operator+=(const var& x) {
  val_ += x.val_;
  const index_t prev_id = id_;
  id_ = next_id();
  push_chainable(new add_vv(prev_id, x.id_, id_));
  return *this;
}
// Same pattern for a constant increment; only the old slot is linked.
inline var& var::operator+=(double x) {
  val_ += x;
  const index_t prev_id = id_;
  id_ = next_id();
  push_chainable(new add_v(prev_id, id_));
  return *this;
}


// MULTIPLICATION

// allocated in arena
struct multiply_vv : public chainable {
  index_t x1, x2, y;
  real_t x1v, x2v;
  multiply_vv(index_t x1, index_t x2, index_t y, real_t x1v, real_t x2v)
      : x1(x1), x2(x2), y(y), x1v(x1v), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
    adj[x2] += x1v * adj[y];
  }
};
struct multiply_v : public chainable {
  index_t x1, y;
  real_t  x2v;
  multiply_v(index_t x1, index_t y, real_t x2v) : x1(x1), y(y), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
  }
};
// var * var: operand values ride along in the closure as the partials.
inline var operator*(const var& x1, const var& x2) {
  var prod(x1.val_ * x2.val_);
  push_chainable(new multiply_vv(x1.id_, x2.id_, prod.id_, x1.val_, x2.val_));
  return prod;
}
// var * double: the constant is the (fixed) partial for x1.
inline var operator*(const var& x1, double x2) {
  var prod(x1.val_ * x2);
  push_chainable(new multiply_v(x1.id_, prod.id_, x2));
  return prod;
}
// double * var: multiplication commutes, so delegate.
inline var operator*(double x1, const var& x2) {
  return x2 * x1;
}

// EXPONENTIATION

struct exp_v : public chainable {
  index_t x1, y;
  real_t yv;
  exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
  void operator()(vec_t& adj) const {
    adj[x1] += yv * adj[y];
  }
};
// exp(var): the result value is passed to the closure since it is also
// the derivative of exp at the input.
inline var exp(const var& x) {
  var result(std::exp(x.val_));
  push_chainable(new exp_v(x.id_, result.id_, result.val_));
  return result;
}


} // namespace agrad


// EXAMPLE

// Example functor computing f(x) = exp(x[0]) * x[1]^2.  Generic over the
// scalar type T, so the same code runs with plain double or agrad::var.
struct my_fun {
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    T e = exp(x[0]);       // named temporary; same evaluation order
    return e * x[1] * x[1];
  }
};

// Demo: evaluate f and its gradient at x = (5.2, -3.9) and print both.
int main() {
  using agrad::vec_t;   // removed unused `using agrad::var;` and
                        // `using std::vector;` declarations
  vec_t x = { 5.2, -3.9 };
  vec_t dy_dx;
  double y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;

  return 0;
}

Configurable Sizes

typedef unsigned int index_t;          // determines max expression size
typedef double real_t;                 // basic scalar type
typedef std::vector<real_t> vec_t;     // vector of scalars

Using smaller integers means closures will be smaller and thus programs may be more memory local. It will limit the number of subexpressions to around 4 billion. I’m still using double-precision floating point for real scalars and also vectors (the vec_t is replacing adj_t as I’ll need it in multiple places).

Simple arena-based memory

WARNING: Do not try this at home.
This will segfault in the face of a program that’s too large. To get around that issue, the memory arena either needs to be carefully sized or it needs to be dynamically expandable. The memory arena in Stan provides the same interface with a slightly slower (due to branching and pointer chasing) but much safer implementation.

std::size_t MEM_SIZE = 1024 * 1024 * 1024;  // 1 GB

char* ARENA = static_cast<char*>(malloc(MEM_SIZE));      // ** global **

char* ARENA_NEXT = ARENA;                                // ** global **

inline void clear_arena() {
  ARENA_NEXT = ARENA;
}

inline char* arena_malloc(std::size_t nbytes) {
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += nbytes;
  return bytes;
}

template <typename T>
inline T* arena_array_malloc(std::size_t n) {
  return static_cast<T*>(arena_malloc(sizeof(T) * n));
}

The memory size here is fixed at 1GB. That’s it. Go above that and segfault. You have now been warned and I’ve assumed the appropriate waiver before going on.

The arena itself is just a bunch of bytes (char in C++). The pointer to the next available memory location is ARENA_NEXT and it starts at the beginning.

The arena_malloc function has the same signature as the built in malloc function. It just returns a pointer to the next position in the arena then advances the pointer. The arena_array_malloc is just a convenience for arena-allocating arrays.

Closures allocated in the arena

This won’t do anything for us on its own. C++ has a really nifty feature where you can overload the operator new implementation for a class and all of its subclasses. Here’s what that looks like for our new chainable class:

struct chainable {
  virtual void operator()(vec_t& adj) const { }

  static inline void* operator new(std::size_t nbytes) {
    std::cout << "arena allocating nbytes = " << nbytes << std::endl;
    return arena_malloc(nbytes);
  }
};

Same functor signature for performing continuations. But now there’s an operator new definition that calls the arena (and prints out how many bytes were allocated). Now whenever new is called on a subclass of chainable, its memory will be drawn from the arena.

The arena winds up being 8-byte aligned because we always allocate objects and in clang++ and g++ at least, that produces 8-byte alignment. That’s good, because we need it for actually operating on the objects efficiently.

Stack of continuations

These closures are used as continuations to propagate the chain rule in reverse mode. We hold them in a stack just like before, which has a couple convenience methods.

std::vector<chainable*> STACK;                          // ** global **

inline void clear_stack() {
  STACK.clear();  // leaves capacity unchanged
}

inline void push_chainable(chainable* c) {
  STACK.push_back(c);
}

Note again that clear() does not affect capacity. So the stack will keep growing until it doesn’t need to grow any more and then stay at that size until its memory is freed directly.

Autodiff variables and the ids behave exactly as before

index_t NEXT_ID = 0;                                    // ** global **

inline index_t next_id() {
  return NEXT_ID++;
}

struct var {
  real_t val_;
  int id_;
  var(const var& v) : val_(v.val_), id_(v.id_) { }
  var(real_t val, int id) : val_(val), id_(id) { }
  var(real_t val) : val_(val), id_(next_id()) { }
  var() : val_(std::numeric_limits<real_t>::quiet_NaN()), id_(next_id()) { }
  var& operator+=(const var& x);
  var& operator+=(double x);
};

We’ve added operator+= declarations for var that will be defined out of the class. The class is still incomplete compared to a full arithmetic class.

Defining functions and continuations

Function definitions are just like in the last instance—the arena memory management remains implicit.

struct exp_v : public chainable {
  index_t x1, y;
  real_t yv;
  exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
  void operator()(vec_t& adj) const {
    adj[x1] += yv * adj[y];
  }
};
inline var exp(const var& x) {
  var y(std::exp(x.val_));
  push_chainable(new exp_v(x.id_, y.id_, y.val_));
  return y;
}

The compound add-assign operators are defined using the existing add_vv closure type,

inline var& var::operator+=(const var& x) {
  val_ += x.val_;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_vv(old_id, x.id_, id_));
  return *this;
}

The argument for the object being operated on is implicit. Thus val_ refers to the value of that object, whereas x.val_ picks out the value of the argument. The first step updates the object’s value by incrementing it. Then the old index is saved and a new one is generated. The old and new indexes are used to construct the add_vv instance, which is pushed onto the stack of continuations. Then a reference to the object is returned by return *this in order to match the return type.

The one for double values is similar, but uses add_v.

inline var& var::operator+=(double x) {
  val_ += x;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_v(old_id, id_));
  return *this;
}

Gradient calculation functional

The basic gradient calculation is just like last time,

void grad(const var& y, const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  std::vector<real_t> adj(y.id_ + 1, 0.0);
  adj[y.id_] = 1;
  for (auto&& chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}

But now we will define the interface that Stan actually uses. This uses the std::function argument because the storage cost isn’t high and it avoids templating on the functor type (i.e., type of f).

void gradient(const std::function<var(const std::vector<var>&)>& f,
              const std::vector<real_t>& x,
              real_t& y, std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);
  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);
}

This takes an argument function f that operates on a vector<var> to return a var. Here is the example of such a function we use in the evaluation:

struct my_fun {
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    return exp(x[0]) * x[1] * x[1];
  }
};

An instance of my_fun will be assignable to std::function<var(const std::vector<var>&)>. Note that the user here just writes my_fun and never has to refer to autodiff types—everything’s just implicit in the template parameter T.

Going back to the gradient function, it takes the functor as an argument, along with a real-valued vector of arguments, a mutable real value for the result, and a mutable vector of real values for the gradient.

The first step is to clear the memory. The clear function is simple, clearing the stack, the arena, and setting the next identifier back to zero.

inline void clear_memory() {
  NEXT_ID = 0;
  clear_stack();
  clear_arena();
}

The rest of the function first creates var instances of the argument scalar types.

  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);

By using emplace, the new var is constructed in place in the vector’s memory rather than being copied.

Then it applies the function, extracts the value, and calls the other gradient function to calculate the gradient.

  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);

Running it all

From a user’s perspective, it’s just a matter of defining a functor to differentiate, which I’ll repeat here again:

struct my_fun {
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    return exp(x[0]) * x[1] * x[1];
  }
};

Then we can call it by setting up the argument vectors and result types, instantiating the functor f, then calling the gradient functional.

int main() {
  using agrad::var;
  using std::vector;
  using agrad::vec_t;
  vec_t x = { 5.2, -3.9 };
  vec_t dy_dx;
  double y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;

  return 0;
}

That’s it. I believe this will be faster than how we do autodiff now. It’s smaller and more memory local for most operations, so it should be faster.

Next up: matrix types

The real target is matrix types. I haven’t written this part yet, but the idea is to have a matrix_var that stores a MatrixXd value and int[] of ids.

8 Likes