A new continuation-based autodiff by refactoring


#25

Part IV: Matrices and Matrix Functions

I haven’t quite had time to get this working, but the basic organization’s there of how it’ll work. I tried to make the enhancements others suggested in earlier responses like automatic constructor definitions, and I added a few of my own like brace initialization.

If you can see why the vector autodiff isn’t working yet, please let me know!

The idea is just like ordinary autodiff—the vector stores a vector of values and a vector of int indexes. This makes getting and setting both very easy, though I haven’t been able to figure out how to do reference sets without creating expression templates with back pointers to the originals. Then the autodiff is exactly the same.

By my rough calculation, with integer indexes, the memory used should be about the same as for the current approach, but with benefits of much better memory locality and no need to copy out vector or matrix values (they can be used directly with Eigen::Map).

I’m still a little hazy on how the Eigen reference and base class pointer arguments work and which gets instantiated in the constructors and other classes. Or which is more efficient, so if anyone knows that, please let me know.

#include <Eigen/Eigen>
#include <algorithm>
#include <cmath>
#include <initializer_list>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

namespace agrad {

// CONFIGURABLE SIZES

typedef unsigned int index_t;          // determines max expression size
typedef double real_t;                 // basic scalar type
typedef std::vector<real_t> vec_t;     // vector of scalars
typedef Eigen::Matrix<real_t, -1, 1> real_vector_t;
typedef Eigen::Matrix<index_t, -1, 1> index_vector_t;


// HEAP MEMORY MANAGEMENT

template <typename T>
inline T* array_malloc(std::size_t n) {
  return static_cast<T*>(malloc(sizeof(T) * n));
}

template <typename T>
inline T* copy_array_malloc(const T* x, index_t N) {
  T* y = array_malloc<T>(N);
  std::copy(x, x + N, y);
  return y;
}
template <typename T, int R, int C>
inline T* copy_array_malloc(const Eigen::Matrix<T, R, C>& x) {
  T* y = array_malloc<T>(x.size());
  std::copy(&x[0], &x[0] + x.size(), y);
  for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
  return y;
}
template <typename M>
inline typename M::Scalar* copy_array_malloc(const Eigen::DenseBase<M>& x) {
  typename M::Scalar* y = array_malloc<typename M::Scalar>(x.size());
  // can't use std::copy because not knowing M requires x(n) access
  for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
  return y;
}


// ARENA MEMORY MANAGEMENT

std::size_t MEM_SIZE = 1024 * 1024 * 1024;  // 1 GB

char* ARENA = static_cast<char*>(malloc(MEM_SIZE));      // ** global **

char* ARENA_NEXT = ARENA;                                // ** global **

inline void clear_arena() {
  ARENA_NEXT = ARENA;
}

inline void* arena_malloc(std::size_t nbytes) {
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += nbytes;
  return bytes;
}

template <typename T>
inline T* array_arena_malloc(std::size_t n) {
  return static_cast<T*>(arena_malloc(sizeof(T) * n));
}

template <typename T>
inline T* copy_array_arena_malloc(const T* x, index_t N) {
  T* y = array_arena_malloc<T>(N);
  std::copy(x, x + N, y);
  return y;
}
template <typename T, int R, int C>
inline T* copy_array_arena_malloc(const Eigen::Matrix<T, R, C>& x) {
  T* y = array_arena_malloc<T>(x.size());
  std::copy(&x[0], &x[0] + x.size(), y);
  for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
  return y;
}
template <typename M>
inline typename M::Scalar* copy_array_arena_malloc(const Eigen::DenseBase<M>& x) {
  typename M::Scalar* y = array_arena_malloc<typename M::Scalar>(x.size());
  for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
  return y;
}


// STACK OF CHAINABLES FOR REVERSE PASS IN ARENA

struct chainable {
  virtual void operator()(vec_t& adj) const { }
  static inline void* operator new(std::size_t nbytes) {
    return arena_malloc(nbytes);
  }
};

std::vector<chainable*> STACK;                          // ** global **

inline void clear_stack() {
  STACK.clear();  // leaves capacity unchanged
}
inline void push_chainable(chainable* c) {
  STACK.push_back(c);
}

index_t NEXT_ID = 0;                                    // ** global **

inline index_t next_id() {
  return NEXT_ID++;
}

inline index_t* next_id_array_malloc(int N) {
  index_t* y = array_malloc<index_t>(N);
  for (index_t n = 0; n < N; ++n) y[n] = next_id();
  return y;
}



// CLEAR MEMORY (RESETS, BUT DOES NOT FREE)

inline void clear_memory() {
  NEXT_ID = 0;
  clear_stack();
  clear_arena();
}


// AUTODIFF VARIABLE

struct var {
  real_t val_;
  index_t id_;
  ~var() = default;
  var() = default;
  var(const var& v) = default;
  var(real_t val, index_t id) : val_(val), id_(id) { }
  var(real_t val) : var(val, next_id()) { }
  var& operator=(const var& x) {
    val_ = x.val_;
    id_ = x.id_;
    return *this;
  }
  var& operator=(double x) {
    val_ = x;
    id_ = next_id();
    return *this;
  }
  var& operator+=(const var& x);
  var& operator+=(real_t x);
};


struct vector_var {
  int N_;
  real_t* val_;
  index_t* id_;

  vector_var() = default;

  // Eigen reference constructor (allow template expression args)
  vector_var(const Eigen::Ref<Eigen::Matrix<real_t, -1, 1>>& v)
      : N_(v.size()),
        val_(copy_array_malloc(v)),
        id_(next_id_array_malloc(v.size())) { }

  // argument copies on purpose
  vector_var(std::initializer_list<real_t> v)
      : N_(v.size()),
        val_(array_malloc<real_t>(v.size())),
        id_(next_id_array_malloc(v.size())) {
  }

  // // Eigen base constructor
  template <typename M>
  vector_var(const Eigen::DenseBase<M>& v)
      : N_(v.size()),
        val_(copy_array_malloc(v)),
        id_(next_id_array_malloc(v.size())) { }

  // copy constructor
  vector_var(const vector_var& v)
      : N_(v.N_),
        val_(copy_array_malloc(v.val_, v.N_)),
        id_(copy_array_malloc(v.id_, v.N_)) {
  }

  // move constructor (shallow copy in initializers, nullify source in body)
  // can this be = default with same behavior?
  vector_var(vector_var&& v) noexcept
      : N_(v.N_), val_(v.val_), id_(v.id_) {
    v.N_ = 0;
    v.val_ = nullptr;
    v.id_ = nullptr;
  }

  // allocated, but unitialized (like Eigen, unlike std::vector)
  vector_var(size_t N)
      : N_(N), val_(array_malloc<real_t>(N)), id_(array_malloc<index_t>(N)) { }

  // destructor frees memory to complete RAII pattern
  ~vector_var() { free(val_); free(id_); }

  // are these still necessary with move constructor?

  // copy assignment
  vector_var& operator=(const vector_var& v) {
    N_ = v.N_;
    val_ = copy_array_malloc(v.val_, v.N_);
    id_ = copy_array_malloc(v.id_, v.N_);
    return *this;
  }
  // move assignment (destructive; see move constructor)
  vector_var& operator=(vector_var&& v) {
    N_ = v.N_;
    val_ = v.val_;
    val_ = nullptr;
    id_ = v.id_;
    id_ = nullptr;
    return *this;
  }

  index_t size() const {
    return N_;
  }

  real_t val(index_t i) const {
    return val_[i];
  }
  decltype(auto) val() const {
    return Eigen::Map<real_vector_t>(val_, N_);
  }

  index_t id(index_t i) const {
    return id_[i];
  }
  decltype(auto) id() const {
    return Eigen::Map<index_vector_t>(id_, N_);
  }

  // std::vector-style indexing
  var operator[](index_t i) const {
    return var(val(i), id(i));
  }

  // Eigen::Vector-style indexing
  var operator()(index_t i) const {
    return var(val(i), id(i));
  }

  void set(index_t i, real_t v) {
    val_[i] = v;
    id_[i] = next_id();
  }
  void set(index_t i, const var& v) {
    val_[i] = v.val_;
    id_[i] = v.id_;
  }
};

// index_t: (2 * N + 1)    real_t: (N + 1)   size pointer: 1
// bytes (int index_t, double real_t): 16N + 28
// current:  N pointers to y, N ptrs to x1,  pointer to x2, length (8), vtable
//           pointer: 2 N + 3  size: 2
//           bytes: 16N + 32  (+ vari being pointed to, but consider it a wash)
struct vector_scalar_multiply_vv : public chainable {
  const index_t N_;
  const index_t* x1_id_;
  const index_t x2_id_;
  const index_t* y_id_;
  real_t* x1_val_;
  real_t x2_val_;
  vector_scalar_multiply_vv(const vector_var& x1,
                            const var& x2,
                            const vector_var& y)
      : N_(x1.N_),
        x1_id_(copy_array_arena_malloc<index_t>(x1.id_, x1.N_)),
        x2_id_(x2.id_),
        y_id_(array_arena_malloc<index_t>(y.N_)),
        x1_val_(copy_array_arena_malloc<real_t>(x1.val_, x1.N_)),
        x2_val_(x2.val_)
  { }

  void operator()(vec_t& adj) const {
    for (int n = 0; n < N_; ++n) {
      std::cout << "\n**** n = " << n << std::endl;
      std::cout << "x1_id_[n] = " << x1_id_[n] << std::endl;
      std::cout << "y_id_[n] = " << y_id_[n] << std::endl;
      std::cout << "x2_val_ = " << x2_val_ << std::endl;
      std::cout << "x1_val_[n] = " << x1_val_[n] << std::endl;
      adj[x1_id_[n]] += adj[y_id_[n]] * x2_val_;
      adj[x2_id_] += adj[y_id_[n]] * x1_val_[n];
    }
  }
};

inline vector_var multiply(const vector_var& x1, var& x2) {
  vector_var y(x1.val() * x2.val_);
  push_chainable(new vector_scalar_multiply_vv(x1, x2, y));
  return y;
}




// REVERSE PASS GRADIENT CALCULATIONS (PURE CONTINUATIONS)

void grad(const var& y, const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  std::vector<real_t> adj(y.id_ + 1, 0.0);
  adj[y.id_] = 1;
  for (auto&& chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}

/**
 * Set the value y = f(x) and derivatives dy_dx = d/dx f(x).
 */
void gradient(const std::function<var(const std::vector<var>&)>&& f,
              const std::vector<real_t>& x,
              real_t& y, std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);
  var yv = f(xv);
  y = yv.val_;
  grad(yv, xv, dy_dx);
}


// ADDITION

// allocated in arena
struct add_vv : public chainable {
  index_t x1, x2, y;
  add_vv(index_t x1, index_t x2, index_t y) : x1(x1), x2(x2), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
    adj[x2] += adj[y];
  }
};
struct add_v : public chainable {
  index_t x1, y;
  add_v(index_t x1, index_t y) : x1(x1), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
  }
};
inline var operator+(const var& x1, const var& x2) {
  var y(x1.val_ + x2.val_);
  push_chainable(new add_vv(x1.id_, x2.id_, y.id_));
  return y;
}
inline var operator+(const var& x1, double x2) {
  var y(x1.val_ + x2);
  push_chainable(new add_v(x1.id_, y.id_));
  return y;
}
inline var operator+(double x1, const var& x2) {
  return x2 + x1;
}


// COMPOUND ASSIGN-ADD

inline var& var::operator+=(const var& x) {
  val_ += x.val_;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_vv(old_id, x.id_, id_));
  return *this;
}
inline var& var::operator+=(double x) {
  val_ += x;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_v(old_id, id_));
  return *this;
}


// MULTIPLICATION

// allocated in arena
struct multiply_vv : public chainable {
  index_t x1, x2, y;
  real_t x1v, x2v;
  multiply_vv(index_t x1, index_t x2, index_t y, real_t x1v, real_t x2v)
      : x1(x1), x2(x2), y(y), x1v(x1v), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
    adj[x2] += x1v * adj[y];
  }
};
struct multiply_v : public chainable {
  index_t x1, y;
  real_t  x2v;
  multiply_v(index_t x1, index_t y, real_t x2v) : x1(x1), y(y), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
  }
};
inline var operator*(const var& x1, const var& x2) {
  var y(x1.val_ * x2.val_);
  push_chainable(new multiply_vv(x1.id_, x2.id_, y.id_, x1.val_, x2.val_));
  return y;
}
inline var operator*(const var& x1, double x2) {
  var y(x1.val_ * x2);
  push_chainable(new multiply_v(x1.id_, y.id_, x2));
  return y;
}
inline var operator*(double x1, const var& x2) {
  return x2 * x1;
}

// EXPONENTIATION

struct exp_v : public chainable {
  index_t x1, y;
  real_t yv;
  exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
  void operator()(vec_t& adj) const {
    adj[x1] += yv * adj[y];
  }
};
inline var exp(const var& x) {
  var y(std::exp(x.val_));
  push_chainable(new exp_v(x.id_, y.id_, y.val_));
  return y;
}


} // namespace agrad


// EXAMPLE

struct my_fun {
  int n_;
  my_fun(int n) : n_(n) { }
  my_fun() : my_fun(0) { }
  template <typename T>
  T operator()(const std::vector<T>& x) const {
    T x1 = x[0];
    agrad::vector_var x2(x.size() - 1);
    for (size_t i = 0; i < x2.size(); ++i) x2.set(i, x[i + 1]);
    return agrad::multiply(x2, x1)(n_);
  }
};

int main() {
  using agrad::var;
  using std::vector;
  using agrad::vec_t;
  vec_t x = { 5.2, -3.9, 1.2 };
  vec_t dy_dx;
  double y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;

  agrad::vector_var v = { 1.2, 3.9 };

  return 0;
}