A new continuation-based autodiff by refactoring

This topic is old, so apologies if you all have already moved on. If not, then here’s my best effort in response to:

I think there are two issues. The first revolves around the size of y. In previous Parts, y held the scalar return value of f(x). In Part IV, f(x) no longer returns a scalar, so y needs to adapt. The second stems from vector_scalar_multiply_vv wrapping multiple differentiable operations into one chainable call. I believe these two issues together produce the indexing problem around adj.

The code below attempts a solution by changing y to hold the outcome of each differentiable operation of f(). Edits show up in grad, gradient, multiply, my_fun, and main, where the code I replace is commented out just above the new code.

I’m not arguing this is the best solution; in fact, I’m just hoping it is a solution. I’m happy to put more effort into this if people are still interested.
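To make the adj indexing concrete, here is the heart of the change as it shows up in grad below. One caveat worth flagging: seeding every output adjoint with 1 in a single reverse sweep computes the gradient of the sum of the outputs, d/dx sum_i y_i, not the full Jacobian; recovering row k of the Jacobian would mean seeding only adj[y.id(k)] and re-running the sweep.

// scalar return value (earlier Parts):
std::vector<real_t> adj(y.id_ + 1, 0.0);
adj[y.id_] = 1;

// vector return value (this post): size by the largest output id, seed all
std::vector<real_t> adj(y.id_[y.N_ - 1] + 1, 0.0);
for (int i = 0; i < y.N_; ++i)
  adj[y.id(i)] = 1;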

#include <Eigen/Eigen>
#include <algorithm>
#include <cmath>
#include <cstdlib>      // malloc/free
#include <functional>   // std::function, used by gradient below
#include <initializer_list>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

namespace agrad {

// CONFIGURABLE SIZES

typedef unsigned int index_t;          // determines max expression size
typedef double real_t;                 // basic scalar type
typedef std::vector<real_t> vec_t;     // vector of scalars
typedef Eigen::Matrix<real_t, -1, 1> real_vector_t;
typedef Eigen::Matrix<index_t, -1, 1> index_vector_t;


// HEAP MEMORY MANAGEMENT

template <typename T>
inline T* array_malloc(std::size_t n) {
  return static_cast<T*>(malloc(sizeof(T) * n));
}

template <typename T>
inline T* copy_array_malloc(const T* x, index_t N) {
  T* y = array_malloc<T>(N);
  std::copy(x, x + N, y);
  return y;
}
template <typename T, int R, int C>
inline T* copy_array_malloc(const Eigen::Matrix<T, R, C>& x) {
  T* y = array_malloc<T>(x.size());
  // Matrix storage is contiguous, so std::copy suffices
  std::copy(&x[0], &x[0] + x.size(), y);
  return y;
}
template <typename M>
inline typename M::Scalar* copy_array_malloc(const Eigen::DenseBase<M>& x) {
  typename M::Scalar* y = array_malloc<typename M::Scalar>(x.size());
  // can't use std::copy here: a general DenseBase expression need not have
  // contiguous storage, so we go through x(n) access
  for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
  return y;
}


// ARENA MEMORY MANAGEMENT

std::size_t MEM_SIZE = 1024 * 1024 * 1024;  // 1 GB

char* ARENA = static_cast<char*>(malloc(MEM_SIZE));      // ** global **

char* ARENA_NEXT = ARENA;                                // ** global **

inline void clear_arena() {
  ARENA_NEXT = ARENA;
}

inline void* arena_malloc(std::size_t nbytes) {
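  // note: no check that we stay within MEM_SIZE and no alignment padding;
  // fine for this sandbox, but a production arena would need both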
  char* bytes = ARENA_NEXT;
  ARENA_NEXT += nbytes;
  return bytes;
}

template <typename T>
inline T* array_arena_malloc(std::size_t n) {
  return static_cast<T*>(arena_malloc(sizeof(T) * n));
}

template <typename T>
inline T* copy_array_arena_malloc(const T* x, index_t N) {
  T* y = array_arena_malloc<T>(N);
  std::copy(x, x + N, y);
  return y;
}
template <typename T, int R, int C>
inline T* copy_array_arena_malloc(const Eigen::Matrix<T, R, C>& x) {
  T* y = array_arena_malloc<T>(x.size());
  // Matrix storage is contiguous, so std::copy suffices
  std::copy(&x[0], &x[0] + x.size(), y);
  return y;
}
template <typename M>
inline typename M::Scalar* copy_array_arena_malloc(const Eigen::DenseBase<M>& x) {
  typename M::Scalar* y = array_arena_malloc<typename M::Scalar>(x.size());
  for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
  return y;
}


// STACK OF CHAINABLES FOR REVERSE PASS IN ARENA
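// Each chainable is one continuation for the reverse pass: operator()(adj)
// propagates adjoints from its output id(s) back to its operands' id(s).
// The overloaded operator new places every chainable in the arena, so the
// whole stack is reclaimed by resetting a single pointer in clear_arena().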

struct chainable {
  virtual void operator()(vec_t& adj) const { }
  static inline void* operator new(std::size_t nbytes) {
    return arena_malloc(nbytes);
  }
};

std::vector<chainable*> STACK;                          // ** global **

inline void clear_stack() {
  STACK.clear();  // leaves capacity unchanged
}
inline void push_chainable(chainable* c) {
  STACK.push_back(c);
}

index_t NEXT_ID = 0;                                    // ** global **

inline index_t next_id() {
  return NEXT_ID++;
}

inline index_t* next_id_array_malloc(int N) {
  index_t* y = array_malloc<index_t>(N);
  for (index_t n = 0; n < N; ++n) y[n] = next_id();
  return y;
}



// CLEAR MEMORY (RESETS, BUT DOES NOT FREE)

inline void clear_memory() {
  NEXT_ID = 0;
  clear_stack();
  clear_arena();
}


// AUTODIFF VARIABLE

struct var {
  real_t val_;
  index_t id_;
  ~var() = default;
  var() = default;
  var(const var& v) = default;
  var(real_t val, index_t id) : val_(val), id_(id) { }
  var(real_t val) : var(val, next_id()) { }
  var& operator=(const var& x) {
    val_ = x.val_;
    id_ = x.id_;
    return *this;
  }
  var& operator=(double x) {
    val_ = x;
    id_ = next_id();
    return *this;
  }
  var& operator+=(const var& x);
  var& operator+=(real_t x);
};


struct vector_var {
  int N_;
  real_t* val_;
  index_t* id_;

  vector_var() = default;

  // Eigen reference constructor (const Ref<const ...> so expression args bind)
  vector_var(const Eigen::Ref<const Eigen::Matrix<real_t, -1, 1>>& v)
      : N_(v.size()),
        val_(copy_array_malloc(v)),
        id_(next_id_array_malloc(v.size())) { }

  // argument copied by value on purpose
  vector_var(std::initializer_list<real_t> v)
      : N_(v.size()),
        val_(array_malloc<real_t>(v.size())),
        id_(next_id_array_malloc(v.size())) {
    std::copy(v.begin(), v.end(), val_);  // copy the values into fresh storage
  }

  // // Eigen base constructor
  template <typename M>
  vector_var(const Eigen::DenseBase<M>& v)
      : N_(v.size()),
        val_(copy_array_malloc(v)),
        id_(next_id_array_malloc(v.size())) { }

  // copy constructor
  vector_var(const vector_var& v)
      : N_(v.N_),
        val_(copy_array_malloc(v.val_, v.N_)),
        id_(copy_array_malloc(v.id_, v.N_)) {
  }

  // move constructor (shallow copy in initializers, nullify source in body)
  // can this be = default with same behavior?
  vector_var(vector_var&& v) noexcept
      : N_(v.N_), val_(v.val_), id_(v.id_) {
    v.N_ = 0;
    v.val_ = nullptr;
    v.id_ = nullptr;
  }
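  // (answering the question above: = default would copy the pointers but
  // leave them set in the source, so both destructors would free the same
  // memory; the explicit nulling is what makes the move safe)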

  // allocated, but uninitialized (like Eigen, unlike std::vector)
  vector_var(size_t N)
      : N_(N), val_(array_malloc<real_t>(N)), id_(array_malloc<index_t>(N)) { }

  // destructor frees memory to complete RAII pattern
  ~vector_var() { free(val_); free(id_); }

  // copy and move assignment are still necessary alongside the move
  // constructor: the defaults would shallow-copy the pointers, leaking
  // this object's storage and double-freeing the source's

  // copy assignment (frees existing storage to avoid a leak)
  vector_var& operator=(const vector_var& v) {
    if (this != &v) {
      free(val_);
      free(id_);
      N_ = v.N_;
      val_ = copy_array_malloc(v.val_, v.N_);
      id_ = copy_array_malloc(v.id_, v.N_);
    }
    return *this;
  }
  // move assignment (destructive; see move constructor)
  vector_var& operator=(vector_var&& v) noexcept {
    free(val_);
    free(id_);
    N_ = v.N_;
    val_ = v.val_;
    v.val_ = nullptr;  // nullify the source so its destructor won't free this
    id_ = v.id_;
    v.id_ = nullptr;
    return *this;
  }

  index_t size() const {
    return N_;
  }

  real_t val(index_t i) const {
    return val_[i];
  }
  decltype(auto) val() const {
    return Eigen::Map<real_vector_t>(val_, N_);
  }

  index_t id(index_t i) const {
    return id_[i];
  }
  decltype(auto) id() const {
    return Eigen::Map<index_vector_t>(id_, N_);
  }

  // std::vector-style indexing
  var operator[](index_t i) const {
    return var(val(i), id(i));
  }

  // Eigen::Vector-style indexing
  var operator()(index_t i) const {
    return var(val(i), id(i));
  }

  void set(index_t i, real_t v) {
    val_[i] = v;
    id_[i] = next_id();
  }
  void set(index_t i, const var& v) {
    val_[i] = v.val_;
    id_[i] = v.id_;
  }
};

// Memory cost per chainable, this design vs. the current vari-pointer design
// (4-byte index_t, 8-byte real_t):
//   here:    (2N + 1) index_t + (N + 1) real_t + 1 size field = 16N + 28 bytes
//   current: N pointers to y, N pointers to x1, 1 pointer to x2, a length (8),
//            and a vtable pointer = 16N + 32 bytes
//            (+ the vari being pointed to, but consider that a wash)
struct vector_scalar_multiply_vv : public chainable {
  const index_t N_;
  const index_t* x1_id_;
  const index_t x2_id_;
  const index_t* y_id_;
  real_t* x1_val_;
  real_t x2_val_;
  vector_scalar_multiply_vv(const vector_var& x1,
                            const var& x2,
                            const vector_var& y)
      : N_(x1.N_),
        x1_id_(copy_array_arena_malloc<index_t>(x1.id_, x1.N_)),
        x2_id_(x2.id_),
        y_id_(array_arena_malloc<index_t>(y.N_)),
        x1_val_(copy_array_arena_malloc<real_t>(x1.val_, x1.N_)),
        x2_val_(x2.val_)
  { }

  void operator()(vec_t& adj) const {
    for (int n = 0; n < N_; ++n) {
      std::cout << "\n**** n = " << n << std::endl;
      std::cout << "x1_id_[n] = " << x1_id_[n] << std::endl;
      std::cout << "y_id_[n] = " << y_id_[n] << std::endl;
      std::cout << "x2_val_ = " << x2_val_ << std::endl;
      std::cout << "x1_val_[n] = " << x1_val_[n] << std::endl;
      adj[x1_id_[n]] += adj[y_id_[n]] * x2_val_;
      adj[x2_id_] += adj[y_id_[n]] * x1_val_[n];
    }
  }
};
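// note: as written, y_id_ above is allocated but never filled in, and this
// chainable is no longer pushed anywhere in this version (multiply below
// pushes one multiply_vv per element instead); the follow-up post fixes
// y_id_ directly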

// inline vector_var multiply(const vector_var& x1, var& x2) {
//   vector_var y(x1.val() * x2.val_);
//   push_chainable(new vector_scalar_multiply_vv(x1, x2, y));
//   return y;
// }

// REVERSE PASS GRADIENT CALCULATIONS (PURE CONTINUATIONS)

void grad(//const var& y,
          const vector_var& y,
          const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  // std::vector<real_t> adj(y.id_ + 1, 0.0);
  // adj[y.id_] = 1;
  // size adj by y's largest id; this assumes y carries the most recently
  // assigned ids, i.e., that f returns the result of its final operation
  std::vector<real_t> adj(y.id_[y.N_ - 1] + 1, 0.0);
  for (int i = 0; i < y.N_; ++i)
    adj[y.id(i)] = 1;
  for (auto&& chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}

/**
 * Set the values y = f(x) and, by seeding every output adjoint with 1,
 * the derivatives dy_dx = d/dx sum(f(x)).
 */
void gradient(//const std::function<var(const std::vector<var>&)>&& f,
              const std::function<vector_var(const std::vector<var>&)>&& f,
              const std::vector<real_t>& x,
              // real_t& y,
              std::vector<real_t>& y,
              std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);
  // var yv = f(xv);
  vector_var yv = f(xv);
  y.resize(yv.N_);
  // y = yv.val_;
  for (int i = 0; i < yv.N_; ++i)
    y[i] = yv.val(i);
  grad(yv, xv, dy_dx);
}


// ADDITION

// allocated in arena
struct add_vv : public chainable {
  index_t x1, x2, y;
  add_vv(index_t x1, index_t x2, index_t y) : x1(x1), x2(x2), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
    adj[x2] += adj[y];
  }
};
struct add_v : public chainable {
  index_t x1, y;
  add_v(index_t x1, index_t y) : x1(x1), y(y) { }
  void operator()(vec_t& adj) const {
    adj[x1] += adj[y];
  }
};
inline var operator+(const var& x1, const var& x2) {
  var y(x1.val_ + x2.val_);
  push_chainable(new add_vv(x1.id_, x2.id_, y.id_));
  return y;
}
inline var operator+(const var& x1, double x2) {
  var y(x1.val_ + x2);
  push_chainable(new add_v(x1.id_, y.id_));
  return y;
}
inline var operator+(double x1, const var& x2) {
  return x2 + x1;
}


// COMPOUND ASSIGN-ADD

inline var& var::operator+=(const var& x) {
  val_ += x.val_;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_vv(old_id, x.id_, id_));
  return *this;
}
inline var& var::operator+=(double x) {
  val_ += x;
  index_t old_id = id_;
  id_ = next_id();
  push_chainable(new add_v(old_id, id_));
  return *this;
}


// MULTIPLICATION

// allocated in arena
struct multiply_vv : public chainable {
  index_t x1, x2, y;
  real_t x1v, x2v;
  multiply_vv(index_t x1, index_t x2, index_t y, real_t x1v, real_t x2v)
      : x1(x1), x2(x2), y(y), x1v(x1v), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
    adj[x2] += x1v * adj[y];
  }
};

inline vector_var multiply(const vector_var& x1, const var& x2) {
  vector_var y(x1.val() * x2.val_);
  for (int i = 0; i < y.N_; ++i) {
    push_chainable(new multiply_vv(x1.id(i), x2.id_, y.id(i), x1.val(i), x2.val_));
  }
  return y;
}
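// note: this version avoids vector_scalar_multiply_vv entirely by pushing
// one scalar multiply_vv continuation per element; the follow-up post below
// restores the single vectorized continuation instead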

struct multiply_v : public chainable {
  index_t x1, y;
  real_t  x2v;
  multiply_v(index_t x1, index_t y, real_t x2v) : x1(x1), y(y), x2v(x2v) { }
  void operator()(vec_t& adj) const {
    adj[x1] += x2v * adj[y];
  }
};
inline var operator*(const var& x1, const var& x2) {
  var y(x1.val_ * x2.val_);
  push_chainable(new multiply_vv(x1.id_, x2.id_, y.id_, x1.val_, x2.val_));
  return y;
}
inline var operator*(const var& x1, double x2) {
  var y(x1.val_ * x2);
  push_chainable(new multiply_v(x1.id_, y.id_, x2));
  return y;
}
inline var operator*(double x1, const var& x2) {
  return x2 * x1;
}

// EXPONENTIATION

struct exp_v : public chainable {
  index_t x1, y;
  real_t yv;
  exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
  void operator()(vec_t& adj) const {
    adj[x1] += yv * adj[y];
  }
};
inline var exp(const var& x) {
  var y(std::exp(x.val_));
  push_chainable(new exp_v(x.id_, y.id_, y.val_));
  return y;
}


} // namespace agrad


// EXAMPLE

struct my_fun {
  int n_;
  my_fun(int n) : n_(n) { }
  my_fun() : my_fun(0) { }
  template <typename T>
  // T operator()(const std::vector<T>& x) const {
  agrad::vector_var operator()(const std::vector<T>& x) const {
    T x1 = x[0];
    agrad::vector_var x2(x.size() - 1);
    for (size_t i = 0; i < x2.size(); ++i) x2.set(i, x[i + 1]);
    // return agrad::multiply(x2, x1)(n_);
    return agrad::multiply(x2, x1);
  }
};

int main() {
  using agrad::var;
  using std::vector;
  using agrad::vec_t;
  vec_t x = { 5.2, -3.9, 1.2 };
  vec_t dy_dx;
  // double y;
  std::vector<agrad::real_t> y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  // std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < y.size(); ++i)
    std::cout << "y[" << i << "] = " << y[i] << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;

  agrad::vector_var v = { 1.2, 3.9 };

  return 0;
}

I couldn’t keep from thinking about / working on this. What follows is a much cleaner solution than my previous post, in that it requires fewer changes to the original code in Part IV. Below are just the pieces that change relative to the code in Part IV, with the old code commented out.

struct vector_scalar_multiply_vv : public chainable {
  const index_t N_;
  const index_t* x1_id_;
  const index_t x2_id_;
  const index_t* y_id_;
  real_t* x1_val_;
  real_t x2_val_;
  vector_scalar_multiply_vv(const vector_var& x1,
                            const var& x2,
                            const vector_var& y)
      : N_(x1.N_),
        x1_id_(copy_array_arena_malloc<index_t>(x1.id_, x1.N_)),
        x2_id_(x2.id_),
        // y_id_(array_arena_malloc<index_t>(y.N_)),
        y_id_(copy_array_arena_malloc<index_t>(y.id_, y.N_)),
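        // copying y's actual ids is the fix: the reverse pass indexes
        // adj[y_id_[n]], which must refer to the seeded output slots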
        x1_val_(copy_array_arena_malloc<real_t>(x1.val_, x1.N_)),
        x2_val_(x2.val_)
  { }

  void operator()(vec_t& adj) const {
    for (int n = 0; n < N_; ++n) {
      std::cout << "\n**** n = " << n << std::endl;
      std::cout << "x1_id_[n] = " << x1_id_[n] << std::endl;
      std::cout << "y_id_[n] = " << y_id_[n] << std::endl;
      std::cout << "x2_val_ = " << x2_val_ << std::endl;
      std::cout << "x1_val_[n] = " << x1_val_[n] << std::endl;
      adj[x1_id_[n]] += adj[y_id_[n]] * x2_val_;
      adj[x2_id_] += adj[y_id_[n]] * x1_val_[n];
    }
  }
};

void grad(//const var& y,
          const vector_var& y,
          const std::vector<var>& x,
          std::vector<real_t>& dy_dx) {
  // std::vector<real_t> adj(y.id_ + 1, 0.0);
  // adj[y.id_] = 1;
  std::vector<real_t> adj(y.id_[y.N_ - 1] + 1, 0.0);
  for (int i = 0; i < y.N_; ++i)
    adj[y.id_[i]] = 1;
  for (auto&& chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
    (**chain_f)(adj);
  dy_dx.clear();
  for (auto&& xi : x)
    dy_dx.push_back(adj[xi.id_]);
}

void gradient(//const std::function<var(const std::vector<var>&)>&& f,
              const std::function<vector_var(const std::vector<var>&)>&& f,
              const std::vector<real_t>& x,
              // real_t& y,
              std::vector<real_t>& y,
              std::vector<real_t>& dy_dx) {
  clear_memory();
  std::vector<var> xv;
  xv.reserve(x.size());
  for (auto&& xi : x) xv.emplace_back(xi);
  // var yv = f(xv);
  vector_var yv = f(xv);
  y.resize(yv.N_);
  // y = yv.val_;
  for (int i = 0; i < yv.N_; ++i)
    y[i] = yv.val(i);
  grad(yv, xv, dy_dx);
}


struct my_fun {
  int n_;
  my_fun(int n) : n_(n) { }
  my_fun() : my_fun(0) { }
  template <typename T>
  // T
  agrad::vector_var operator()(const std::vector<T>& x) const {
    T x1 = x[0];
    agrad::vector_var x2(x.size() - 1);
    for (size_t i = 0; i < x2.size(); ++i) x2.set(i, x[i + 1]);
    // return agrad::multiply(x2, x1)(n_);
    // multiply takes (vector, scalar), so the scalar argument goes second
    return agrad::multiply(agrad::multiply(x2, x1), x1);
  }
};

int main() {
  using agrad::var;
  using std::vector;
  using agrad::vec_t;
  vec_t x = { 5.2, -3.9, 1.2 };
  vec_t dy_dx;
  // double y;
  std::vector<agrad::real_t> y;
  my_fun f;
  agrad::gradient(f, x, y, dy_dx);
  // std::cout << "y = f(x) = " << y << std::endl;
  for (std::size_t i = 0; i < y.size(); ++i)
    std::cout << "y[" << i << "] = " << y[i] << std::endl;
  for (std::size_t i = 0; i < x.size(); ++i)
    std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;

  agrad::vector_var v = { 1.2, 3.9 };

  return 0;
}

This was here mostly as a thought experiment about how to think about autodiff. Specifically, I was trying to figure out whether there’s a way to deal with vectors and matrices that doesn’t involve so much copying and that can use efficient matrix operations. So I’m still worried about things like vector_scalar_multiply_vv, which seems to copy all of its inputs and to use non-vectorized operations for the adjoints.
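For what it’s worth, the elementwise loop in vector_scalar_multiply_vv could be vectorized when each id array is a contiguous run of slots in adj (as with ids handed out by next_id_array_malloc, which are consecutive by construction); nothing in the types enforces this, so treat the sketch below as an untested assumption rather than a drop-in replacement:

void operator()(vec_t& adj) const {
  // map contiguous slices of adj and the saved values as Eigen vectors
  Eigen::Map<Eigen::VectorXd> x1_adj(&adj[x1_id_[0]], N_);
  Eigen::Map<const Eigen::VectorXd> y_adj(&adj[y_id_[0]], N_);
  Eigen::Map<const Eigen::VectorXd> x1_val(x1_val_, N_);
  x1_adj += x2_val_ * y_adj;         // adjoint of x1: x2 * adj(y)
  adj[x2_id_] += y_adj.dot(x1_val);  // adjoint of x2: x1 . adj(y)
}

It doesn’t remove the copying, though; the chainable still has to own its inputs’ values once the operands go out of scope.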

Thanks. It’s nice to have sandbox problems!

The adj_jac_apply stuff could be used to replace multiply.hpp in rev and a bunch of other files. The code would look simpler, but I’m not sure it would be faster.

It should be faster as there should be fewer virtual function calls and less memory allocated and visited.

Some of our matrix stuff was already implemented using adj_jac_apply techniques, just not the function itself.
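For readers who haven’t met adj_jac_apply: the idea is that a function supplies its forward computation plus one multiply_adjoint_jacobian callback that applies the transposed Jacobian to the output adjoints, so the framework pushes a single vari per function call rather than one per output element. A rough sketch of the functor shape, reconstructed from memory of the Stan Math docs; the names and signatures here are approximations, so check the actual headers before relying on them:

struct ScaleFunctor {
  Eigen::VectorXd x1_;  // a real implementation saves this in the arena
  double x2_;
  // forward pass: compute y = x2 * x1 and stash what the reverse pass needs
  template <std::size_t size>
  Eigen::VectorXd operator()(const std::array<bool, size>& needs_adj,
                             const Eigen::VectorXd& x1, const double& x2) {
    x1_ = x1;
    x2_ = x2;
    return x2 * x1;
  }
  // reverse pass: adjoint-Jacobian products for both arguments at once
  // (a real implementation would check needs_adj before computing each term)
  template <std::size_t size>
  std::tuple<Eigen::VectorXd, double> multiply_adjoint_jacobian(
      const std::array<bool, size>& needs_adj, const Eigen::VectorXd& adj) {
    return std::make_tuple((x2_ * adj).eval(), x1_.dot(adj));
  }
};
// hypothetical usage: auto y = stan::math::adj_jac_apply<ScaleFunctor>(x1v, x2v);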