This topic is old, so apologies if you all have already moved on. If not, then here’s my best effort in response to:
I think there’s two issues. The first revolves around the size of y
. In previous Parts, y
held the scalar return value of f(x)
. In Part IV, f(x)
does not have a scalar return value and thus y
needs to adapt. The second issue stems from vector_scalar_multiply_vv
wrapping multiple differentiable operations into one chainable
call. I believe these two issues jointly imply an indexing issue around adj
.
The code below attempts a solution by changing y
to hold the outcome of each differentiable operation of f()
. Edits show up in grad
, gradient
, multiply
, my_fun
, and main
, where the code I replace is commented out just above the new code.
I’m not arguing this is the best solution, in fact I’m just hoping it is a solution. I’m happy to put more effort into this, if people are still interested.
#include <Eigen/Eigen>
#include <algorithm>
#include <cmath>
#include <initializer_list>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>
namespace agrad {
// CONFIGURABLE SIZES
typedef unsigned int index_t; // determines max expression size
typedef double real_t; // basic scalar type
typedef std::vector<real_t> vec_t; // vector of scalars
typedef Eigen::Matrix<real_t, -1, 1> real_vector_t;
typedef Eigen::Matrix<index_t, -1, 1> index_vector_t;
// HEAP MEMORY MANAGEMENT
template <typename T>
inline T* array_malloc(std::size_t n) {
return static_cast<T*>(malloc(sizeof(T) * n));
}
template <typename T>
inline T* copy_array_malloc(const T* x, index_t N) {
T* y = array_malloc<T>(N);
std::copy(x, x + N, y);
return y;
}
template <typename T, int R, int C>
inline T* copy_array_malloc(const Eigen::Matrix<T, R, C>& x) {
T* y = array_malloc<T>(x.size());
std::copy(&x[0], &x[0] + x.size(), y);
for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
return y;
}
template <typename M>
inline typename M::Scalar* copy_array_malloc(const Eigen::DenseBase<M>& x) {
typename M::Scalar* y = array_malloc<typename M::Scalar>(x.size());
// can't use std::copy because not knowing M requires x(n) access
for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
return y;
}
// ARENA MEMORY MANAGEMENT
std::size_t MEM_SIZE = 1024 * 1024 * 1024; // 1 GB
char* ARENA = static_cast<char*>(malloc(MEM_SIZE)); // ** global **
char* ARENA_NEXT = ARENA; // ** global **
inline void clear_arena() {
ARENA_NEXT = ARENA;
}
inline void* arena_malloc(std::size_t nbytes) {
char* bytes = ARENA_NEXT;
ARENA_NEXT += nbytes;
return bytes;
}
template <typename T>
inline T* array_arena_malloc(std::size_t n) {
return static_cast<T*>(arena_malloc(sizeof(T) * n));
}
template <typename T>
inline T* copy_array_arena_malloc(const T* x, index_t N) {
T* y = array_arena_malloc<T>(N);
std::copy(x, x + N, y);
return y;
}
template <typename T, int R, int C>
inline T* copy_array_arena_malloc(const Eigen::Matrix<T, R, C>& x) {
T* y = array_arena_malloc<T>(x.size());
std::copy(&x[0], &x[0] + x.size(), y);
for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
return y;
}
template <typename M>
inline typename M::Scalar* copy_array_arena_malloc(const Eigen::DenseBase<M>& x) {
typename M::Scalar* y = array_arena_malloc<typename M::Scalar>(x.size());
for (index_t n = 0; n < x.size(); ++n) y[n] = x(n);
return y;
}
// STACK OF CHAINABLES FOR REVERSE PASS IN ARENA
struct chainable {
virtual void operator()(vec_t& adj) const { }
static inline void* operator new(std::size_t nbytes) {
return arena_malloc(nbytes);
}
};
std::vector<chainable*> STACK; // ** global **
inline void clear_stack() {
STACK.clear(); // leaves capacity unchanged
}
inline void push_chainable(chainable* c) {
STACK.push_back(c);
}
index_t NEXT_ID = 0; // ** global **
inline index_t next_id() {
return NEXT_ID++;
}
inline index_t* next_id_array_malloc(int N) {
index_t* y = array_malloc<index_t>(N);
for (index_t n = 0; n < N; ++n) y[n] = next_id();
return y;
}
// CLEAR MEMORY (RESETS, BUT DOES NOT FREE)
inline void clear_memory() {
NEXT_ID = 0;
clear_stack();
clear_arena();
}
// AUTODIFF VARIABLE
struct var {
real_t val_;
index_t id_;
~var() = default;
var() = default;
var(const var& v) = default;
var(real_t val, index_t id) : val_(val), id_(id) { }
var(real_t val) : var(val, next_id()) { }
var& operator=(const var& x) {
val_ = x.val_;
id_ = x.id_;
return *this;
}
var& operator=(double x) {
val_ = x;
id_ = next_id();
return *this;
}
var& operator+=(const var& x);
var& operator+=(real_t x);
};
struct vector_var {
int N_;
real_t* val_;
index_t* id_;
vector_var() = default;
// Eigen reference constructor (allow template expression args)
vector_var(const Eigen::Ref<Eigen::Matrix<real_t, -1, 1>>& v)
: N_(v.size()),
val_(copy_array_malloc(v)),
id_(next_id_array_malloc(v.size())) { }
// argument copies on purpose
vector_var(std::initializer_list<real_t> v)
: N_(v.size()),
val_(array_malloc<real_t>(v.size())),
id_(next_id_array_malloc(v.size())) {
}
// // Eigen base constructor
template <typename M>
vector_var(const Eigen::DenseBase<M>& v)
: N_(v.size()),
val_(copy_array_malloc(v)),
id_(next_id_array_malloc(v.size())) { }
// copy constructor
vector_var(const vector_var& v)
: N_(v.N_),
val_(copy_array_malloc(v.val_, v.N_)),
id_(copy_array_malloc(v.id_, v.N_)) {
}
// move constructor (shallow copy in initializers, nullify source in body)
// can this be = default with same behavior?
vector_var(vector_var&& v) noexcept
: N_(v.N_), val_(v.val_), id_(v.id_) {
v.N_ = 0;
v.val_ = nullptr;
v.id_ = nullptr;
}
// allocated, but unitialized (like Eigen, unlike std::vector)
vector_var(size_t N)
: N_(N), val_(array_malloc<real_t>(N)), id_(array_malloc<index_t>(N)) { }
// destructor frees memory to complete RAII pattern
~vector_var() { free(val_); free(id_); }
// are these still necessary with move constructor?
// copy assignment
vector_var& operator=(const vector_var& v) {
N_ = v.N_;
val_ = copy_array_malloc(v.val_, v.N_);
id_ = copy_array_malloc(v.id_, v.N_);
return *this;
}
// move assignment (destructive; see move constructor)
vector_var& operator=(vector_var&& v) {
N_ = v.N_;
val_ = v.val_;
val_ = nullptr;
id_ = v.id_;
id_ = nullptr;
return *this;
}
index_t size() const {
return N_;
}
real_t val(index_t i) const {
return val_[i];
}
decltype(auto) val() const {
return Eigen::Map<real_vector_t>(val_, N_);
}
index_t id(index_t i) const {
return id_[i];
}
decltype(auto) id() const {
return Eigen::Map<index_vector_t>(id_, N_);
}
// std::vector-style indexing
var operator[](index_t i) const {
return var(val(i), id(i));
}
// Eigen::Vector-style indexing
var operator()(index_t i) const {
return var(val(i), id(i));
}
void set(index_t i, real_t v) {
val_[i] = v;
id_[i] = next_id();
}
void set(index_t i, const var& v) {
val_[i] = v.val_;
id_[i] = v.id_;
}
};
// index_t: (2 * N + 1) real_t: (N + 1) size pointer: 1
// bytes (int index_t, double real_t): 16N + 28
// current: N pointers to y, N ptrs to x1, pointer to x2, length (8), vtable
// pointer: 2 N + 3 size: 2
// bytes: 16N + 32 (+ vari being pointed to, but consider it a wash)
struct vector_scalar_multiply_vv : public chainable {
const index_t N_;
const index_t* x1_id_;
const index_t x2_id_;
const index_t* y_id_;
real_t* x1_val_;
real_t x2_val_;
vector_scalar_multiply_vv(const vector_var& x1,
const var& x2,
const vector_var& y)
: N_(x1.N_),
x1_id_(copy_array_arena_malloc<index_t>(x1.id_, x1.N_)),
x2_id_(x2.id_),
y_id_(array_arena_malloc<index_t>(y.N_)),
x1_val_(copy_array_arena_malloc<real_t>(x1.val_, x1.N_)),
x2_val_(x2.val_)
{ }
void operator()(vec_t& adj) const {
for (int n = 0; n < N_; ++n) {
std::cout << "\n**** n = " << n << std::endl;
std::cout << "x1_id_[n] = " << x1_id_[n] << std::endl;
std::cout << "y_id_[n] = " << y_id_[n] << std::endl;
std::cout << "x2_val_ = " << x2_val_ << std::endl;
std::cout << "x1_val_[n] = " << x1_val_[n] << std::endl;
adj[x1_id_[n]] += adj[y_id_[n]] * x2_val_;
adj[x2_id_] += adj[y_id_[n]] * x1_val_[n];
}
}
};
// inline vector_var multiply(const vector_var& x1, var& x2) {
// vector_var y(x1.val() * x2.val_);
// push_chainable(new vector_scalar_multiply_vv(x1, x2, y));
// return y;
// }
// REVERSE PASS GRADIENT CALCULATIONS (PURE CONTINUATIONS)
void grad(//const var& y,
const vector_var& y,
const std::vector<var>& x,
std::vector<real_t>& dy_dx) {
// std::vector<real_t> adj(y.id_ + 1, 0.0);
// adj[y.id_] = 1;
std::vector<real_t> adj(y.id_[y.N_ - 1] + 1, 0.0);
for (int i=0; i < y.N_; ++i)
adj[y.id(i)] = 1;
for (auto&& chain_f = STACK.crbegin(); chain_f != STACK.crend(); ++chain_f)
(**chain_f)(adj);
dy_dx.clear();
for (auto&& xi : x)
dy_dx.push_back(adj[xi.id_]);
}
/**
* Set the value y = f(x) and derivatives dy_dx = d/dx f(x).
*/
void gradient(//const std::function<var(const std::vector<var>&)>&& f,
const std::function<vector_var(const std::vector<var>&)>&& f,
const std::vector<real_t>& x,
// real_t& y,
std::vector<real_t>& y,
std::vector<real_t>& dy_dx) {
clear_memory();
std::vector<var> xv;
xv.reserve(x.size());
for (auto&& xi : x) xv.emplace_back(xi);
// var yv = f(xv);
vector_var yv = f(xv);
y.resize(yv.N_);
// y = yv.val_;
for (int i=0; i < yv.N_; ++i)
y[i] = yv.val(i);
grad(yv, xv, dy_dx);
}
// ADDITION
// allocated in arena
struct add_vv : public chainable {
index_t x1, x2, y;
add_vv(index_t x1, index_t x2, index_t y) : x1(x1), x2(x2), y(y) { }
void operator()(vec_t& adj) const {
adj[x1] += adj[y];
adj[x2] += adj[y];
}
};
struct add_v : public chainable {
index_t x1, y;
add_v(index_t x1, index_t y) : x1(x1), y(y) { }
void operator()(vec_t& adj) const {
adj[x1] += adj[y];
}
};
inline var operator+(const var& x1, const var& x2) {
var y(x1.val_ + x2.val_);
push_chainable(new add_vv(x1.id_, x2.id_, y.id_));
return y;
}
inline var operator+(const var& x1, double x2) {
var y(x1.val_ + x2);
push_chainable(new add_v(x1.id_, y.id_));
return y;
}
inline var operator+(double x1, const var& x2) {
return x2 + x1;
}
// COMPOUND ASSIGN-ADD
inline var& var::operator+=(const var& x) {
val_ += x.val_;
index_t old_id = id_;
id_ = next_id();
push_chainable(new add_vv(old_id, x.id_, id_));
return *this;
}
inline var& var::operator+=(double x) {
val_ += x;
index_t old_id = id_;
id_ = next_id();
push_chainable(new add_v(old_id, id_));
return *this;
}
// MULTIPLICATION
// allocated in arena
struct multiply_vv : public chainable {
index_t x1, x2, y;
real_t x1v, x2v;
multiply_vv(index_t x1, index_t x2, index_t y, real_t x1v, real_t x2v)
: x1(x1), x2(x2), y(y), x1v(x1v), x2v(x2v) { }
void operator()(vec_t& adj) const {
adj[x1] += x2v * adj[y];
adj[x2] += x1v * adj[y];
}
};
inline vector_var multiply(const vector_var& x1, var& x2) {
vector_var y(x1.val() * x2.val_);
for (int i=0; i < y.N_; ++i) {
push_chainable(new multiply_vv(x1.id(i), x2.id_, y.id(i), x1.val(i), x2.val_));
}
return y;
}
struct multiply_v : public chainable {
index_t x1, y;
real_t x2v;
multiply_v(index_t x1, index_t y, real_t x2v) : x1(x1), y(y), x2v(x2v) { }
void operator()(vec_t& adj) const {
adj[x1] += x2v * adj[y];
}
};
inline var operator*(const var& x1, const var& x2) {
var y(x1.val_ * x2.val_);
push_chainable(new multiply_vv(x1.id_, x2.id_, y.id_, x1.val_, x2.val_));
return y;
}
inline var operator*(const var& x1, double x2) {
var y(x1.val_ * x2);
push_chainable(new multiply_v(x1.id_, y.id_, x2));
return y;
}
inline var operator*(double x1, const var& x2) {
return x2 * x1;
}
// EXPONENTIATION
struct exp_v : public chainable {
index_t x1, y;
real_t yv;
exp_v(index_t x1, index_t y, real_t yv) : x1(x1), y(y), yv(yv) { }
void operator()(vec_t& adj) const {
adj[x1] += yv * adj[y];
}
};
inline var exp(const var& x) {
var y(std::exp(x.val_));
push_chainable(new exp_v(x.id_, y.id_, y.val_));
return y;
}
} // namespace agrad
// EXAMPLE
struct my_fun {
int n_;
my_fun(int n) : n_(n) { }
my_fun() : my_fun(0) { }
template <typename T>
// T operator()(const std::vector<T>& x) const {
agrad::vector_var operator()(const std::vector<T>& x) const {
T x1 = x[0];
agrad::vector_var x2(x.size() - 1);
for (size_t i = 0; i < x2.size(); ++i) x2.set(i, x[i + 1]);
// return agrad::multiply(x2, x1)(n_);
return agrad::multiply(x2, x1);
}
};
int main() {
using agrad::var;
using std::vector;
using agrad::vec_t;
vec_t x = { 5.2, -3.9, 1.2 };
vec_t dy_dx;
// double y;
std::vector<agrad::real_t> y;
my_fun f;
agrad::gradient(f, x, y, dy_dx);
// std::cout << "y = f(x) = " << y << std::endl;
for (int i=0; i < y.size(); ++i)
std::cout << "y[" << i << "] = " << y[i] << std::endl;
for (std::size_t i = 0; i < x.size(); ++i)
std::cout << "dy / dx[" << (i + 1) << "] = " << dy_dx[i] << std::endl;
agrad::vector_var v = { 1.2, 3.9 };
return 0;
}