Gradient after transformation (math library)

I would like to use the Stan math library to calculate gradients and Hessians for a problem I am working on. To this end, I tried to replicate the example on page 12 of the article “The Stan Math Library: Reverse-Mode Automatic Differentiation in C++” (call it TSML, found at https://arxiv.org/pdf/1509.07164.pdf), using functions. This worked for the gradient, and I also managed to calculate the Hessian:

C++ code
#include <Rcpp.h>
#include <RcppEigen.h>
#include <vector>
#include <iostream>

#include <stan/math.hpp>
#include <stan/math/prim/scal.hpp>
#include <stan/math/mix/mat.hpp>

#include <boost/math/tools/promotion.hpp>

// [[Rcpp::depends(RcppEigen)]]

using Eigen::Dynamic;
using Eigen::VectorXd;
using Eigen::MatrixXd;

using namespace Rcpp;

// From page 12 of https://arxiv.org/pdf/1509.07164.pdf
struct normal_ll {
  const VectorXd y_;
  normal_ll(const VectorXd& y) : y_(y) { }
  
  template <typename T>
  T operator()(const Eigen::Matrix<T, Dynamic, 1>& theta) const {
    T mu = theta[0];
    T sigma = theta[1];
    T lp = stan::math::normal_lpdf(y_, mu, sigma);
    return lp;
  }
  
  stan::math::var operator()(const Eigen::Matrix<stan::math::var, Dynamic, 1>& theta) const {
    stan::math::var mu = theta[0];
    stan::math::var sigma = theta[1];
    stan::math::var lp = normal_lpdf(y_, mu, sigma);
    return lp;
  }
};

// [[Rcpp::export]]
VectorXd normal_ll_grad(const VectorXd& y,
                        double mu,
                        double sigma) {
  normal_ll f(y);
  
  VectorXd x(2);
  x << mu, sigma;
  
  double fx;
  VectorXd grad_fx;
  stan::math::gradient(f, x, fx, grad_fx);
  
  return grad_fx;
}

// [[Rcpp::export]]
MatrixXd normal_ll_hess(const VectorXd& y,
                        double mu,
                        double sigma) {

  normal_ll f(y);

  VectorXd x(2);
  x << mu, sigma;

  double fx;
  VectorXd grad_fx;
  MatrixXd H;
  stan::math::hessian(f, x, fx, grad_fx, H);

  return H;
}
R code
library(Rcpp)

lib_path <- .libPaths()[1]
includes <- paste0('-I\"', lib_path, "/",
                   c("Rcpp", "RcppEigen", "StanHeaders", "BH"),
                   '/include\"')
Sys.setenv("PKG_CXXFLAGS" = paste(includes, collapse = " "))

sourceCpp("Test/try_hessian.cpp")

# Replicate example on page 12 of https://arxiv.org/pdf/1509.07164.pdf
normal_ll_grad(c(1.3, 2.7, -1.9), 1.3, 2.9)

# Now try Hessian
normal_ll_hess(c(1.3, 2.7, -1.9), 1.3, 2.9)

If I now try to do something similar for a function in which I transform the parameter vector before calculating the lpdf, my program crashes. The following C++ code follows that above, in the same file:

Crashing code
template <typename T1, typename T2>
T1 transform_mean(const T1& mu, const T2& coeffs) {
  Eigen::Matrix<T1, Dynamic, 1> res;
  res = stan::math::elt_multiply(mu, coeffs);
  return res;
}

struct posterior_lpdf {
  const VectorXd y_;
  const VectorXd m_;
  posterior_lpdf(const VectorXd& y, const VectorXd& m) : y_(y), m_(m) { }

  template <typename T>
  T operator()(const Eigen::Matrix<T, Dynamic, 1>& theta) const {

    Eigen::Matrix<T, Dynamic, 1> theta_means(theta.size() - 1);
    for (int i = 1; i < theta.size(); ++i) {
      theta_means << theta(i);
    }
    
    // The 3 lines below will allow the program to compile, but crash when run
    T sigma = theta[0];
    T lp = normal_lpdf(y_, theta_means, sigma);
    return lp;
    
    // The 4 lines below show what I would like to do, in this example
    // Eigen::Matrix<T, Dynamic, 1> mu = transform_mean(theta_means, m_);
    // T sigma = theta[0];
    // T lp = normal_lpdf(y_, mu, sigma);
    // return lp;
  }
};

// [[Rcpp::export]]
VectorXd posterior_grad(const VectorXd& y,
                        const VectorXd& m,
                        const VectorXd& mu,
                        const double sigma) {
  posterior_lpdf f(y, m);

  VectorXd x;
  x << sigma;
  for (int i = 0; i < m.size(); ++i) {
    x << mu[i];
  }
  
  double fx;
  VectorXd grad_fx;
  stan::math::gradient(f, x, fx, grad_fx);
  return grad_fx;
}
R code that makes it crash
posterior_grad(rnorm(5, 1:5, 0.2), 1:5, runif(5, 0.95, 1.05), 2)

What am I doing wrong here? How can I make a transformation (using both parameters and non-parameters, e.g. doubles) of the parameter vector before calculating the lpdf, and then get the gradient (and Hessian) with respect to the non-transformed parameters?

The program is segfaulting because of this section:

  VectorXd x;
  x << sigma;
  for (int i = 0; i < m.size(); ++i) {
    x << mu[i];
  }

You need to initialise x to a size before you can use the << operator (e.g. VectorXd x(N); for vector of size N).

Hope it helps!

1 Like

Ah, silly mistake. Thanks for pointing that out. Now, that did not completely fix the problem, because there was (I believe) some type casting problems with my original code. I think I managed to fix these problems as well, and for completeness, here is some example code that will compile and run:

C++ code
#include <Rcpp.h>
#include <RcppEigen.h>
#include <vector>
#include <iostream>

#include <stan/math.hpp>
#include <stan/math/prim/scal.hpp>
#include <stan/math/mix/mat.hpp>

#include <boost/math/tools/promotion.hpp>

// [[Rcpp::depends(RcppEigen)]]

using Eigen::Dynamic;
using Eigen::VectorXd;
using Eigen::MatrixXd;

using namespace Rcpp;

template <typename T1, typename T2, int N>
Eigen::Matrix<typename boost::math::tools::promote_args<T1, T2>::type, N, 1>
transform_mean(const Eigen::Matrix<T1, N, 1>& m1,
               const Eigen::Matrix<T2, N, 1>& m2) {
  Eigen::Matrix<typename boost::math::tools::promote_args<T1, T2>::type, N, 1> new_mean(m1.size());
  new_mean(0) = m1(0) + m2(0);
  for (int i = 1; i < m1.size(); ++i) {
    new_mean(i) = m1(i) * m1(i - 1) + m2(i);
  }
  return new_mean;
}

struct posterior_lpdf {
  const VectorXd y_;
  const VectorXd m_;
  posterior_lpdf(const VectorXd& y, const VectorXd& m) : y_(y), m_(m) { }

  template <typename T>
  T operator()(const Eigen::Matrix<T, Dynamic, 1>& theta) const {
    Eigen::Matrix<T, Dynamic, 1> new_mean(theta.size());
    new_mean = transform_mean(theta, m_);
    T lp = stan::math::normal_lpdf(y_, new_mean, 1.0);
    return lp;
  }
};

// [[Rcpp::export]]
VectorXd posterior_grad(const VectorXd& y,
                        const VectorXd& m,
                        const VectorXd& x) {
  posterior_lpdf f(y, m);

  double fx;
  VectorXd grad_fx;
  stan::math::gradient(f, x, fx, grad_fx);
  return grad_fx;
}

// [[Rcpp::export]]
MatrixXd posterior_hess(const VectorXd& y,
                        const VectorXd& m,
                        const VectorXd& x) {
  posterior_lpdf f(y, m);

  double fx;
  VectorXd grad_fx;
  MatrixXd H;
  stan::math::hessian(f, x, fx, grad_fx, H);

  return H;
}
R code
library(Rcpp)
lib_path <- .libPaths()[1]
includes <- paste0('-I\"', lib_path, "/",
                   c("Rcpp", "RcppEigen", "StanHeaders", "BH"),
                   '/include\"')
Sys.setenv("PKG_CXXFLAGS" = paste(includes, collapse = " "))

sourceCpp("transform_theta.cpp")

make_mean <- function(tt, mm) {
  n <- length(tt)
  out <- numeric(n)
  out[1] <- tt[1] + mm[1]
  out[-1] <- tail(tt, n - 1) * head(tt, n - 1) + mm[-1]
  return(out)
}

set.seed(1)
N <- 3
m <- seq_len(N)
theta <- runif(N, 0.95, 1.05)
true_mean <- make_mean(theta, m)
y <- rnorm(N, true_mean, 1)

theta_eval <- theta + rnorm(N, 0, 0.01)
eval_mean <- make_mean(theta_eval, m)

posterior_grad(y, m, theta_eval)

# Gradient from expression calculated by hand
(grad_true <- c((y[1] - eval_mean[1]) + 
                (y[2] - eval_mean[2]) * theta_eval[2],
                (y[2] - eval_mean[2]) * theta_eval[1] + 
                (y[3] - eval_mean[3]) * theta_eval[3],
                (y[3] - eval_mean[3]) * theta_eval[2]))

# Hessian (checks out as well)
posterior_hess(y, m, theta_eval)