For RStan, there was a Discourse thread about the now-released vignette. It is not implemented in PyStan yet, but PyStan usually follows RStan. For CmdStan the process is a bit different, but it is documented in the CmdStan guide.
In Andrew’s case, the Stan program was
functions {
  real log_kernel(real lambda, real b_linear, vector b_logit,
                  vector lambda_logit, vector lambda_kernel, vector b_kernel,
                  vector y, real mu) {
    return lambda * b_linear + dot_product(lambda_logit, b_logit) +
           dot_product(lambda_kernel, b_kernel) +
           lambda * cauchy_lpdf(y | mu, 1) + (1 - lambda) * normal_lpdf(mu | 0, 10);
  }
  real[] log_kernel_gradient(real lambda, real b_linear, vector b_logit,
                             vector lambda_logit, vector lambda_kernel,
                             vector b_kernel, vector y, real mu); // undefined now
}
data {
  int N;
  vector[N] y;
  real a_lower;
  real a_upper;
  real b_linear;
  int K_logit;
  vector[K_logit] mu_logit;
  vector[K_logit] sigma_logit;
  vector[K_logit] b_logit;
  int K_kernel;
  vector[K_kernel] mu_kernel;
  vector[K_kernel] sigma_kernel;
  vector[K_kernel] b_kernel;
}
parameters {
  real mu;
  real<lower=0,upper=2> a;
}
transformed parameters {
  real a_reflected;
  real a_scaled;
  real lambda;
  vector[K_logit] lambda_logit;
  vector[K_kernel] lambda_kernel;
  a_reflected = 1 - fabs(1 - a);
  a_scaled = (a_reflected - a_lower) / (a_upper - a_lower);
  if (a_scaled <= 0)
    lambda = 0;
  else if (a_scaled < 1)
    lambda = 0.5 + 0.25 * (3 * (2 * a_scaled - 1) - (2 * a_scaled - 1)^3);
  else
    lambda = 1;
  lambda_logit = 1 ./ (1 + exp(-(lambda - mu_logit) ./ sigma_logit));
  lambda_kernel = exp(-0.5 * (lambda - mu_kernel) .* (lambda - mu_kernel)
                      ./ (sigma_kernel .* sigma_kernel));
}
model {
  target += log_kernel(lambda, b_linear, b_logit,
                       lambda_logit, lambda_kernel, b_kernel,
                       y, mu);
}
generated quantities {
  real grad_lambda[1] =
    log_kernel_gradient(lambda, b_linear, b_logit,
                        lambda_logit, lambda_kernel, b_kernel,
                        y, mu);
}
and the log_kernel_gradient.hpp file that calculated the gradient was
template <typename T0__, typename T1__, typename T2__, typename T3__,
          typename T4__, typename T5__, typename T6__, typename T7__>
std::vector<typename boost::math::tools::promote_args<T0__, T1__, T2__, T3__,
            typename boost::math::tools::promote_args<T4__, T5__, T6__,
                                                      T7__>::type>::type>
log_kernel_gradient(const T0__& lambda,
                    const T1__& b_linear,
                    const Eigen::Matrix<T2__, Eigen::Dynamic, 1>& b_logit,
                    const Eigen::Matrix<T3__, Eigen::Dynamic, 1>& lambda_logit,
                    const Eigen::Matrix<T4__, Eigen::Dynamic, 1>& lambda_kernel,
                    const Eigen::Matrix<T5__, Eigen::Dynamic, 1>& b_kernel,
                    const Eigen::Matrix<T6__, Eigen::Dynamic, 1>& y,
                    const T7__& mu, std::ostream* pstream__) {
  auto lambda_v = to_var(lambda);  // auto foo_v = to_var(foo) for anything unknown
  var lp = log_kernel(lambda_v, b_linear, b_logit,  // using lambda_v, not lambda
                      lambda_logit, lambda_kernel, b_kernel,
                      y, mu, 0);                    // need extra 0 at the end
  std::vector<var> theta;
  theta.push_back(lambda_v);  // call theta.push_back(foo_v) for anything unknown
  std::vector<double> g;
  lp.grad(theta, g);
  return g;
}
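The same to_var / push_back / grad pattern can be tried in a standalone program, which may make it easier to see what the header is doing. This is only a minimal sketch under a few assumptions: the Stan Math library and its dependencies are on the include path, and the one-parameter log kernel normal_lpdf(2.0 | mu, 10) is made up purely for illustration (it is not Andrew's model).

#include <stan/math.hpp>
#include <iostream>
#include <vector>

int main() {
  using stan::math::var;
  using stan::math::to_var;
  var mu_v = to_var(1.5);    // to_var(foo) for anything unknown
  // made-up one-parameter log kernel, standing in for log_kernel() above
  var lp = stan::math::normal_lpdf(2.0, mu_v, 10);
  std::vector<var> theta;
  theta.push_back(mu_v);     // theta.push_back(foo_v) for anything unknown
  std::vector<double> g;
  lp.grad(theta, g);         // g[0] = d lp / d mu, evaluated at mu = 1.5
  std::cout << "gradient: " << g[0] << std::endl;
  return 0;
}

In the standalone program the stan::math:: qualifications (or using declarations) are needed; the header above can use var and to_var unqualified, presumably because it is compiled inside the generated model code where those names are already in scope.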
Calculating Jacobians works much the same way and is discussed in section 2.4 of the Stan Math paper.
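For a concrete picture, here is a minimal sketch of the idea (not the paper's listing): a made-up function f(x) = (x1 * x2, exp(x1)) differentiated by the same reverse-mode machinery, with one backward pass per output.

#include <stan/math.hpp>
#include <iostream>

int main() {
  using stan::math::var;
  using Eigen::Dynamic;
  using Eigen::Matrix;

  // toy function f : R^2 -> R^2, f(x) = (x1 * x2, exp(x1)); purely illustrative
  Matrix<var, Dynamic, 1> x(2);
  x << 1.0, 2.0;
  Matrix<var, Dynamic, 1> fx(2);
  fx << x(0) * x(1), exp(x(0));

  Matrix<double, Dynamic, Dynamic> J(fx.size(), x.size());
  for (int i = 0; i < fx.size(); ++i) {
    if (i > 0)
      stan::math::set_zero_all_adjoints();  // clear adjoints left by the previous row
    fx(i).grad();                           // one reverse pass per output
    for (int j = 0; j < x.size(); ++j)
      J(i, j) = x(j).adj();                 // J(i, j) = d f_i / d x_j
  }
  std::cout << "J =\n" << J << std::endl;
  return 0;
}

Stan Math also provides a jacobian() functional that wraps essentially this loop, but the hand-rolled version makes it clear where the adjoints come from.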