I have a model for a multinomial distribution that fails due to one of the gradient’s returning NaN. The interesting thing is that the likelihood function itself works, it is only the gradient that fails, which suggests a problem with auto-differentiation to me. We have also tried various re-parameterisations but this did not help.
functions {
real pinternal1(real a, real b, real g1) {
return exp(-exp(-g1 + a) + b);
}
real pinternal2(real a, real b, real g1, real g2) {
return exp(-exp(-g1 + a) - exp(-g2 + a) + b);
}
real pfun(real l, real u, real g1, real g2) {
real denominator;
denominator = exp(g1) + exp(g2);
return (pinternal1(l, g1, g1) - pinternal1(u, g1, g1) + pinternal1(l, g2, g1) - pinternal1(u, g2, g1) + pinternal2(u, g2, g1, g2) - pinternal2(l, g2, g1, g2)) / denominator;
}
real gumbelmult(array[] int y, real mu,
real crc, real crl, real crh) {
vector[8] p_tpres;
vector[3] thres;
// calculate thresholds
thres[1] = crc - (exp(crl));
thres[2] = crc;
thres[3] = crc + (exp(crh));
p_tpres[1] = pfun(negative_infinity(), thres[1], 0, mu);
p_tpres[2] = pfun(thres[1], thres[2], 0, mu);
p_tpres[3] = pfun(thres[2], thres[3], 0, mu);
p_tpres[4] = pfun(thres[3], positive_infinity(), 0, mu);
p_tpres[5] = pfun(negative_infinity(), thres[1], mu, 0);
p_tpres[6] = pfun(thres[1], thres[2], mu, 0);
p_tpres[7] = pfun(thres[2], thres[3], mu, 0);
p_tpres[8] = pfun(thres[3], positive_infinity(), mu, 0);
return multinomial_lpmf(y | p_tpres);
}
}
data {
array[8] int dvec;
}
parameters {
real<lower=0> g;
real crc;
real crl;
real crh;
}
transformed parameters {
real lprior = 0;
lprior += student_t_lpdf(g | 3, 1, 2);
lprior += normal_lpdf(crc | 0, 0.5);
lprior += normal_lpdf(crl | -0.5, 0.5);
lprior += normal_lpdf(crh | -0.5, 0.5);
}
model {
target += gumbelmult(dvec, g, crc, crl, crh);
target += lprior;
}
When tryign this in rstan, it fails:
library("rstan")
dat <- c(347L, 622L, 529L, 186L, 409L, 774L, 1048L, 885L)
fit1 <- stan(model_code = multinom_mod, iter = 10, verbose = FALSE,
data = list(dvec = dat), init = 0)
Error:
SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
Chain 1: Rejecting initial value:
Chain 1: Gradient evaluated at the initial value is not finite.
Chain 1: Stan can't start sampling from this initial value.
[1] "Error : Initialization failed."
error occurred during calling the sampler; sampling not done
The interesting thing is that the likelihood function itself works outside of Stan and can be used to fit the data:
expose_stan_functions(fit1)
newll <- function(x) -log_prob(fit1, x)
(res <- optim(c(0, -0.5,0.5,-0.5), newll))
# $par
# [1] -0.4753573 0.4441750 -0.2198889 -0.3659075
#
# $value
# [1] 32.23814
When looking at the gradient directly, it seems to be NaN for the first parameter:
grad_log_prob(fit1, res$par)
# [1] NaN 0.526258994 0.008817364 0.211254299
# attr(,"log_prob")
# [1] -32.23814