Hello All,
I’m using a Newton’s solver in Stan for my nonlinear drug interaction index model. The details of the model can be found in my earlier post: Precise but wrong results for a drug interaction model.
In essence, I need to solve for z in the following target equation,
\frac{x_r[1]}{\exp \left(\frac{z - beta1[1]}{beta1[2]} \right)} + \frac{x_r[2]}{\exp \left(\frac{z - beta2[1]}{beta2[2]} \right)} \\ = \exp(gamma[1] + gamma[2]*\log(x_r[1]) + gamma[3]*\log(x_r[2]) \\ + gamma[4]*\log(x_r[1])*\log(x_r[2]) + gamma[5]*\log(x_r[1])^2 \\+ gamma[6]*\log(x_r[2])^2,
where x_r[1] and x_r[2] are data, vectors of beta1 and beta2 are samples from their posteriors obtained in stage one model (see the attached .stan file), and gamma's are samples from their prior distributions. The associate functions to the Newton’s solver are below,
functions {
real f(real z, vector beta1, vector beta2, vector gamma, real[] x_r){
return x_r[1]/(exp((z-beta1[1])/beta1[2])) + x_r[2]/(exp((z-beta2[1])/beta2[2])) -
exp(gamma[1] + gamma[2]*log(x_r[1]) + gamma[3]*log(x_r[2]) +
gamma[4]*log(x_r[1])*log(x_r[2]) + gamma[5]*log(x_r[1])^2 +
gamma[6]*log(x_r[2])^2);
}
real Df(real z, vector beta1, vector beta2, real[] x_r){
return -x_r[2]*exp((beta2[1]-z)/beta2[2])/beta2[2] - x_r[1]*exp((beta1[1]-
z)/beta1[2])/beta1[2];
}
real newton_method(real x_0, vector beta1, vector beta2, vector gamma, real[] x_r){
real x_1 = x_0 - f(x_0, beta1, beta2, gamma, x_r) / Df(x_0, beta1, beta2, x_r);
real Dfx = Df(x_1, beta1, beta2, x_r);
real x_2 = x_1 - f(x_1, beta1, beta2, gamma, x_r) / Dfx;
int iter = 0;
while (fabs(x_2 - x_1) > 1e-8){
iter += 1;
if (iter > 150){
if (fabs(x_2 - x_1) < 1e-6){
break;
} else{
reject("Newton root solver failed to converge ", fabs(x_2 - x_1));
}
}
x_1 = x_2;
Dfx = Df(x_1, beta1, beta2, x_r);
x_2 = x_1 - f(x_1, beta1, beta2, gamma, x_r) / Dfx;
}
return x_2;
}
f() is the target equation, Df() is its derivative, and newton_method() is the Newton’s method.
I’ve been getting around 1% divergent transitions out of the post-warmup draws, and the posterior results are close to the truth. I have run a couple of tests to determine which parameters are the cause of the divergence and came to the conclusion that it is the gamma vector. I have two suspicions that:
- When Stan samples from the prior distributions of gamma, some combinations of gamma's lead to no solution to the target equation. (I hope this suspicion is not true, because the priors assigned to gamma's are already very tight!);
- My Newton’s solver is not robust enough to solve z under some relatively extreme scenarios.
I’m under the impression that there is some momentum in building the Newton’s solver in Stan within the Devs, so I wonder if anyone can give me some pointers to why/how to fine tune my Newton’s solver and get rid of the divergent transitions?
The model code looks like the following, and I also attached the files that can be used to reproduce the example.
functions {
real f(real z, vector beta1, vector beta2, vector gamma, real[] x_r){
return x_r[1]/(exp((z-beta1[1])/beta1[2])) + x_r[2]/(exp((z-beta2[1])/beta2[2])) -
exp(gamma[1] + gamma[2]*log(x_r[1]) + gamma[3]*log(x_r[2]) + gamma[4]*log(x_r[1])*log(x_r[2]) + gamma[5]*log(x_r[1])^2 + gamma[6]*log(x_r[2])^2);
}
real Df(real z, vector beta1, vector beta2, real[] x_r){
return -x_r[2]*exp((beta2[1]-z)/beta2[2])/beta2[2] - x_r[1]*exp((beta1[1]-z)/beta1[2])/beta1[2];
}
real newton_method(real x_0, vector beta1, vector beta2, vector gamma, real[] x_r){
real x_1 = x_0 - f(x_0, beta1, beta2, gamma, x_r) / Df(x_0, beta1, beta2, x_r);
real Dfx = Df(x_1, beta1, beta2, x_r);
real x_2 = x_1 - f(x_1, beta1, beta2, gamma, x_r) / Dfx;
int iter = 0;
while (fabs(x_2 - x_1) > 1e-8){
iter += 1;
if (iter > 150){
if (fabs(x_2 - x_1) < 1e-6){
break;
} else{
reject("Newton root solver failed to converge ", fabs(x_2 - x_1));
}
}
x_1 = x_2;
Dfx = Df(x_1, beta1, beta2, x_r);
x_2 = x_1 - f(x_1, beta1, beta2, gamma, x_r) / Dfx;
}
return x_2;
}
real scaled_logit_normal_lpdf(real y, real mu, real sigma, real ymax){
real logit_ymax = logit(y/ymax);
return normal_lpdf(logit_ymax | mu, sigma) + log(ymax) - log(y) - log(ymax-y);
}
}
data {
int n1;
int n2;
real d1[n1];
real d2[n2];
real y1[n1];
real y2[n2];
int N;
real x_r[N, 2];
real y[N];
real zinit;
vector[2] bMu[2];
cov_matrix[2] bSig[2];
int<lower=0,upper=1> surv[N];
}
transformed data{
real logd1[n1];
real logd2[n2];
real yobsmax1 = max(y1);
real yobsmax2 = max(y2);
logd1 = log(d1);
logd2 = log(d2);
}
parameters {
vector[2] beta1;
vector[2] beta2;
real<lower=yobsmax1> ymax1;
real<lower=yobsmax2> ymax2;
vector[6] gamma;
real<lower=0> std1;
real<lower=0> std2;
real<lower=0> sigma;
vector[2] alpha;
}
transformed parameters{
vector[3] theta1;
vector[3] theta2;
theta1[1] = -beta1[1]/beta1[2];
theta1[2] = beta1[2];
theta1[3] = ymax1;
theta2[1] = -beta2[1]/beta2[2];
theta2[2] = beta2[2];
theta2[3] = ymax2;
}
model{
// stage one
real mu1[n1];
real mu2[n2];
// stage two
real z_hat[N];
ymax1 ~ normal(yobsmax1, 10);
ymax2 ~ normal(yobsmax2, 10);
beta1 ~ multi_normal(bMu[1], bSig[1]);
beta2 ~ multi_normal(bMu[2], bSig[2]);
alpha ~ normal(0, 10);
gamma ~ normal(0, 0.6);
//stage one:
for(i in 1:n1){
mu1[i] = beta1[1] + beta1[2]*logd1[i];
y1[i] ~ scaled_logit_normal_lpdf(mu1[i], std1, ymax1);
}
for(i in 1:n2){
mu2[i] = beta2[1] + beta2[2]*logd2[i];
y2[i] ~ scaled_logit_normal_lpdf(mu2[i], std2, ymax2);
}
//stage two:
for(i in 1:N){
z_hat[i] = newton_method(zinit, beta1, beta2, gamma, x_r[i]);
y[i] ~ scaled_logit_normal_lpdf(z_hat[i], sigma, ymax1);<a class="attachment" href="//cdck-file-uploads-canada1.s3.dualstack.ca-central-1.amazonaws.com/flex030/uploads/mc_stan/original/2X/f/f923d3711b4171ced6d040f96f185af3724d5c53.stan">LaModel2.0.stan</a> (2.9 KB)
<a class="attachment" href="//cdck-file-uploads-canada1.s3.dualstack.ca-central-1.amazonaws.com/flex030/uploads/mc_stan/original/2X/f/f923d3711b4171ced6d040f96f185af3724d5c53.stan">LaModel2.0.stan</a> (2.9 KB)
<a class="attachment" href="//cdck-file-uploads-canada1.s3.dualstack.ca-central-1.amazonaws.com/flex030/uploads/mc_stan/original/2X/6/6b9bc37e60bf1c8350c580532fca1049ea1cbb70.R">La_func.R</a> (3.1 KB)
surv[i] ~ bernoulli_logit(alpha[1] + alpha[2]*y[i]);
}
}
Any help will be very much appreciated! Thanks in advance.
LaModelCall_NS.R (1.2 KB)
LaModelFunc_NS.R (6.0 KB)
LaModel2.0.stan (2.9 KB)