Problem running RStan with simulated data in R

Hi all,
I want to check my Stan model with simulated data in R. To this end, I generated data in R as follows:

rm(list=ls())
set.seed(998)
simfun <- function(n,f,g, beta0, beta1, beta2, beta3,scale,shape){
  data <- list()
  m=31
  mi=20
  mis = rep(mi,m)
  id=rep(1:m,mis)
  n = length(id)
  data$status <- rep(NA,n)
  data$iid0=rtnorm(n=31, mean = 0, sd = 1, lower = 0)
  data$iid = rep(data$iid0,mis)
  c <- runif(n,f,g)
  data$U <- runif(n)
  data$x1 <- rbinom(n, size=1, prob=0.5)
  data$x2 <- rnorm(n,0,2)
  data$model <- RMgauss(var=4,scale=2)
  data$x=seq(1,31,1)
  data$sim.spatcov=RFsimulate(data$model , x=data$x)
  data$spatcov0=as.data.frame(data$sim.spatcov)
  data$spatcov=rep(data$spatcov0[,1],mis )
  data$constant=rep(1,n)
  data$eta <- (exp((-beta0 - beta1*data$x1 - beta2*data$x2 - beta3*data$spatcov)*shape))* (data$iid)
  data$y<-(((1 / data$U)-1)^(1/shape)) * (1 / (data$eta ^ (1/shape)))
  for (i in 1:n){
    if(data$y[i] <= c[i]) {data$status[i]=1} else {data$status[i]=0}
  }
  data$T <- data$status*data$y+(1-data$status)* c
  data<- data.frame(cbind(id,data$status, data$U, data$x1,data$x2,data$spatcov, data$constant,data$eta,data$y,c,data$T ))
  names(data)= c("id", "status","U", "x1","x2","spatcov","constant","eta","y","c","T")
  return (data)
}

After generating the data, I wrote the following codes to run the model in the RStan as follows:

kernel=read.csv("D:\\attachments\\kernel.1001grid.csv",sep=';',header=F)
kernel=as.matrix(kernel)
dim(kernel)
stanfunc<-function(stan_model1,data3,kernel, chains,warmup,iter01,thin,adapt_delta,max_treedepth,num,number,m,n_grid){
  Onicescu.model <- stan_model(file = stan_model1)
  data3=simfun(n=620,f=10,g=150,beta0=1, beta1=1, beta2=1,beta3=-0.50, scale=0.80,shape=0.90)
  county <- data3$id
  n = length(data3$id)
  status <- data3$status
  spatcov=data3$spatcov
  cr <- 1-(sum(status)/n)
  p=c()
  for (j in 1:m){
    p[j] <- sum(data3$status[data3$id==j]) / length(data3$status[data3$id==j])
  }
  spatcovmean=c()
  for (k in 1:m){
    spatcovmean[k] <- sum(data3$spatcov[data3$id==k]) / length(data3$spatcov[data3$id==k])
  }
  da<-list(data3=data3,number=number,kernel=kernel,p=p,n_grid=n_grid,num=num,m=m, county= county, n=n)
  fit <- sampling(object = Onicescu.model,data = da,iter = iter01,warmup=warmup, cores =8,chains = chains, thin = thin,control=list(adapt_delta=adapt_delta, max_treedepth=max_treedepth),open_progress = F,verbose = F)
  estimate <- as.data.frame(extract(fit, pars = c("beta","sigma"," r_county")))
  spdf1 <- extract(fit)$spdf
  estimate.mean <- apply(estimate,2,mean)
  spdf1.mean <- apply(spdf1,2,mean)
  return(list("fit"=fit,"p"=p,"cr"=cr,"spdf1.mean"=spdf1.mean,"estimate.mean"=estimate.mean,"spatcovmean"=spatcovmean))
}
###run the model with iterat=50 
iterat <- 50
estimate.mean1<- matrix(NA, ncol=36, nrow=iterat)
ID<-rep(NA,iterat)
p1 <- matrix(NA, ncol=31, nrow=iterat)
cr<- rep(NA,iterat)
spatcovmean1 <- matrix(NA, ncol=31, nrow=iterat)
spdf11 <- matrix(NA, ncol=31, nrow=iterat)
boots <- function(iterat,result){
  for (i in 1:iterat){
    print(i)
    result = stanfunc("sim.depen model.20nd oct.stan",data3=data3,kernel=kernel , chains=1, warmup=10000 , iter01=20000,thin=10,adapt_delta=0.99,max_treedepth=15,num=4,number=11,m=31,n_grid=1001)
    ID[i]=i
    p1[i,] <- result$p
    cr[i] <- result$cr
    spdf11 [i,] <- result$spdf1.mean
    spatcovmean1[i,] <- result$spatcovmean
    estimate.mean1[i,] <- result$estimate.mean
  }
  return(list("ID"=ID,"p1"=p1,"cr"=cr,"spdf11"=spdf11,"estimate.mean1"=estimate.mean1,"spatcovmean1"=spatcovmean1))
}
### running the model with iteration=50
result00 <- boots(iterat, result)

When I consider the iterations=50, my model runs only up to 6 iterations, and then I got the following error:

Chain 1: Rejecting initial value:
 Chain 1:   Log probability evaluates to log(0), i.e. negative infinity.
Chain 1:   Stan can't start sampling from this initial value.
Chain 1: Rejecting initial value:
Chain 1:   Log probability evaluates to log(0), i.e. negative infinity.
Chain 1:   Stan can't start sampling from this initial value.
Chain 1: 
 Chain 1: Initialization between (-2, 2) failed after 100 attempts. 
Chain 1:  Try specifying initial values, reducing ranges of constrained values, or reparameterizing the model.
[1] "Error in sampler$call_sampler(args_list[[i]]) : Initialization failed."                                                                                                   
[2] "In addition: Warning messages:"                                                                                                                                           
[3] "1: In readLines(file, warn = TRUE) :"                                                                                                                                     
[4] "  incomplete final line found on 'D:\\Eisa Nazar\\generate spatial data using spat covariate.right\\December\\sim.depen model with spat cov.normalprior.right.final.stan'"
[5] "2: In readLines(file, warn = TRUE) :"                                                                                                                                     
[6] "  incomplete final line found on 'D:\\Eisa Nazar\\generate spatial data using spat covariate.right\\December\\sim.depen model with spat cov.normalprior.right.final.stan'"
[7] "3: In readLines(file, warn = TRUE) :"                                                                                                                                     
[8] "  incomplete final line found on 'D:\\generate spatial data .right\\December\\sim.depen model.20nd oct.stan'"
[1] "error occurred during calling the sampler; sampling not done"
Stan model 'sim.depen model' does not contain samples.
Error in estimate.mean1[i, ] <- result$estimate.mean : 
 replacement has length zero

Can someone help me to solve this problem?

Thank you!

Here are my RStan codes (sim.depen model.20nd oct.stan):

functions{
  vector a (vector p,matrix kernel, vector exp_x){
    vector[num_elements(p)] a; 
    vector[num_elements(p)] landa;
    for (k in 1 : num_elements(p)) {
        landa[k] = kernel[k,] * exp_x * p[k];
         }
   a = landa / sum(landa); // assign `a`
  return a;
            }
// defines the log survival
vector log_S (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_S;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
log_S[i] = log(spdf[county[i]]) + log(1- ((scale [i]*(t[i])^shape)/ (1+(scale [i]*(t[i])^shape)))); // county is a vector that shows the location of patients with dim = num_elements(t) 
}
return log_S;
}
//defines the log pdf
vector log_f (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_f;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
log_f[i] = log(spdf[county[i]]) + log((scale[i]*shape*t[i]^(shape-1))/((1+(scale[i]*t[i]^shape))^2));
}
return log_f;
}
//defines the log-likelihood for right censored data
real surv_loglogistic_lpdf(vector t,vector d,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_lik;
real prob;
vector[num_elements(t)] ls_s;
vector[num_elements(t)] ls_f;
ls_s = log_S( t, p,  kernel,  exp_x, county,  shape, scale);
ls_f = log_f( t, p,  kernel,  exp_x, county,  shape, scale);
log_lik = d .* log_f(t, p,  kernel,  exp_x, county,  shape, scale)+ (1 - d) .* log_S(t, p,  kernel,  exp_x, county,  shape, scale);
prob = sum(log_lik);
return prob;
}
}
//data block
data{
  int <lower=0> m;
  int <lower=0> n_grid;
  int <lower=0> n;
  int <lower=0> num;
  int <lower=0> number;
  int <lower =0 , upper = m> county[n];
  vector <lower=0> [m] p;
  matrix [m,n_grid] kernel;
  matrix [n,number] data3;
}
//parameters block
parameters{
  vector [num] beta;
  real <lower=0> sigma;//scale parameter sigma=1/shape
  vector [n_grid] x;
  vector <lower=0> [m] r_county; 
}
// transformed parameters block
transformed parameters{
  vector [num] time_ratio;
vector[n] lambdaa;
real <lower=0> alpha = 1 / sigma;
  vector [n_grid] exp_x;
time_ratio  = exp(beta);
  exp_x = exp(x);
  lambdaa = exp (((-1 * data3[,4:7] * beta) / sigma)+log(r_county [county]));
}
// model block
model{
  target += normal_lpdf(x|0,5);
  target += normal_lpdf(beta | 0, 1000);
   target += gamma_lpdf(alpha | 0.001, 0.001);
  target += normal_lpdf(r_county| 0, 1);
 target += surv_loglogistic_lpdf (data3[,11] | data3[,2],p, kernel,  exp_x, county, alpha, lambdaa);
  //model for the data
}
// generated quantities block
generated quantities{
vector[n] log_lik;//log-likelihood
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
{ for(j in 1:n){
  
log_lik[j] = data3[j,2] * (log((exp (((-1 * data3[j,4:7] * beta) * alpha)+log(r_county [county[j]]))*alpha*data3[j,11]^(alpha-1)) / ((1 + (exp (((-1 * data3[j,4:7] * beta) * alpha) + log(r_county [county[j]]))*data3[j,11]^alpha))^2))) + (1 - data3[j,2]) * (log(1- ((exp (((-1 * data3[j,4:7] * beta) * alpha) + log(r_county [county[j]]))*(data3[j,11])^alpha) / (1+(exp (((-1 * data3[j,4:7] * beta) * alpha)+log(r_county [county[j]]))*(data3[j,11])^alpha)))));}
}
}

EDIT: @maxbiostat edited this post for syntax highlighting.

Dear Stan community,

Can anyone help me to solve this problem?

Warm regards,
Eisa

Do you have any suggestions to solve this problem and can you help me, please @nhuurre, @maxbiostat, and @andrjohns?

Warm regards,
Eisa

It is hard to debug such a complicated program. can you strip it down to the basic structure in order to make a simpler reproducible example?

Thank you so much for you reply @maxbiostat. Can you explain more for me, please?

The program you posted is too complicated to find out what is going wrong. Please try to simplify it as much as you can (removing parameters that vary by country, for example) so that it is easier to investigate what is going wrong.

The model isn’t that complicated if you ignore generated quantities.
My first guess is that the problem is

log(1- ((scale [i]*(t[i])^shape)/ (1+(scale [i]*(t[i])^shape))))

It’s more numerically stable (and readable) when you simplify the expression

  vector log_S(vector t, vector p, matrix kernel, vector exp_x,
               array[] int county, real shape, vector scale) {
    vector[num_elements(t)] log_S_;
    vector[num_elements(p)] spdf;
    spdf = a(p, kernel, exp_x);
    for (i in 1 : num_elements(t)) {
      // county is a vector that shows the location of patients with dim = num_elements(t)
      log_S_[i] = log(spdf[county[i]]) - log1p(scale[i] * (t[i]) ^ shape);
    }
    return log_S_;
  }

Also, make sure that t[i] > 0.

Thanks a ton for your reply and help @nhuurre and @maxbiostat. I changed my stan codes but again got the same error.

Thank you in advance!

here are my stan codes:

functions{
  // defines the log spatial pdf
  vector a (vector p,matrix kernel, vector exp_x){
    vector[num_elements(p)] a; 
    vector[num_elements(p)] landa;
    for (k in 1 : num_elements(p)) {
        landa[k] = kernel[k,] * exp_x * p[k];
         }
   a = landa / sum(landa); // assign `a`
  return a;
            }
// defines the log survival
//vector log_S (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
  vector log_S(vector t, vector p, matrix kernel, vector exp_x,int[] county, real shape, vector scale) {
vector[num_elements(t)] log_S;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
//log_S[i] = log(spdf[county[i]]) + log(1- ((scale [i]*(t[i])^shape)/ (1+(scale [i]*(t[i])^shape)))); // county is a vector that shows the location of patients with dim = num_elements(t) 
log_S[i] = log(spdf[county[i]]) - log(scale [i]*(t[i])^shape); // county is a vector that shows the location of patients with dim = num_elements(t) 
}
return log_S;
}
//defines the log pdf
vector log_f (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_f;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
//log_f[i] = log(spdf[county[i]]) + log((scale[i]*shape*t[i]^(shape-1))/((1+(scale[i]*t[i]^shape))^2));
log_f[i] = log(spdf[county[i]]) + log(scale[i]*shape*t[i]^(shape-1)) - 2* log1p((scale[i]*t[i]^shape));
}
return log_f;
}
//defines the log-likelihood for right censored data
real surv_loglogistic_lpdf(vector t,vector d,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_lik;
real prob;
vector[num_elements(t)] ls_s;
vector[num_elements(t)] ls_f;
ls_s = log_S( t, p,  kernel,  exp_x, county,  shape, scale);
ls_f = log_f( t, p,  kernel,  exp_x, county,  shape, scale);
log_lik = d .* log_f(t, p,  kernel,  exp_x, county,  shape, scale)+ (1 - d) .* log_S(t, p,  kernel,  exp_x, county,  shape, scale);
prob = sum(log_lik);
return prob;
}
}
//data block
data{
  int <lower=0> m;
  int <lower=0> n_grid;
  int <lower=0> n;
  int <lower=0> num;
  int <lower=0> number;
  int <lower =0 , upper = m> county[n];
  vector <lower=0> [m] p;
  matrix [m,n_grid] kernel;
  matrix [n,number] data3;
}
//parameters block
parameters{
  vector [num] beta;
  real <lower=0> sigma;//scale parameter sigma=1/shape
  vector [n_grid] x;
  vector <lower=0> [m] r_county; 
}
// transformed parameters block
transformed parameters{
  vector [num] time_ratio;
vector[n] lambdaa;
real <lower=0> alpha = 1 / sigma;
  vector [n_grid] exp_x;
time_ratio  = exp(beta);
  exp_x = exp(x);
  lambdaa = exp (((-1 * data3[,4:7] * beta) / sigma)+log(r_county [county]));
}
// model block
model{
  target += normal_lpdf(x|0,5);
  target += normal_lpdf(beta | 0, 1000);
  target += inv_gamma_lpdf(sigma | 0.001, 0.001);
  target += normal_lpdf(r_county| 0, 1);
 target += surv_loglogistic_lpdf (data3[,11] | data3[,2],p, kernel,  exp_x, county, alpha, lambdaa);
}

.```

That’s not correct. I wrote log1p(), it’s not the same as log().

Did you check that t[i] is always positive, nonzero number? Or maybe the problem is a? Have you tried init=0?
You could add

for (i in 1:size(spdf)) {
  if (is_nan(spdf[i]) || (spdf[i] <= 0.0)) {
    reject("spdf[",i,"] is undefined");
  }
}

and other variations of if (is_nan(...)) reject(...). That should give you a different error with more information.

1 Like