Problem running RStan with simulated data in R

Hi all,
I want to check my Stan model with simulated data in R. To this end, I generated data in R as follows:

rm(list=ls())
set.seed(998)
simfun <- function(n,f,g, beta0, beta1, beta2, beta3,scale,shape){
  data <- list()
  m=31
  mi=20
  mis = rep(mi,m)
  id=rep(1:m,mis)
  n = length(id)
  data$status <- rep(NA,n)
  data$iid0=rtnorm(n=31, mean = 0, sd = 1, lower = 0)
  data$iid = rep(data$iid0,mis)
  c <- runif(n,f,g)
  data$U <- runif(n)
  data$x1 <- rbinom(n, size=1, prob=0.5)
  data$x2 <- rnorm(n,0,2)
  data$model <- RMgauss(var=4,scale=2)
  data$x=seq(1,31,1)
  data$sim.spatcov=RFsimulate(data$model , x=data$x)
  data$spatcov0=as.data.frame(data$sim.spatcov)
  data$spatcov=rep(data$spatcov0[,1],mis )
  data$constant=rep(1,n)
  data$eta <- (exp((-beta0 - beta1*data$x1 - beta2*data$x2 - beta3*data$spatcov)*shape))* (data$iid)
  data$y<-(((1 / data$U)-1)^(1/shape)) * (1 / (data$eta ^ (1/shape)))
  for (i in 1:n){
    if(data$y[i] <= c[i]) {data$status[i]=1} else {data$status[i]=0}
  }
  data$T <- data$status*data$y+(1-data$status)* c
  data<- data.frame(cbind(id,data$status, data$U, data$x1,data$x2,data$spatcov, data$constant,data$eta,data$y,c,data$T ))
  names(data)= c("id", "status","U", "x1","x2","spatcov","constant","eta","y","c","T")
  return (data)
}

After generating the data, I wrote the following codes to run the model in the RStan as follows:

kernel=read.csv("D:\\attachments\\kernel.1001grid.csv",sep=';',header=F)
kernel=as.matrix(kernel)
dim(kernel)
stanfunc<-function(stan_model1,data3,kernel, chains,warmup,iter01,thin,adapt_delta,max_treedepth,num,number,m,n_grid){
  Onicescu.model <- stan_model(file = stan_model1)
  data3=simfun(n=620,f=10,g=150,beta0=1, beta1=1, beta2=1,beta3=-0.50, scale=0.80,shape=0.90)
  county <- data3$id
  n = length(data3$id)
  status <- data3$status
  spatcov=data3$spatcov
  cr <- 1-(sum(status)/n)
  p=c()
  for (j in 1:m){
    p[j] <- sum(data3$status[data3$id==j]) / length(data3$status[data3$id==j])
  }
  spatcovmean=c()
  for (k in 1:m){
    spatcovmean[k] <- sum(data3$spatcov[data3$id==k]) / length(data3$spatcov[data3$id==k])
  }
  da<-list(data3=data3,number=number,kernel=kernel,p=p,n_grid=n_grid,num=num,m=m, county= county, n=n)
  fit <- sampling(object = Onicescu.model,data = da,iter = iter01,warmup=warmup, cores =8,chains = chains, thin = thin,control=list(adapt_delta=adapt_delta, max_treedepth=max_treedepth),open_progress = F,verbose = F)
  estimate <- as.data.frame(extract(fit, pars = c("beta","sigma"," r_county")))
  spdf1 <- extract(fit)$spdf
  estimate.mean <- apply(estimate,2,mean)
  spdf1.mean <- apply(spdf1,2,mean)
  return(list("fit"=fit,"p"=p,"cr"=cr,"spdf1.mean"=spdf1.mean,"estimate.mean"=estimate.mean,"spatcovmean"=spatcovmean))
}
###run the model with iterat=50 
iterat <- 50
estimate.mean1<- matrix(NA, ncol=36, nrow=iterat)
ID<-rep(NA,iterat)
p1 <- matrix(NA, ncol=31, nrow=iterat)
cr<- rep(NA,iterat)
spatcovmean1 <- matrix(NA, ncol=31, nrow=iterat)
spdf11 <- matrix(NA, ncol=31, nrow=iterat)
boots <- function(iterat,result){
  for (i in 1:iterat){
    print(i)
    result = stanfunc("sim.depen model.20nd oct.stan",data3=data3,kernel=kernel , chains=1, warmup=10000 , iter01=20000,thin=10,adapt_delta=0.99,max_treedepth=15,num=4,number=11,m=31,n_grid=1001)
    ID[i]=i
    p1[i,] <- result$p
    cr[i] <- result$cr
    spdf11 [i,] <- result$spdf1.mean
    spatcovmean1[i,] <- result$spatcovmean
    estimate.mean1[i,] <- result$estimate.mean
  }
  return(list("ID"=ID,"p1"=p1,"cr"=cr,"spdf11"=spdf11,"estimate.mean1"=estimate.mean1,"spatcovmean1"=spatcovmean1))
}
### running the model with iteration=50
result00 <- boots(iterat, result)

When I consider the iterations=50, my model runs only up to 6 iterations, and then I got the following error:

Chain 1: Rejecting initial value:
 Chain 1:   Log probability evaluates to log(0), i.e. negative infinity.
Chain 1:   Stan can't start sampling from this initial value.
Chain 1: Rejecting initial value:
Chain 1:   Log probability evaluates to log(0), i.e. negative infinity.
Chain 1:   Stan can't start sampling from this initial value.
Chain 1: 
 Chain 1: Initialization between (-2, 2) failed after 100 attempts. 
Chain 1:  Try specifying initial values, reducing ranges of constrained values, or reparameterizing the model.
[1] "Error in sampler$call_sampler(args_list[[i]]) : Initialization failed."                                                                                                   
[2] "In addition: Warning messages:"                                                                                                                                           
[3] "1: In readLines(file, warn = TRUE) :"                                                                                                                                     
[4] "  incomplete final line found on 'D:\\Eisa Nazar\\generate spatial data using spat covariate.right\\December\\sim.depen model with spat cov.normalprior.right.final.stan'"
[5] "2: In readLines(file, warn = TRUE) :"                                                                                                                                     
[6] "  incomplete final line found on 'D:\\Eisa Nazar\\generate spatial data using spat covariate.right\\December\\sim.depen model with spat cov.normalprior.right.final.stan'"
[7] "3: In readLines(file, warn = TRUE) :"                                                                                                                                     
[8] "  incomplete final line found on 'D:\\generate spatial data .right\\December\\sim.depen model.20nd oct.stan'"
[1] "error occurred during calling the sampler; sampling not done"
Stan model 'sim.depen model' does not contain samples.
Error in estimate.mean1[i, ] <- result$estimate.mean : 
 replacement has length zero

Can someone help me to solve this problem?

Thank you!

Here are my RStan codes (sim.depen model.20nd oct.stan):

functions{
  vector a (vector p,matrix kernel, vector exp_x){
    vector[num_elements(p)] a; 
    vector[num_elements(p)] landa;
    for (k in 1 : num_elements(p)) {
        landa[k] = kernel[k,] * exp_x * p[k];
         }
   a = landa / sum(landa); // assign `a`
  return a;
            }
// defines the log survival
vector log_S (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_S;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
log_S[i] = log(spdf[county[i]]) + log(1- ((scale [i]*(t[i])^shape)/ (1+(scale [i]*(t[i])^shape)))); // county is a vector that shows the location of patients with dim = num_elements(t) 
}
return log_S;
}
//defines the log pdf
vector log_f (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_f;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
log_f[i] = log(spdf[county[i]]) + log((scale[i]*shape*t[i]^(shape-1))/((1+(scale[i]*t[i]^shape))^2));
}
return log_f;
}
//defines the log-likelihood for right censored data
real surv_loglogistic_lpdf(vector t,vector d,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_lik;
real prob;
vector[num_elements(t)] ls_s;
vector[num_elements(t)] ls_f;
ls_s = log_S( t, p,  kernel,  exp_x, county,  shape, scale);
ls_f = log_f( t, p,  kernel,  exp_x, county,  shape, scale);
log_lik = d .* log_f(t, p,  kernel,  exp_x, county,  shape, scale)+ (1 - d) .* log_S(t, p,  kernel,  exp_x, county,  shape, scale);
prob = sum(log_lik);
return prob;
}
}
//data block
data{
  int <lower=0> m;
  int <lower=0> n_grid;
  int <lower=0> n;
  int <lower=0> num;
  int <lower=0> number;
  int <lower =0 , upper = m> county[n];
  vector <lower=0> [m] p;
  matrix [m,n_grid] kernel;
  matrix [n,number] data3;
}
//parameters block
parameters{
  vector [num] beta;
  real <lower=0> sigma;//scale parameter sigma=1/shape
  vector [n_grid] x;
  vector <lower=0> [m] r_county; 
}
// transformed parameters block
transformed parameters{
  vector [num] time_ratio;
vector[n] lambdaa;
real <lower=0> alpha = 1 / sigma;
  vector [n_grid] exp_x;
time_ratio  = exp(beta);
  exp_x = exp(x);
  lambdaa = exp (((-1 * data3[,4:7] * beta) / sigma)+log(r_county [county]));
}
// model block
model{
  target += normal_lpdf(x|0,5);
  target += normal_lpdf(beta | 0, 1000);
   target += gamma_lpdf(alpha | 0.001, 0.001);
  target += normal_lpdf(r_county| 0, 1);
 target += surv_loglogistic_lpdf (data3[,11] | data3[,2],p, kernel,  exp_x, county, alpha, lambdaa);
  //model for the data
}
// generated quantities block
generated quantities{
vector[n] log_lik;//log-likelihood
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
{ for(j in 1:n){
  
log_lik[j] = data3[j,2] * (log((exp (((-1 * data3[j,4:7] * beta) * alpha)+log(r_county [county[j]]))*alpha*data3[j,11]^(alpha-1)) / ((1 + (exp (((-1 * data3[j,4:7] * beta) * alpha) + log(r_county [county[j]]))*data3[j,11]^alpha))^2))) + (1 - data3[j,2]) * (log(1- ((exp (((-1 * data3[j,4:7] * beta) * alpha) + log(r_county [county[j]]))*(data3[j,11])^alpha) / (1+(exp (((-1 * data3[j,4:7] * beta) * alpha)+log(r_county [county[j]]))*(data3[j,11])^alpha)))));}
}
}

EDIT: @maxbiostat edited this post for syntax highlighting.

Dear Stan community,

Can anyone help me to solve this problem?

Warm regards,
Eisa

Do you have any suggestions to solve this problem and can you help me, please @nhuurre, @maxbiostat, and @andrjohns?

Warm regards,
Eisa

It is hard to debug such a complicated program. can you strip it down to the basic structure in order to make a simpler reproducible example?

Thank you so much for you reply @maxbiostat. Can you explain more for me, please?

The program you posted is too complicated to find out what is going wrong. Please try to simplify it as much as you can (removing parameters that vary by country, for example) so that it is easier to investigate what is going wrong.

The model isn’t that complicated if you ignore generated quantities.
My first guess is that the problem is

log(1- ((scale [i]*(t[i])^shape)/ (1+(scale [i]*(t[i])^shape))))

It’s more numerically stable (and readable) when you simplify the expression

  vector log_S(vector t, vector p, matrix kernel, vector exp_x,
               array[] int county, real shape, vector scale) {
    vector[num_elements(t)] log_S_;
    vector[num_elements(p)] spdf;
    spdf = a(p, kernel, exp_x);
    for (i in 1 : num_elements(t)) {
      // county is a vector that shows the location of patients with dim = num_elements(t)
      log_S_[i] = log(spdf[county[i]]) - log1p(scale[i] * (t[i]) ^ shape);
    }
    return log_S_;
  }

Also, make sure that t[i] > 0.

Thanks a ton for your reply and help @nhuurre and @maxbiostat. I changed my stan codes but again got the same error.

Thank you in advance!

here are my stan codes:

functions{
  // defines the log spatial pdf
  vector a (vector p,matrix kernel, vector exp_x){
    vector[num_elements(p)] a; 
    vector[num_elements(p)] landa;
    for (k in 1 : num_elements(p)) {
        landa[k] = kernel[k,] * exp_x * p[k];
         }
   a = landa / sum(landa); // assign `a`
  return a;
            }
// defines the log survival
//vector log_S (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
  vector log_S(vector t, vector p, matrix kernel, vector exp_x,int[] county, real shape, vector scale) {
vector[num_elements(t)] log_S;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
//log_S[i] = log(spdf[county[i]]) + log(1- ((scale [i]*(t[i])^shape)/ (1+(scale [i]*(t[i])^shape)))); // county is a vector that shows the location of patients with dim = num_elements(t) 
log_S[i] = log(spdf[county[i]]) - log(scale [i]*(t[i])^shape); // county is a vector that shows the location of patients with dim = num_elements(t) 
}
return log_S;
}
//defines the log pdf
vector log_f (vector t,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_f;
vector[num_elements(p)] spdf;
spdf = a (p,kernel, exp_x);
for (i in 1:num_elements(t)){
//log_f[i] = log(spdf[county[i]]) + log((scale[i]*shape*t[i]^(shape-1))/((1+(scale[i]*t[i]^shape))^2));
log_f[i] = log(spdf[county[i]]) + log(scale[i]*shape*t[i]^(shape-1)) - 2* log1p((scale[i]*t[i]^shape));
}
return log_f;
}
//defines the log-likelihood for right censored data
real surv_loglogistic_lpdf(vector t,vector d,vector p, matrix kernel, vector exp_x, int[] county, real shape,vector scale){
vector[num_elements(t)] log_lik;
real prob;
vector[num_elements(t)] ls_s;
vector[num_elements(t)] ls_f;
ls_s = log_S( t, p,  kernel,  exp_x, county,  shape, scale);
ls_f = log_f( t, p,  kernel,  exp_x, county,  shape, scale);
log_lik = d .* log_f(t, p,  kernel,  exp_x, county,  shape, scale)+ (1 - d) .* log_S(t, p,  kernel,  exp_x, county,  shape, scale);
prob = sum(log_lik);
return prob;
}
}
//data block
data{
  int <lower=0> m;
  int <lower=0> n_grid;
  int <lower=0> n;
  int <lower=0> num;
  int <lower=0> number;
  int <lower =0 , upper = m> county[n];
  vector <lower=0> [m] p;
  matrix [m,n_grid] kernel;
  matrix [n,number] data3;
}
//parameters block
parameters{
  vector [num] beta;
  real <lower=0> sigma;//scale parameter sigma=1/shape
  vector [n_grid] x;
  vector <lower=0> [m] r_county; 
}
// transformed parameters block
transformed parameters{
  vector [num] time_ratio;
vector[n] lambdaa;
real <lower=0> alpha = 1 / sigma;
  vector [n_grid] exp_x;
time_ratio  = exp(beta);
  exp_x = exp(x);
  lambdaa = exp (((-1 * data3[,4:7] * beta) / sigma)+log(r_county [county]));
}
// model block
model{
  target += normal_lpdf(x|0,5);
  target += normal_lpdf(beta | 0, 1000);
  target += inv_gamma_lpdf(sigma | 0.001, 0.001);
  target += normal_lpdf(r_county| 0, 1);
 target += surv_loglogistic_lpdf (data3[,11] | data3[,2],p, kernel,  exp_x, county, alpha, lambdaa);
}

.```

That’s not correct. I wrote log1p(), it’s not the same as log().

Did you check that t[i] is always positive, nonzero number? Or maybe the problem is a? Have you tried init=0?
You could add

for (i in 1:size(spdf)) {
  if (is_nan(spdf[i]) || (spdf[i] <= 0.0)) {
    reject("spdf[",i,"] is undefined");
  }
}

and other variations of if (is_nan(...)) reject(...). That should give you a different error with more information.

1 Like

Huge thanks for your great help @nhuurre and sorry for the delay.
I edited my codes again according to your suggestion, but again encountered with the same error.
"
SAMPLING FOR MODEL ‘sim.depen model with spat cov.normalprior.right’ NOW (CHAIN 1).
Chain 1: Rejecting initial value:
Chain 1: Log probability evaluates to log(0), i.e. negative infinity.
Chain 1: Stan can’t start sampling from this initial value.
Chain 1: Rejecting initial value:
Chain 1: Log probability evaluates to log(0), i.e. negative infinity.
Chain 1: Stan can’t start sampling from this initial value.
Chain 1: Rejecting initial value:
Chain 1: Log probability evaluates to log(0), i.e. negative infinity.
Chain 1: Stan can’t start sampling from this initial value.
Chain 1: Rejecting initial value:
Chain 1: Log probability evaluates to log(0), i.e. negative infinity.
Chain 1: Stan can’t start sampling from this initial value.
Chain 1: Rejecting initial value:
Chain 1: Log probability evaluates to log(0), i.e. negative infinity.
Chain 1: Stan can’t start sampling from this initial value.
Chain 1:
Chain 1: Initialization between (-2, 2) failed after 100 attempts.
Chain 1: Try specifying initial values, reducing ranges of constrained values, or reparameterizing the model.
[1] “Error in sampler$call_sampler(args_list[[i]]) : Initialization failed.”
[1] “error occurred during calling the sampler; sampling not done”
Stan model ‘sim.depen model with spat cov.normalprior.right’ does not contain samples.
Error in estimate.mean1[i, ] ← result$estimate.mean :
replacement has length zero
"

Kind regards,
Eisa

You mean log1p() vs log(), right? Did you try init=0?

Try this, it should give a different error:

real surv_loglogistic_lpdf(vector t, vector d, vector p, matrix kernel,
                           vector exp_x, array[] int county, real shape,
                           vector scale) {
  vector[num_elements(t)] log_lik;
  real prob;
  vector[num_elements(t)] ls_s;
  vector[num_elements(t)] ls_f;
  for (i in 1 : num_elements(t)) {
    if (t[i] <= 0.0) {
      reject("t[", i, "] is ", t[i]);
    }
  }
  ls_s = log_S(t, p, kernel, exp_x, county, shape, scale);
  ls_f = log_f(t, p, kernel, exp_x, county, shape, scale);
  log_lik = d .* log_f(t, p, kernel, exp_x, county, shape, scale)
            + (1 - d) .* log_S(t, p, kernel, exp_x, county, shape, scale);
  for (i in 1 : num_elements(log_lik)) {
    if (is_inf(log_lik[i]) || is_nan(log_lik[i])) {
      reject("log_lik[", i, "] is ", log_lik[i]);
    }
  }
  prob = sum(log_lik);
  return prob;
}

Ok, thanks a ton for your assistance @nhuurre. I’ll edit my codes and inform you of the result.

Best wishes,
Eisa

Dear Niko,

I want to inform you that I changed my codes according to your suggestions, but after running all iterations I got the attached error and couldn’t output. thanks a lot for your support @nhuurre.

Regards,
Eisa

So initialization works now, no more “rejecting initial value”?

I don’t actually use RStan (or RStudio) so I can’t help with an R session crash. Maybe @maxbiostat has advice?

Yes, initialization works now. Thank you so much.

Dear Luiz,
I encountered the attached error after executing my codes for simulated data (iterations=50) and can’t get the output.

Do you have any advice to solve this problem @maxbiostat?

Thank you in advance!

What happens if you try to run the code outside of Rstudio? I don’t use Windows, so maybe @andrjohns can advise on the best way to run the code so as to unearth the true error.

I would guess that the session might be running out of RAM, given that you’re working with large data and the screenshot showed 15000 iterations. Try reducing the number of iterations and seeing if the problem persists.

Alternatively, try using cmdstanR instead, as it generally encounters fewer crashes with R

1 Like

Sorry for the delay. Thank you so much for your support @andrjohns. I run my simulation in R instead of RStudio, but I encountered the same above error again.

Regards,
Eisa

Have you tried reducing the number of iterations or using cmdstanr?