I am trying to fit the following regression model using data augmentation through which the outcome variable h[n] is generated based on 11-scale ordered categorical answers between 0 to 10. The truncated normal distribution transforms ordered categories into continuous values, ranging from lower boundaries L[N] to upper boundaries UP[N].
Stan code
data {
int N;
int<lower=0, upper=1> SF[N];
int<lower=0, upper=1> EJH[N];
int<lower=0, upper=1> EH[N];
int<lower=0, upper=1> EJC[N];
int<lower=0, upper=1> EG[N];
int<lower=0, upper=1> HH[N];
int<lower=0, upper=1> HHM[N];
int<lower=0, upper=1> HLM[N];
int<lower=0, upper=1> HL[N];
int<lower=0, upper=1> In[N];
int<lower=0, upper=1> I9[N];
int<lower=0, upper=1> I19[N];
int<lower=0, upper=1> I39[N];
int<lower=0, upper=1> I79[N];
int<lower=0, upper=1> I99[N];
int<lower=0, upper=1> I119[N];
int<lower=0, upper=1> I139[N];
int<lower=0, upper=1> I140[N];
int<lower=0, upper=1> PM[N];
int<lower=0, upper=1> PD[N];
int<lower=0, upper=1> PW[N];
int<lower=0, upper=1> PS[N];
int<lower=0, upper=1> PC[N];
int<lower=0, upper=1> C[N];
int<lower=0, upper=1> U[N];
real A[N];
real L[N];
real UP[N];
}
parameters {
real b;
real bf;
real bejh;
real beh;
real bejc;
real beg;
real bhh;
real bhhm;
real bhlm;
real bhl;
real bin;
real bi9;
real bi19;
real bi39;
real bi79;
real bi99;
real bi119;
real bi139;
real bi140;
real bpm;
real bpd;
real bpw;
real bps;
real bpc;
real bc;
real bu;
real ba;
real a0;
real<lower=L[N],upper=UP[N]> h[N];
real<lower=0> sigma;
}
transformed parameters {
real mu[N];
for (n in 1:N)
mu[n] = b + bf*SF[n] + bejh*EJH[n] + beh*EH[n] + bejc*EJC[n] + beg*EG[n] +
bhh*HH[n] + bhhm*HHM[n] + bhlm*HLM[n] + bhl*HL[n] + bin*In[n] + bi9*I9[n] +
bi19*I19[n] + bi39*I39[n] + bi79*I79[n] + bi99*I99[n] + bi119*I119[n] +
bi139*I139[n] + bi140*I140[n] + bpm*PM[n] + bpd*PD[n] + bpw*PW[n] +
bps*PS[n] + bpc*PC[n] + bc*C[n] + bu*U[n] + ba*(A[n]-a0)^2;
}
model {
for (n in 1:N){
h[n] ~ normal(0,1) T[L[n],UP[n]]; //data augmentation
h[n] ~ normal(mu[n], sigma);
}
}
generated quantities {
real h_pred[N];
for (n in 1:N)
h_pred[n] = normal_rng(mu[n], sigma);
}
R code
setwd("~/docdis")
library(rstan)
d <- read.csv(file='output/ch4/lall3.csv', header = TRUE)
d <- subset(d, subset = year == 2011)
d$age_std <-(d$age-mean(d$age))/sd(d$age)
#preparation for data augmentation
freq <- as.vector(table(d$hap))
den <- freq/sum(freq)
den2 <- 0
for(i in 1:length(den)){
den2[i+1] <- den2[i]+den[i]
}
den2
ap <- 0
for(i in 1:length(den2)){
ap[i] <- qnorm(den2[i], 0, 1)
}
ap
xx <- rep(-1:10) # 11-scale-category
trns <- as.data.frame(t(rbind(xx,ap)))
trns$xx2 <- xx+1
trns$ap2 <- trns$ap
colnames(trns) <- c("Uhap", "Uhap_q", "Lhap", "Lhap_q")
trns
d <- merge(d, trns[,1:2], by.x = "hap", by.y = "Uhap")
d <- merge(d, trns[,3:4], by.x = "hap", by.y = "Lhap")
#list for mcmc
N <- nrow(d)
data <- list(N=N,SF=d$sex_f,EJH=d$edu_Jh,EH=d$edu_H,EJC=d$edu_Jc,
EG=d$edu_G,HH=d$hel_h,HHM=d$hel_hm,HLM=d$hel_lm,HL=d$hel_l, In=d$inc_none,
I9=d$inc_9.9,I19=d$inc_19.9,I39=d$inc_39.9,I79=d$inc_79.9,
I99=d$inc_99.9,I119=d$inc_119.9,I139=d$inc_139.9,
I140=d$inc_140.0, PM=d$par_M,PD=d$par_D,
PW=d$par_W,PS=d$par_S,PC=d$par_C,C=d$chi,U=d$uem,
A=d$age_std, L=d$Lhap_q, UP=d$Uhap_q)
set.seed(243)
#stan
stanmodel <- stan_model(file='model/ch4/model_ind_mreghap_daug.stan')
fit <- sampling(
stanmodel,
data = data,
seed =243,
chains=4, iter=500, warmup=100, thin=1,
)
After compiling I got the following message
In file included from C:/Users/yohei/Documents/R/win-library/3.4/BH/include/boost/config.hpp:39:0,
from C:/Users/yohei/Documents/R/win-library/3.4/BH/include/boost/math/tools/config.hpp:13,
from C:/Users/yohei/Documents/R/win-library/3.4/StanHeaders/include/stan/math/rev/core/var.hpp:7,
from C:/Users/yohei/Documents/R/win-library/3.4/StanHeaders/include/stan/math/rev/core/gevv_vvv_vari.hpp:5,
from C:/Users/yohei/Documents/R/win-library/3.4/StanHeaders/include/stan/math/rev/core.hpp:12,
from C:/Users/yohei/Documents/R/win-library/3.4/StanHeaders/include/stan/math/rev/mat.hpp:4,
from C:/Users/yohei/Documents/R/win-library/3.4/StanHeaders/include/stan/math.hpp:4,
from C:/Users/yohei/Documents/R/win-library/3.4/StanHeaders/include/src/stan/model/model_header.hpp:4,
from filee08cdf6e52.cpp:8:
C:/Users/yohei/Documents/R/win-library/3.4/BH/include/boost/config/compiler/gcc.hpp:186:0: warning: “BOOST_NO_CXX11_RVALUE_REFERENCES” redefineddefine BOOST_NO_CXX11_RVALUE_REFERENCES
^
:0:0: note: this is the location of the previous definition
Then, sampling failed with the repeated messages:
SAMPLING FOR MODEL ‘model_ind_mreghap_daug’ NOW (CHAIN 1).
Rejecting initial value:
Log probability evaluates to log(0), i.e. negative infinity.
Stan can’t start sampling from this initial value.
Rejecting initial value:
Log probability evaluates to log(0), i.e. negative infinity.
Stan can’t start sampling from this initial value.
Rejecting initial value:
Log probability evaluates to log(0), i.e. negative infinity.
Stan can’t start sampling from this initial value.
…(repeated messages)…
Rejecting initial value:
Log probability evaluates to log(0), i.e. negative infinity.
Stan can’t start sampling from this initial value.Initialization between (-2, 2) failed after 100 attempts.
Try specifying initial values, reducing ranges of constrained values, or reparameterizing the model.
[1] “Error in sampler$call_sampler(args_list[[i]]) : Initialization failed.”
[1] “error occurred during calling the sampler; sampling not done”
After this, I set a vector of initial values by generating random values from a truncated normal distribution:
R code
library(msm)
h_ini <- as.vector(rtnorm(N,mean=0,sd=1, lower=d$Lhap_q, upper=d$Uhap_q))
stanmodel <- stan_model(file='model/ch4/model_ind_mreghap_daug.stan')
fit <- sampling(
stanmodel,
data = data,
seed =243,
chains=4, iter=500, warmup=100, thin=1#,
init=function(){ #setting of initial values
list(h=h_ini)
}
)
But, I got another different error message about initial values:
SAMPLING FOR MODEL ‘model_ind_mreghap_daug’ NOW (CHAIN 1).
Error transforming variable h: lub_free: Bounded variable is -2.4652, but must be in the interval [1.24518, inf]
[1] “Error in sampler$call_sampler(args_list[[i]]) : "
[2] " Error transforming variable h: lub_free: Bounded variable is -2.4652, but must be in the interval [1.24518, inf]”
[1] “error occurred during calling the sampler; sampling not done”
Could someone please advise me about how to fix this code?
Unfortunately, I am not permitted to distribute the data set for this analysis.
I would appreciate it if you would respond to me.
> sessionInfo()
R version 3.4.1 (2017-06-30)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=Japanese_Japan.932 LC_CTYPE=Japanese_Japan.932
[3] LC_MONETARY=Japanese_Japan.932 LC_NUMERIC=C
[5] LC_TIME=Japanese_Japan.932
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] msm_1.6.4 rstan_2.16.2 StanHeaders_2.16.0-1
[4] ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.11 compiler_3.4.1 plyr_1.8.4 bindr_0.1
[5] tools_3.4.1 tibble_1.3.3 gtable_0.2.0 lattice_0.20-35
[9] pkgconfig_2.0.1 rlang_0.1.1 Matrix_1.2-10 parallel_3.4.1
[13] mvtnorm_1.0-6 expm_0.999-2 loo_1.1.0 bindrcpp_0.2
[17] gridExtra_2.2.1 coda_0.19-1 dplyr_0.7.1 stats4_3.4.1
[21] grid_3.4.1 inline_0.3.14 glue_1.1.1 R6_2.2.2
[25] rethinking_1.59 survival_2.41-3 tidyr_0.6.3 magrittr_1.5
[29] scales_0.4.1 codetools_0.2-15 matrixStats_0.52.2 MASS_7.3-47
[33] splines_3.4.1 assertthat_0.2.0 colorspace_1.3-2 lazyeval_0.2.0
[37] munsell_0.4.3