Nice to meet you, everyone.
I’m new to Stan.
I am challenging the following cross-classified random effects model.
Level1(Within-Cell Model)
Y_{ijk} = \beta_{0jk}+\beta_{1}AGE_{ijk}+\displaystyle \Sigma_{m=2}^{n}\beta_{m}X_{mijk}+e_{ijk}, e_{ijk} \sim N(0,\sigma^{2})
Level2(Between-Cell Model)
\beta_{0jk}=\gamma_{0}+u_{0j}+v_{0k}, u_{0j} \sim N(0,\tau_{u}), v_{0k} \sim N(0,\tau_{v})
The Stan code is as follows.
data {
int<lower=0> N; // length of y; data size
int<lower=0> J; //number of groups
int<lower=0> K; //number of cohort group
vector[N] indi; // response variable
vector[N] edum; // explanatory variable; centralized
vector[N] age; // explanatory variable; centralized
vector[N] income; // explanatory variable; centralized
vector[N] sex; // explanatory variable; centralized
vector[N] work2; // explanatory variable; centralized
vector[N] work3; // explanatory variable; centralized
vector[N] work4; // explanatory variable; centralized
vector[N] marry; // explanatory variable; centralized
int<lower=1, upper=J> year[N]; // indeces
int<lower=1, upper=K> cohort[N]; // indeces
real sx1; //Standard deviation of the explanatory variable
real sx2; //Standard deviation of the explanatory variable
real sx3; //Standard deviation of the explanatory variable
real sx4; //Standard deviation of the explanatory variable
real sx5; //Standard deviation of the explanatory variable
real sx6; //Standard deviation of the explanatory variable
real sx7; //Standard deviation of the explanatory variable
real sx8; //Standard deviation of the explanatory variable
real sy; //Standard deviation of the dependent variable
real my; //Mean value of the dependent variable
int indi_nMiss; // N missing from variables; excluded have no missings
int edum_nMiss;
int marry_nMiss;
int income_nMiss;
int work2_nMiss;
int work3_nMiss;
int work4_nMiss;
int indi_index[indi_nMiss]; // indeces for missing values
int edum_index[edum_nMiss];
int marry_index[marry_nMiss];
int income_index[income_nMiss];
int work2_index[work2_nMiss];
int work3_index[work3_nMiss];
int work4_index[work4_nMiss];
}
parameters {
vector[J] u0j;
vector[K] v0k;
real beta1;
real beta2;
real beta3;
real beta4;
real beta5;
real beta6;
real beta7;
real beta8;
real alpha;
real<lower=0> sigma; //residualSD
real<lower=0> u0j_tau_u; //standard deviation of the random effects
real<lower=0> v0k_tau_v; //standard deviation of the random effects
vector<lower=0>[indi_nMiss] indi_imp; // missing values
vector<lower=0>[edum_nMiss] edum_imp;
vector<lower=0>[marry_nMiss] marry_imp;
vector<lower=0>[income_nMiss] income_imp;
vector<lower=0>[work2_nMiss] work2_imp;
vector<lower=0>[work3_nMiss] work3_imp;
vector<lower=0>[work4_nMiss] work4_imp;
cholesky_factor_cov[8] Sigma;
vector[8] Mu;
}
transformed parameters{
// Recombine data
vector[N] mu;
matrix[N,9] Data;
Data[,2] = edum; Data[edum_index,2] = edum_imp;
Data[,3] = age;
Data[,4] = income; Data[income_index,4] = income_imp;
Data[,5] = sex;
Data[,6] = work2; Data[work2_index,6] = work2_imp;
Data[,7] = work3; Data[work3_index,7] = work3_imp;
Data[,8] = work4; Data[work4_index,8] = work4_imp;
Data[,9] = marry; Data[marry_index,9] = marry_imp;
Data[,1] = indi; Data[indi_index,1] = indi_imp;
for(i in 1:N) mu[i] = Data[i,2]*beta1 + Data[i,3]*beta2 + Data[i,4]*beta3 +Data[i,5]*beta4
+Data[i,6]*beta5+Data[i,7]*beta6 +Data[i,8]*beta7 + Data[i,9]*beta8 + u0j[year[i]]+v0k[cohort[i]] + alpha;
model {
// MVN imputation
for(n in 1:N){
Data[n,2:9] ~ multi_normal_cholesky(Mu,Sigma);
}
target += normal_lpdf(Data[,1] | mu,sigma);
beta1~ normal(0,2.5*sy/sx1);
beta2 ~ normal(0,2.5*sy/sx2);
beta3 ~ normal(0,2.5*sy/sx3);
beta4 ~ normal(0,2.5*sy/sx4);
beta5 ~ normal(0,2.5*sy/sx5);
beta6 ~ normal(0,2.5*sy/sx6);
beta7 ~ normal(0,2.5*sy/sx7);
beta8 ~ normal(0,2.5*sy/sx8);
target += normal_lpdf(u0j | 0, u0j_tau_u);
target += normal_lpdf(v0k | 0, v0k_tau_v);
alpha ~ normal(my, 2.5*sy);
sigma ~ exponential(1/sy);
target += cauchy_lpdf(u0j_tau_u | 0, 2.5)
- 1 * cauchy_lccdf(0 | 0, 2.5);
target += cauchy_lpdf(v0k_tau_v | 0, 2.5)
- 1 * cauchy_lccdf(0 | 0, 2.5);
}
I use default prior for each explanatory variable. This is the same one used in rstanarm.
The R code is as follows.
missings <- lapply(wavea,function(var){which(is.na(var))}) # For each variable in data, which rows are NA
nMissings <- lapply(missings,function(var){length(var)}) # Number of NAs
names(missings) <- paste0(names(missings),'_index') # Rename indices to variable_index
names(nMissings) <- paste0(names(nMissings),'_nMiss') # Rename n_miss to variable_nMiss
datas <- list(
indi = indi,
age = age,
marry = marry,
sex = sex,
income = income,
edum = edum,
work2 = work2,
work3=work3,
work4=work4,
N = N,
J = 6,
K = 21,
year=year2,
cohort = birth3 ,
sx1 = sx1,
sx2 = sx2,
sx3 = sx3,
sx4 = sx4,
sx5 = sx5,
sx6 = sx6,
sx7 = sx7,
sx8 = sx8,
my = my,
sy = sy,
indi_nMiss=nMissings[["indi_nMiss"]],
edum_nMiss=nMissings[["edum_nMiss"]],
marry_nMiss=nMissings[["marry_nMiss"]],
income_nMiss= nMissings[["income_nMiss"]],
work2_nMiss=nMissings[["work2_nMiss"]],
work3_nMiss=nMissings[["work3_nMiss"]],
work4_nMiss= nMissings[["work4_nMiss"]],
indi_index=missings[["indi_index"]],
edum_index=missings[["edum_index"]],
marry_index=missings[["marry_index"]],
income_index=missings[["income_index"]],
work2_index=missings[["work2_index"]],
work3_index=missings[["work3_index"]],
work4_index=missings[["work4_index"]]
)
fitl <- modapc$sample(
data = datas,
seed = 123,
iter_warmup = 1000,
iter_sampling = 3000,
chains = 4,
parallel_chains = 4,
adapt_delta=0.99
)
In the data used here, missing values are present in many of the explanatory and dependent variables. Therefore, we have also built a model in which the explanatory variables are assumed to be multivariate normal.
The model runs, but it takes about a day for the entire chain to finish sampling. Also, R crashes when displaying the summary.
Is there still something wrong with the notation of the model?
I would appreciate it if you could help me.