Coming back to this, I saw another post in the forum, Latent class previous post, where they adjust the DINA model code. After testing, I found that this method for estimating latent classes from binary data works well for increasing numbers of classes, with a small change in the constraints between C = 2 and C > 2.
I am not clear why this approach works but using the bernoulli_lpmf distribution doesn't. I wanted to use the Stan distributions because that makes it easier to combine different types of items; for example, this code only works for binary items.
Here is the code for C = 2:
data {
  int<lower=1> I;               // number of items
  int<lower=1> J;               // number of respondents
  matrix[J, I] y;               // binary responses (0/1), one row per respondent
  int<lower=1> C;               // number of attribute profiles (latent classes)
}
parameters {
  simplex[C] nu;                // class proportions
  vector<lower=0>[C] alpha_dir; // Dirichlet concentration for nu
  // average intercept and main effect
  real mean_intercept;
  real<lower=0> mean_maineffect;
  // item-level deviations from the average effects
  real dev_intercept[I];
  real<lower=-mean_maineffect> dev_maineffect[I];
}
transformed parameters {
  matrix[I, C] pi;     // probability of a correct response for each class on each item
  vector[C] ps[J];     // per-respondent log-likelihood under each class
  real log_items[I];
  real intercept[I];
  real maineffect[I];
  vector[I] master_pi;
  vector[I] nonmaster_pi;
  for (i in 1:I) {
    intercept[i] = mean_intercept + dev_intercept[i];
    maineffect[i] = mean_maineffect + dev_maineffect[i];
    nonmaster_pi[i] = inv_logit(intercept[i]);
    master_pi[i] = inv_logit(intercept[i] + maineffect[i]);
  }
  // probability of a correct response for each class on each item:
  // c = 1 gives nonmaster_pi, c = 2 gives master_pi
  for (c in 1:C) {
    for (i in 1:I) {
      pi[i, c] = master_pi[i]^(c - 1) * nonmaster_pi[i]^(1 - (c - 1));
    }
  }
  // per-class log-likelihood contributions
  for (j in 1:J) {
    for (c in 1:C) {
      for (i in 1:I) {
        log_items[i] = y[j, i] * log(pi[i, c]) + (1 - y[j, i]) * log(1 - pi[i, c]);
      }
      ps[j][c] = sum(log_items);
    }
  }
}
model {
  // priors
  nu ~ dirichlet(alpha_dir);
  alpha_dir ~ gamma(1, 0.5);
  mean_intercept ~ normal(0, 5);
  mean_maineffect ~ normal(0, 5);
  dev_intercept ~ normal(0, 3);
  dev_maineffect ~ normal(0, 3);
  // marginalize over the latent classes
  target += log_mix(nu, ps);
}
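The manual likelihood line above is just the Bernoulli log-pmf written out by hand, so in principle the inner loop could call bernoulli_lpmf instead. A minimal sketch of that variant, assuming y were re-declared in the data block as an integer array (bernoulli_lpmf requires integer outcomes, so it will not compile against elements of a real-valued matrix, which may be where the attempt goes wrong):
// sketch: same per-class log-likelihood via bernoulli_lpmf,
// assuming the data block declares int<lower=0, upper=1> y[J, I];
for (j in 1:J) {
  for (c in 1:C) {
    for (i in 1:I) {
      log_items[i] = bernoulli_lpmf(y[j, i] | pi[i, c]);
    }
    ps[j][c] = sum(log_items);
  }
}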
And when C > 2, the parameters block changes to the following, dropping the lower-bound constraints on mean_maineffect and dev_maineffect:
parameters {
  simplex[C] nu;                // class proportions
  vector<lower=0>[C] alpha_dir; // Dirichlet concentration for nu
  // average intercept and main effect (no positivity constraint here)
  real mean_intercept;
  real mean_maineffect;
  // item-level deviations from the average effects (no lower bound here)
  real dev_intercept[I];
  real dev_maineffect[I];
}
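Since the R script below loads loo, one optional addition to either version is a generated quantities block that stores the pointwise log-likelihood. This is a sketch; "log_lik" would also need to be added to pars, and the draws could then go through loo::extract_log_lik() and loo::loo() in R:
generated quantities {
  vector[J] log_lik;    // pointwise log-likelihood, one entry per respondent
  for (j in 1:J)
    log_lik[j] = log_mix(nu, ps[j]);  // marginal over the C classes for respondent j
}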
And here is the R code for running it:
library(poLCA)
library(loo)
library(rstan)
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)

# example data: 9 binary items from the UCLA IDRE LCA tutorial
dat <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/lca1.dat",
                header = FALSE)
colnames(dat) <- c("id", paste0("item", 1:9))
head(dat)
apply(dat[, 2:10], 2, table)  # check response frequencies per item
dat2 <- dat[, 2:10]           # drop the id column
head(dat2)

wide_data <- list(
  I = ncol(dat2),
  J = nrow(dat2),
  y = dat2,
  C = 3
)

pars <- c("nu", "pi")
iter <- 1000
warmup <- iter - 500

# lca_wide holds the Stan program above as a string
wide_model <- stan(model_code = lca_wide,
                   data = wide_data,
                   pars = pars,
                   chains = 3,
                   iter = iter,
                   warmup = warmup)
wide_model
## results with poLCA (responses shifted to 1/2; same number of classes as the Stan fit)
ch2 <- poLCA(cbind(item1, item2, item3, item4, item5, item6, item7, item8, item9) ~ 1,
             dat2 + 1, nclass = 3, maxiter = 20000)
ch2
Now I am trying to adjust this approach to include categorical indicators with more than two response options, as well as continuous items.
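A minimal sketch of what that extension could look like: keep one accumulator per respondent and class, and add the matching _lpmf/_lpdf term for each item type. All the new names here (y_bin, y_cat, y_cont, I_bin, I_cat, I_cont, K, theta, mu, sigma) are hypothetical placeholders, not parts of the model above:
// sketch only: per-class log score combining binary, categorical, and continuous items
// hypothetical declarations: int y_bin[J, I_bin]; int y_cat[J, I_cat]; matrix[J, I_cont] y_cont;
// simplex[K] theta[I_cat, C]; matrix[I_cont, C] mu; vector<lower=0>[I_cont] sigma;
for (j in 1:J) {
  for (c in 1:C) {
    real lp = 0;
    for (i in 1:I_bin)
      lp += bernoulli_lpmf(y_bin[j, i] | pi[i, c]);         // binary items as before
    for (i in 1:I_cat)
      lp += categorical_lpmf(y_cat[j, i] | theta[i, c]);    // K response options per item
    for (i in 1:I_cont)
      lp += normal_lpdf(y_cont[j, i] | mu[i, c], sigma[i]); // class-specific mean, shared scale
    ps[j][c] = lp;
  }
}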