I am working up to a stratified sampling model for ordered categorical data. I expect some small sample sizes. When I run a simple model on one sample with n small, the mean probabilities are well off the sample probabilities. When I run a larger sample, the means are close. Should I be worried? The models are below.
Stan Model for Ordered Categorical - Small n
options(digits=3)
data
c1= rep(1,1)
c2 = rep(2,2)
c3 = rep(3,3)
c4 = rep(4,4)
scores = append(c1,c2)
scores = append(scores,c3)
scores = append(scores,c4)
N = length(scores)
library(rstan)
rs=list(N=N,scores=scores)
build stan model
stanstratrubric_ord = "
data {
int<lower=0> N; // number of observations
int<lower=1,upper=4> scores[N]; // list of scores
}
parameters {
ordered[3] cutpoints; // 3 cutpoints for 4 scores, keeps in order
}
model {
cutpoints ~ normal(0,10); // our prior for cutpoints (actually like -3->p=.05,-2,-1,0->p=.50 etc)
for (i in 1:N) {
scores[i] ~ ordered_logistic(0, cutpoints);
}
}
generated quantities {
vector[3] cumprob; // cumulative probabilities
vector[4] prob; // probabilities
cumprob = inv_logit(cutpoints);
prob[1] = cumprob[1];
prob[2] = cumprob[2] - cumprob[1];
prob[3] = cumprob[3] - cumprob[2];
prob[4] = 1 - cumprob[3];
}
"
fit <- stan(model_code = stanstratrubric_ord, data = rs, iter = 4000, warmup = 2000,chains = 4,control = list(adapt_delta = 0.99),refresh = 0)
get_posterior_mean(fit,par=c(“prob[1]”,“prob[2]”,“prob[3]”,“prob[4]”))
Stan Model for Ordered Categorical - Large n
options(digits=3)
data
c1= rep(1,10)
c2 = rep(2,20)
c3 = rep(3,30)
c4 = rep(4,40)
scores = append(c1,c2)
scores = append(scores,c3)
scores = append(scores,c4)
N = length(scores)
library(rstan)
rs=list(N=N,scores=scores)
build stan model
stanstratrubric_ord = "
data {
int<lower=0> N; // number of observations
int<lower=1,upper=4> scores[N]; // list of scores
}
parameters {
ordered[3] cutpoints; // 3 cutpoints for 4 scores, keeps in order
}
model {
cutpoints ~ normal(0,10); // our prior for cutpoints (actually like -3->p=.05,-2,-1,0->p=.50 etc)
for (i in 1:N) {
scores[i] ~ ordered_logistic(0, cutpoints);
}
}
generated quantities {
vector[3] cumprob; // cumulative probabilities
vector[4] prob; // probabilities
cumprob = inv_logit(cutpoints);
prob[1] = cumprob[1];
prob[2] = cumprob[2] - cumprob[1];
prob[3] = cumprob[3] - cumprob[2];
prob[4] = 1 - cumprob[3];
}
"
fit <- stan(model_code = stanstratrubric_ord, data = rs, iter = 4000, warmup = 2000,chains = 4,control = list(adapt_delta = 0.99),refresh = 0)
get_posterior_mean(fit,par=c(“prob[1]”,“prob[2]”,“prob[3]”,“prob[4]”))