I made a couple of small changes to the code. Mainly, I now use inv_logit() as @jonah suggested, and I also attempted to vectorize the calculations for a and b.
data {
  int<lower=0, upper=1> run_estimation;         // a switch to evaluate the likelihood
  int N;                                        // number of observations
  int I;                                        // number of students
  int<lower=1, upper=I> ii[N];                  // student id index
  vector<lower=650, upper=850>[N] raw_pre_test; // raw pre-test
  vector<lower=650, upper=850>[N] raw_score;    // raw score
  real nu_mean;                                 // prior mean for nu
  real<lower=0> nu_sigma;                       // prior sd for nu
  real mu_mean;                                 // prior mean for mu
  real<lower=0> mu_sigma;                       // prior sd for mu
  real<lower=0> sigma_student_sigma;            // prior scale for sigma_student
}
transformed data {
  vector<lower=0, upper=1>[N] pre_test_std = (raw_pre_test - 650) / 200; // pre-test rescaled to [0, 1]
  vector<lower=0, upper=1>[N] score_std = (raw_score - 650) / 200;       // score rescaled to [0, 1]
}
parameters {
  real<lower=0> nu;
  vector<lower=2>[N] kappa; // concentration; kappa > 2 keeps a, b > 1
  real mu;
  vector[I] student_std;
  real<lower=0> sigma_student;
}
transformed parameters {
  vector[I] student = mu + sigma_student * student_std; // non-centered ("Matt trick") parameterization
}
model {
  vector[N] omega;
  vector[N] a;
  vector[N] b;
  // priors
  nu ~ normal(nu_mean, nu_sigma);
  kappa ~ normal(10, 2);
  mu ~ normal(mu_mean, mu_sigma);
  student_std ~ normal(0, 1);
  sigma_student ~ normal(0, sigma_student_sigma);
  // transformed parameters
  omega = inv_logit(nu * pre_test_std + student[ii]);
  a = omega .* (kappa - 2) + 1;
  b = (1 - omega) .* (kappa - 2) + 1;
  // likelihood, which we only evaluate conditionally
  if (run_estimation == 1) {
    score_std ~ beta(a, b);
  }
}
generated quantities {
  vector[N] score_sim;
  vector[N] omega;
  vector[N] a;
  vector[N] b;
  omega = inv_logit(nu * pre_test_std + student[ii]);
  a = omega .* (kappa - 2) + 1;
  b = (1 - omega) .* (kappa - 2) + 1;
  for (n in 1:N) {
    // map simulated standardized scores back to the raw 650-850 scale
    score_sim[n] = beta_rng(a[n], b[n]) * 200 + 650;
  }
}
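As an aside, my understanding (could be wrong!) is that this is the mode/concentration parameterization of the beta: with kappa > 2,

$$
a = \omega(\kappa - 2) + 1, \qquad b = (1 - \omega)(\kappa - 2) + 1
\quad\Rightarrow\quad
\frac{a - 1}{a + b - 2} = \omega, \qquad a + b = \kappa,
$$

so omega is the mode of the score distribution and kappa controls how tightly the scores concentrate around it.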
Following @James_Savage's tutorial, I tried to see whether or not I can recover my parameters:
library(tidyverse)
library(rstan)
set.seed(9782)
I <- 50; N <- 100; omega <- 0.8; kappa <- 10
a <- omega * (kappa - 2) + 1
b <- (1 - omega) * (kappa - 2) + 1
raw_pre_test <- rbeta(N, a, b) * 200 + 650
raw_score <- rbeta(N, a, b) * 200 + 650
# Compile the stan code ---------------------------------------------------
compiled_model <- stan_model("beta.stan")
# First run ---------------------------------------------------------------
sim_out <- sampling(compiled_model, data = list(run_estimation = 0,
N = N,
I = I,
ii = rep(1:I,2),
raw_pre_test = raw_pre_test,
raw_score = raw_score,
nu_mean = 1,
nu_sigma = 0.5,
mu_mean = 0,
mu_sigma = 1,
sigma_student_sigma = 0.1),
seed = 42118)
fake_data_matrix <- sim_out %>%
as.data.frame %>%
select(contains("score_sim"))
# Can I recover the parameters? -------------------------------------------
draw <- 21
score_sim <- extract(sim_out, pars = "score_sim")[[1]][draw,]
true_nu <- extract(sim_out, pars = "nu")[[1]][draw]
true_kappa <- extract(sim_out, pars = "kappa")[[1]][draw,]
true_student <- extract(sim_out, pars = "student")[[1]][draw,]
# Now estimate the model with this "true" data, to see that we can estimate the known parameters
recapture_fit <- sampling(compiled_model, data = list(run_estimation = 1,
N = N,
I = I,
ii = rep(1:I,2),
raw_pre_test = raw_pre_test,
raw_score = score_sim,
nu_mean = 1,
nu_sigma = 0.5,
mu_mean = 0,
mu_sigma = 1,
sigma_student_sigma = 0.1),
seed = 42118)
## Nu?
nu <- as.data.frame(recapture_fit, pars = "nu")
ggplot(data = nu, aes(x = nu)) +
geom_density() + geom_vline(aes(xintercept = true_nu), color="red")
Based on this plot, should I conclude that the model succeeded at recovering nu? If not, does anyone have a recommendation for what to try next?
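In case it helps, here is one way I could quantify the recovery instead of eyeballing the density (just a sketch, reusing the nu data frame and true_nu from above):

# Posterior quantile of the "true" value; values near 0 or 1 would suggest
# the posterior is systematically missing it.
mean(nu$nu < true_nu)
# Does the central 50% interval cover true_nu?
quantile(nu$nu, probs = c(0.25, 0.75))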
I also tried to recover the students’ random effects:
students <- as.data.frame(recapture_fit, pars = "student")
colnames(students) <- c(1:I)
students <- students %>%
gather("student", "effect", factor_key = T) %>%
inner_join(data.frame(student=as.factor(1:I), true_student=true_student), by = "student")
ggplot(data = students, aes(x = effect)) +
geom_density() + geom_vline(aes(xintercept = true_student), color = "red") +
facet_wrap( ~ student)
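The same idea can be quantified across students (a sketch based on the students data frame above): how often does each student's 50% interval cover the "true" effect?

students %>%
  group_by(student) %>%
  summarise(lb = quantile(effect, probs = 0.25),
            ub = quantile(effect, probs = 0.75),
            true_student = first(true_student)) %>%
  summarise(coverage = mean(true_student >= lb & true_student <= ub))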
and the scores:
scores <- as.data.frame(recapture_fit, pars = "score_sim")
colnames(scores) <- c(1:N)
scores <- scores %>%
gather("N", "score", factor_key = T) %>%
inner_join(data.frame(N=as.factor(1:N), true_score=score_sim), by = "N")
ggplot(data = scores, aes(x = score)) +
geom_density() + geom_vline(aes(xintercept = true_score)) +
facet_wrap( ~ N)
Finally, I checked whether or not the 50% credible interval covers 50% of the true values:
credible_intervals <- scores %>% group_by(N) %>% summarise(lb = quantile(score, probs = c(0.25)),
ub = quantile(score, probs = c(0.75))) %>%
inner_join(data.frame(N=as.factor(1:N), true_score=score_sim), by = "N") %>%
mutate(inside = case_when(true_score >= lb & true_score<=ub ~ 1,
TRUE ~ 0))
mean(credible_intervals$inside)
[1] 0.54
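I suppose I could extend this to other interval widths; if the model is calibrated, an x% interval should cover roughly x% of the true values. A quick sketch reusing the scores data frame from above:

coverage <- function(width) {
  lo <- (1 - width) / 2
  scores %>%
    group_by(N) %>%
    summarise(lb = quantile(score, probs = lo),
              ub = quantile(score, probs = 1 - lo),
              true_score = first(true_score)) %>%
    summarise(coverage = mean(true_score >= lb & true_score <= ub)) %>%
    pull(coverage)
}
sapply(c(0.5, 0.8, 0.9), coverage)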
Thoughts? Should I check whether I can fit real data using this model, or should I stay away from real data while I'm building the model?