I am reproducing the tutorial rstanarm multilevel example rewriting it in “pure” ®Stan with some twists. I wanted to produce the mean predictions per school, and after about fifteen trials and errors, I ended with the combination of transformed parameters and generated quantities as follows:
//
// This Stan program defines a multilevel model
// expressed in lmer as
//
// lmer(formula = course ~ 1 + female + (1 | school), data = GCSE)
//
// The input data
data {
// number of observations
int<lower=0> N;
// number of schools
int<lower=0> M;
// outcome variable: course
vector[N] y;
// predictor: female
vector[N] female;
// cluster variable
int school[N];
// school sizes, to be computed in R
int s[M];
}
parameters {
// intercept
real beta0;
// fixed effects slope
real beta_f;
// random effects
vector[M] u_j;
// residual error
real<lower=0> sigma;
// random effect variance
real<lower=0> sigma_u;
}
transformed parameters {
// level-1 unit specific means
vector[N] mu_i;
mu_i = beta0 + beta_f * female + u_j[school];
}
model {
y ~ normal(mu_i, sigma);
u_j ~ normal(0, sigma_u);
}
generated quantities {
// running counter
int pos;
// school empirical mean
vector[M] m_j;
pos = 1;
for (j in 1:M) {
m_j[j] = mean( segment(mu_i,pos,s[j]) );
pos = pos + s[j];
}
}
The R code that gets the data is
### pure Rstan multilevel example
library(tidyverse)
library(rstan)
library(here)
library(mlmRev)
data(Gcsemv, package = "mlmRev")
summary(Gcsemv)
# massage the data a bit
Gcsemv$female <- relevel(Gcsemv$gender, "M")
Gcsemv %>%
select(school, student, female, course) %>%
filter( !is.na(course) ) %>%
arrange(school, student) %>% # sort by school and student within school
group_by(school) %>%
mutate(s=n()) %>% ungroup() -> # compute school sizes
GCSE_nmy
# convert to lists for Stan
GCSE_stan_data <- list(
# number of observations
N = nrow(GCSE_nmy),
# number of schools
M = length(unique(GCSE_nmy$school)),
# outcome variable: course
y = GCSE_nmy$course,
# predictor: female
female = as.integer(GCSE_nmy$female)-1,
# cluster variable
school = as.integer(GCSE_nmy$school),
# school sizes for segmenting
s = (GCSE_nmy %>% group_by(school) %>% slice(1) %>% ungroup() %>% select(s) %>% unlist() )
)
So when I run
multilevel_stan_fit <- stan(
file = here('multilevel-stan.stan'), # program
data = GCSE_stan_data, # data
chains = 4, # number of Markov chains
warmup = 2000, # number of warmup iterations per chain
iter = 10000, # total number of iterations per chain
cores = 4, # number of cores to use
refresh = 200, # show progress every 'refresh' iterations
seed = 10203
)
I end up with
> print( object.size(multilevel_stan_fit), units="Mb")
576.1 Mb
which takes a fairly long while to create and access.
The blame, of course, is with mu_i
, an object of size 1725 x 4000, which I don’t really care that much about. Is there any way to omit this vector of transformed parameters from being written into R? (It has to be there in memory for the generated quantities to be created, but the write step of outputting it is a different one, as far as I understand from https://mc-stan.org/docs/2_21/reference-manual/output.html.)