Hi all, I have a question about how to summarize the stacking predictions against the actual values using KL divergence. The code is below, and everything runs fine. As you can see, I'm looking at 5 reasonable models of reading performance. I obtain the stacking, pseudoBMA, and pseudoBMABB weights for each model. If we look at ypredStacking, for example, we have a matrix which is, say, d x n, where the rows are the posterior draws and the columns are the students, so for each student I have d draws from their posterior predictive distribution. To compare the predictive distribution of reading scores for the sample to the actual reading scores, I take the column means of ypredStacking, which gives me an n x 1 vector of predicted scores, and then run KLD on that vector and the n x 1 vector of actual values (a toy sketch of just this summarization step appears below, before the full script). As I mentioned, this all works, but I would appreciate a sanity check on my thinking.

Thanks in advance,

David
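
For concreteness, here is a minimal toy sketch of the summarization step described above. Everything in it is illustrative (the object names, dimensions, and simulated values are made up and are not the PISA data); it only mirrors the colMeans-then-KLD comparison I describe:

library(LaplacesDemon)

set.seed(123)

n_draws_toy <- 4000     # stand-in for the number of posterior draws (d)
n_students_toy <- 100   # stand-in for the number of students (n)

# Toy "posterior predictive" matrix: rows = draws, columns = students
ypred_toy <- matrix(rnorm(n_draws_toy * n_students_toy, mean = 500, sd = 90),
                    nrow = n_draws_toy, ncol = n_students_toy)

# Toy "actual" reading scores, one per student
y_actual_toy <- rnorm(n_students_toy, mean = 500, sd = 100)

# Column means: one predicted score per student (an n x 1 vector)
ypred_mean_toy <- colMeans(ypred_toy)

# Compare the two n-length vectors with KL divergence
KLD(ypred_mean_toy, y_actual_toy)

The full script follows.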

library(rstanarm)

library(loo)

library(LaplacesDemon)

PISAdata <- read.csv(file.choose(), header=T)

PISAdata <- PISAdata[1:4000,]

PISAdata$Female <- ifelse(PISAdata$Female == "Female", 1, 0)

# Here is where you would write out the separate models. There are 6 models in total (the intercept-only baseline plus the 5 listed below).

# Notice that the first model is just the intercept, and then each model updates the first.

# Model 1: FEMALE, ESCS, HOMEPOS, ICTRES

# Model 2: JOYREAD, PISADIFF, SCREADCOMP, SCREADDIF

# Model 3: METASUM, GFOFAIL, MASTGOAL, SWBP, WORKMAST, ADAPTIVITY, COMPETE

# Model 4: PERFEED, ADAPTIVE, TEACHINT, BELONG

# Model 5: Full model with all predictors


fits <- list()

fits[[1]] <- stan_glm(PV1READ ~ 1, data = PISAdata, seed = 832762) # This is the intercept-only baseline model

fits[[2]] <- update(fits[[1]], formula = PV1READ ~ Female + ESCS + HOMEPOS + ICTRES) # model 1 variables

fits[[3]] <- update(fits[[1]], formula = PV1READ ~ JOYREAD + PISADIFF + SCREADCOMP + SCREADDIF) # model 2 variables

fits[[4]] <- update(fits[[1]], formula = PV1READ ~ METASUM + GFOFAIL + MASTGOAL + SWBP + WORKMAST + ADAPTIVITY + COMPETE) # model 3 variables

fits[[5]] <- update(fits[[1]], formula = PV1READ ~ PERFEED + ADAPTIVE + TEACHINT + BELONG) # model 4 variables

fits[[6]] <- update(fits[[1]], formula = PV1READ ~ Female + ESCS + HOMEPOS + ICTRES + JOYREAD + PISADIFF + SCREADCOMP + SCREADDIF + METASUM + GFOFAIL + MASTGOAL + SWBP + WORKMAST + ADAPTIVITY + COMPETE + PERFEED + ADAPTIVE + TEACHINT + BELONG) # model 5: full model with all predictors

# Compute loo

loo_list <- lapply(fits, loo, cores = 4)

# Compute stacking weights

wtsStacking <- loo_model_weights(loo_list, method = "stacking")

print(wtsStacking)

wtsPseudoBMA <- loo_model_weights(loo_list, method = "pseudobma", BB = FALSE)

print(wtsPseudoBMA)

wtsPseudoBMABB <- loo_model_weights(loo_list, method = "pseudobma")

print(wtsPseudoBMABB)

# Average predictions (sample with probability equal to weights)

n_draws <- nrow(as.matrix(fits[[1]]))

ypredStacking <- matrix(NA, nrow = n_draws, ncol = nobs(fits[[1]]))

for (d in 1:n_draws) {

k <- sample(1:length(wtsStacking), size = 1, prob = wtsStacking)

ypredStacking[d, ] <- posterior_predict(fits[[k]], draws = 1)

}

n_draws <- nrow(as.matrix(fits[[1]]))

ypredPseudoBMA<- matrix(NA, nrow = n_draws, ncol = nobs(fits[[1]]))

for (d in 1:n_draws) {

k <- sample(1:length(wtsPseudoBMA), size = 1, prob = wtsPseudoBMA)

ypredPseudoBMA[d, ] <- posterior_predict(fits[[k]], draws = 1)

}

n_draws <- nrow(as.matrix(fits[[1]]))

ypredPseudoBMABB<- matrix(NA, nrow = n_draws, ncol = nobs(fits[[1]]))

for (d in 1:n_draws) {

k <- sample(1:length(wtsPseudoBMABB), size = 1, prob = wtsPseudoBMABB)

ypredPseudoBMABB[d, ] <- posterior_predict(fits[[k]], draws = 1)

}

# This summarizes the distribution of predicted values (column means give one predicted score

# per student) and compares it to the actual values using the KLD. Make a note of all of the values.

ypredStacking <- colMeans(ypredStacking)

KLD(ypredStacking, PISAdata$PV1READ)

ypredPseudoBMA <- colMeans(ypredPseudoBMA)

KLD(ypredPseudoBMA, PISAdata$PV1READ)

ypredPseudoBMABB <- colMeans(ypredPseudoBMABB)

KLD(ypredPseudoBMABB, PISAdata$PV1READ)