Thanks @dafanast and @fweber144 for your responses.
Using projpred
v2.8.0.9000 from GitHub works out of the box with varsel()
, but I’m struggling to get cv_varsel()
to work. This must be because I’m missing something important out of the custom ref_predfun
function, so any advice would be welcome! Here’s a reprex below (admittedly not the tidiest or most efficient code). I’m assuming that by manually specifying search_terms
here, that the constant effect that y ~ .
will automatically add for the grouping variable name will be ignored by projpred
?
# Packages ----
library(brms)
library(dplyr)
library(dimreduce)
library(loo)
library(projpred)
library(purrr)
library(tidyr)
# Simulation parameters ----
set.seed(1234)
n <- 100 # number of observations
p <- 50 # total number of variables
k <- 5 # number of relevant variables
# Data simulation ----
vars <- matrix(rnorm(n * p), ncol = p)
colnames(vars) <- c(paste0("noise.", 1:(p - k)), paste0("pred.", 1:k))
betas <- c(rep(0, p - k), rnorm(k))
linpred <- plogis(0 + vars %*% betas)
df_sim <- data.frame(y = rbinom(n, 1, linpred), vars, subject = as.factor(1:n))
# Add noise to simulate duplicate observations for 20% of n ----
dup_idx <- rbinom(n, size = 1, prob = 0.2)
df_sim <- df_sim |>
mutate(dup = dup_idx) |>
group_by(subject) |>
pivot_longer(-c(y, subject, dup), names_to = "var") |>
mutate(value2 = if_else(dup == 1, true = value + rnorm(p, 0, sd = 0.5), false = NA)) |>
select(-dup) |>
pivot_longer(-c(y, subject, var)) |>
pivot_wider(id_cols = c(y, subject, name), names_from = var, values_from = value) |>
select(-name) |>
ungroup() |>
na.omit()
# Supervised PCA ----
nc <- 5
x <- select(df_sim, starts_with("noise"), starts_with("pred"))
dr <- spca(x = x, y = df_sim$y, nctot = nc, screenthresh = 0.6, window = p, sup.only = TRUE, verbose = FALSE)
z <- predict(dr, df_sim |> select(-y, -subject)) |>
as_tibble() |>
mutate(subject = df_sim$subject)
colnames(z)[1:nc] <- paste0("PC.", 1:nc)
# Fit the reference model in brms (as per model.stan in Pavone 2023) ----
smax <- max(dr$sdev)
custom_stanvars <- stanvar(smax, "smax")+
stanvar(nc, "nc")+
stanvar(scode = " real<lower=0> tau;", block = "parameters")
priors <- set_prior("normal(0, tau ^ 2)", class = "b")+
set_prior("normal(0, tau ^ 2)", class = "Intercept")+
set_prior("target += student_t_lpdf(tau | nc + 1, 0, smax ^ (-2))", check = FALSE)+
set_prior("student_t(3, 0, 1)", class = "sd")
form <- paste0("y ~ ", paste0(colnames(z)[1:nc], collapse = " + "), " + (1 | subject)")
fit_spca <- brm(
form
,data = data.frame(y = df_sim$y, z)
,family = bernoulli
,prior = priors
,stanvars = custom_stanvars
,iter = 3000
,chains = 4
,cores = 4
,seed = 1234
,control = list(adapt_delta = 0.95)
)
# Make refmodel object (traditional projection); varsel runs OK ----
refm <- init_refmodel(
fit_spca
,data = df_sim
,formula = y ~ . + (1 | subject)
,family = binomial()
)
custom_search_terms <- colnames(select(df_sim, starts_with("noise"), starts_with("pred")))
vsel <- varsel(
refm
,seed = 1234
,method = "forward"
,nterms = 10
,search_terms = colnames(select(df_sim, starts_with("noise"), starts_with("pred")))
)
# cv_varsel ----
folds <- kfold_split_grouped(K = 2, x = df_sim$subject)
custom_ref_predfun <- function(fit, newdata = NULL) {
set.seed(1234)
lprd_args <- nlist(object = fit, newdata, allow_new_levels = TRUE)
out <- do_call(posterior_linpred, lprd_args)
if (length(dim(out)) == 2) {
out <- t(out)
}
out
}
custom_cvrefbuilder <- function(cvfit) {
refm <- init_refmodel(
cvfit
,data = df_sim[-cvfit$omitted, , drop = FALSE]
,formula = y ~ . + (1 | subject)
,family = binomial()
,called_from_cvrefbuilder = TRUE
,ref_predfun = custom_ref_predfun
)
return(refm)
}
custom_cvfun <- function(folds, ...) {
cvfits <- lapply(
seq_along(1:max(folds))
,function(k) {
df <- df_sim[folds != k, ]
dr <- spca(
select(df, -y, -subject)
,y = df$y
,nctot = 5
,screenthresh = 0.6
,window = p
,sup.only = TRUE
,verbose = FALSE
)
z <- predict(dr, df |> select(-y, -subject)) |>
as_tibble() |>
mutate(subject = df$subject)
colnames(z)[1:5] <- paste0("PC.", 1:5)
smax <- max(dr$sdev)
custom_stanvars <- stanvar(smax, "smax")+
stanvar(nc, "nc")+
stanvar(scode = " real<lower=0> tau;", block = "parameters")
priors <- set_prior("normal(0, tau ^ 2)", class = "b")+
set_prior("normal(0, tau ^ 2)", class = "Intercept")+
set_prior("target += student_t_lpdf(tau | nc + 1, 0, smax ^ (-2))", check = FALSE)+
set_prior("student_t(3, 0, 1)", class = "sd")
form <- paste0("y ~ ", paste0(colnames(z[,1:nc]), collapse = "+"), "+ (1 | subject)")
model_k <- brm(
form
,data = data.frame(y = df$y, z)
,family = bernoulli
,prior = priors
,stanvars = custom_stanvars
,iter = 3000
,chains = 4
,cores = 4
,seed = 1234
,control = list(adapt_delta = 0.95)
)
return(model_k)
})
return(cvfits)
}
refm <- init_refmodel(
fit_spca
,data = df_sim
,formula = y ~ . + (1 | subject)
,family = binomial()
,ref_predfun = custom_ref_predfun
,cvfun = custom_cvfun
,cvrefbuilder = custom_cvrefbuilder
)
cv_fits <- run_cvfun(refm, folds = folds, seed = 1234)
cvsel <- cv_varsel(
refm
,seed = 1234
,cv_method = "kfold"
,method = "forward"
,nterms = 10
,validate_search = TRUE
,search_terms = custom_search_terms
,cvfits = cv_fits
)
This throws the below error (either using the default ref_predfun
without having a multilevel reference model or using the custom_ref_predfun
). I’ve included the full traceback at the bottom of this post.
Error: The following variables can neither be found in ‘data’ nor in ‘data2’:
‘PC.1’, ‘PC.2’, ‘PC.3’, ‘PC.4’, ‘PC.5’
Latent projection:
As a side note, trying to use the latent projection with the same model/data throws the error below:
# Make refmodel object (latent projection); errors ----
refm <- init_refmodel(
fit_spca
,data = df_sim
,formula = y ~ . + (1 | subject)
,family = binomial()
,latent = TRUE
)
Error in eval(formula[[2]], data, environment(formula)) :
object ‘.y’ not found
Full error traceback:
Error: The following variables can neither be found in ‘data’ nor in ‘data2’:
‘PC.1’, ‘PC.2’, ‘PC.3’, ‘PC.4’, ‘PC.5’
23: stop(…, call. = FALSE)
22: stop2("The following variables can neither be found in ", “‘data’ nor in ‘data2’:\n”, collapse_comma(missing_vars2))
21: validate_data(newdata, bterms = bterms, na_action = na.pass, drop_unused_levels = FALSE, attr_terms = attr_terms, data2 = current_data2(object, newdata2), knots = get_knots(object$data))
20: validate_newdata(newdata, object = object, …)
19: current_data(object, newdata, newdata2 = data2, re_formula = re_formula, …)
18: standata.brmsfit(x, newdata = newdata, re_formula = re_formula, newdata2 = newdata2, resp = resp, allow_new_levels = allow_new_levels, internal = TRUE, …)
17: standata(x, newdata = newdata, re_formula = re_formula, newdata2 = newdata2, resp = resp, allow_new_levels = allow_new_levels, internal = TRUE, …)
16: prepare_predictions.brmsfit(object, newdata = newdata, re_formula = re_formula, resp = resp, ndraws = ndraws, draw_ids = draw_ids, check_response = FALSE, …)
15: prepare_predictions(object, newdata = newdata, re_formula = re_formula, resp = resp, ndraws = ndraws, draw_ids = draw_ids, check_response = FALSE, …)
14: posterior_linpred.brmsfit(object = .x1, newdata = .x2, allow_new_levels = .x3)
13: .fun(object = .x1, newdata = .x2, allow_new_levels = .x3) at #1
12: eval(expr, envir, …)
11: eval(expr, envir, …)
10: eval2(call, envir = args, enclos = parent.frame())
9: do_call(posterior_linpred, lprd_args)
8: ref_predfun_usr(fit = fit, newdata = newdata)
7: fold$refmodel$ref_predfun(fold$refmodel$fit, newdata = refmodel$fetch_data(obs = fold$omitted), excl_offs = FALSE)
6: one_fold(fold = list_cv[[k]], rk = search_out_rks[[k]], …)
5: FUN(X[[i]], …)
4: lapply(seq_along(list_cv), function(k) { if (verbose) { on.exit(utils::setTxtProgressBar(pb, k)) } …
3: kfold_varsel(refmodel = refmodel, method = method, nterms_max = nterms_max, ndraws = ndraws, nclusters = nclusters, ndraws_pred = ndraws_pred, nclusters_pred = nclusters_pred, refit_prj = refit_prj, penalty = penalty, verbose = verbose, search_control = search_control, K = K, …
2: cv_varsel.refmodel(refm, seed = 1234, cv_method = “kfold”, method = “forward”, nterms = 10, validate_search = TRUE, search_terms = custom_search_terms, cvfits = cv_fits)
1: cv_varsel(refm, seed = 1234, cv_method = “kfold”, method = “forward”, nterms = 10, validate_search = TRUE, search_terms = custom_search_terms, cvfits = cv_fits)
Edit:
I can also confirm that using y ~ .
works with projpred
when the number of predictors is high enough to cause a stack overflow when trying to fit a non-dimension-reduced reference model.