Thank you so much for the advice. First off, I removed the 0s that were not needed from the formula:
formula <- bf(Outcome ~ 0 + ELO + GK_TV + (0 + GK_TV | GK_AGE) + CB_TV + (0 + CB_TV | CB_AGE) + CM_TV + (0 + CM_TV | CM_AGE) + CF_TV + (0 + CF_TV | CF_AGE) + OUT_TV + (0 + OUT_TV | OUT_AGE))
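As a sanity check on what this formula implies, I can ask brms for the Stan code and data it would generate before hand-rolling my own Stan program below (a quick sketch, assuming the data frame is called soccer as in my script):

library(brms)
# Inspect the Stan program and data list brms would build for this formula,
# e.g. to confirm the random-slope structure of the (0 + TV | AGE) terms
make_stancode(formula, data = soccer, family = bernoulli())
make_standata(formula, data = soccer, family = bernoulli())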
I know a lot of successful soccer models have used a bivariate Poisson model to predict match outcomes, like Dixon and Coles.
However, I personally do not think the scoreline is the best target for predicting knock-out stage matches, and the information it provides may be less accurate than modeling the true outcome itself (see the sketch after this list):
- There are only 2 outcomes that ultimately determine success in knock-out matches: win or go home.
- Certain teams consistently win by small margins while other teams win less consistently but by large margins.
- Game scenarios can greatly affect score lines even when teams are closely matched.
- The GK position may be more important in knockout-stage matches, since games can go to penalty kicks.
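For reference, the core of the Dixon and Coles model, as I understand it, treats the two teams' goal counts as near-independent Poissons with a dependence correction for low scorelines:

$$P(X = x, Y = y) = \tau_{\lambda,\mu}(x, y)\,\frac{\lambda^{x} e^{-\lambda}}{x!}\,\frac{\mu^{y} e^{-\mu}}{y!}, \qquad \lambda = \alpha_i \beta_j \gamma, \quad \mu = \alpha_j \beta_i,$$

where $\alpha$ and $\beta$ are attack and defence strengths, $\gamma$ is a home-advantage factor, and $\tau$ adjusts only the 0-0, 1-0, 0-1, and 1-1 scorelines. Turning that into a win probability means summing over all scorelines with $x > y$ (plus handling draws that go to extra time), which is part of why I would rather model the binary outcome directly.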
These sources seemed very useful for what I am working on, but a lot of what they discuss is beyond my current level of expertise.
Predictive projection to reduce the number of features may not be as useful as regularization techniques that shrink the magnitude of those features, especially since each variable seems key to understanding the outcome in its own way. I could be wrong, though, and it may help me mitigate overfitting to the training data.
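If I do revisit predictive projection later, my understanding is that the projpred package implements it for brms fits; a rough sketch, assuming a fitted brmsfit called fit:

library(projpred)
# Cross-validated search over submodels of the reference model,
# ranking predictors by predictive performance rather than coefficient size
vs <- cv_varsel(fit)
summary(vs)
plot(vs)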
The paper on the regularized horseshoe prior seems very useful in my case.
I will follow your advice and use the R2D2 prior, though. It seems to fit best with the model I am trying to build, since:
- Use the R2D2 prior when you are interested in controlling the proportion of variance explained by the predictors and have a moderate number of predictors.
- Use the regularized horseshoe prior when you are dealing with high-dimensional data with many predictors, where you expect only a few of them to be truly relevant.
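For completeness, it looks like brms can attach either prior to the population-level coefficients directly, so I may not need hand-written Stan just for the prior itself. A minimal sketch, assuming the hyperparameters below are only starting points:

library(brms)
# R2D2 prior on the population-level ("b") coefficients;
# mean_R2 and prec_R2 parameterize the Beta prior on R^2
prior_r2d2 <- set_prior(R2D2(mean_R2 = 0.5, prec_R2 = 2), class = "b")
# Regularized horseshoe alternative, for comparison
prior_hs <- set_prior(horseshoe(df = 1, par_ratio = 0.1), class = "b")
fit <- brm(formula, data = soccer, family = bernoulli(), prior = prior_r2d2)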
Please let me know if this code is okay for computing LOOCV to assess the accuracy of this kind of model, since it is my first time using rstan and the code came from ChatGPT (I may end up taking ELO out and seeing how it performs without it). Also, would PCA be helpful at all for reducing the dimensionality? (I am guessing the shrinkage prior should be fine without it.)
Here is my code:
stan_model_code_r2d2 <- "
data {
  int<lower=0> n;                       // number of observations
  int<lower=0> d;                       // number of predictors
  int<lower=1> k;                       // number of age groups
  int<lower=0, upper=1> y[n];           // binary outcome
  matrix[n, d] x;                       // predictors: ELO, GK_TV, CB_TV, CM_TV, CF_TV, OUT_TV
  int<lower=1, upper=k> age_idx[n, 5];  // age-group index for each of the 5 position groups
  real<lower=0> scale_icept;            // prior sd for the intercept
}
parameters {
  real beta0;                           // intercept
  real<lower=0, upper=1> r2;            // R-squared parameter
  vector[d] beta_raw;                   // raw coefficients
  matrix[5, k] u;                       // random slopes: one row per position, one column per age group
}
transformed parameters {
  vector[d] beta;                       // regression coefficients
  vector[n] eta;                        // linear predictor
  // Global R2-based scale; note the full R2D2 prior would also spread the
  // variance across coefficients with a Dirichlet decomposition
  beta = beta_raw * sqrt(r2 / (1 - r2));
  // Fixed effects
  eta = beta0 + x * beta;
  // Age-group random slopes; x columns 2-6 hold GK_TV, CB_TV, CM_TV, CF_TV, OUT_TV
  for (i in 1:n) {
    for (p in 1:5) {
      eta[i] += u[p, age_idx[i, p]] * x[i, p + 1];
    }
  }
}
model {
  // Prior for R-squared
  r2 ~ beta(2, 2);
  // Prior for raw coefficients (scaled above)
  beta_raw ~ normal(0, 1);
  // Prior for random slopes
  to_vector(u) ~ normal(0, 1);
  // Prior for intercept
  beta0 ~ normal(0, scale_icept);
  // Logistic regression likelihood
  y ~ bernoulli_logit(eta);
}
"
library(rstan)
library(dplyr)
# Load the data
soccer <- read.table("/mnt/data/luluboyo.txt", header = TRUE, sep = "\t")

# Compile the Stan model once up front; calling stan() inside the loop
# would recompile the same model on every iteration
compiled_model <- stan_model(model_code = stan_model_code_r2d2)

# Map each position's age column to integer age-group indices (1..k),
# pooling the observed levels across the five positions
age_cols <- c("GK_AGE", "CB_AGE", "CM_AGE", "CF_AGE", "OUT_AGE")
age_levels <- sort(unique(unlist(soccer[, age_cols])))
k_groups <- length(age_levels)
age_idx_all <- apply(soccer[, age_cols], 2, function(a) match(a, age_levels))

# Define the number of observations
n_obs <- nrow(soccer)

# Initialize a vector to store the predicted probabilities
predicted_probs <- numeric(n_obs)

# Perform LOOCV
for (i in 1:n_obs) {
  # Prepare training data, leaving out the ith observation
  soccer_train <- soccer[-i, ]
  soccer_test <- soccer[i, ]
  # Extract predictors, age-group indices, and outcome for the training data
  x_train <- as.matrix(soccer_train[, c("ELO", "GK_TV", "CB_TV", "CM_TV", "CF_TV", "OUT_TV")])
  age_idx_train <- age_idx_all[-i, , drop = FALSE]
  y_train <- soccer_train$Outcome
  # Prepare data for Stan
  stan_data_train <- list(
    n = nrow(x_train),
    d = ncol(x_train),
    k = k_groups,
    y = as.integer(y_train),
    x = x_train,
    age_idx = age_idx_train,
    scale_icept = 10
  )
  # Fit the precompiled model to the training fold
  stan_fit <- sampling(compiled_model, data = stan_data_train,
                       iter = 2000, chains = 4, cores = parallel::detectCores(),
                       control = list(max_treedepth = 15, adapt_delta = 0.99))
  # Extract posterior samples
  posterior_samples <- rstan::extract(stan_fit)
  # Prepare the new data (left-out observation)
  new_x <- as.numeric(as.matrix(soccer_test[, c("ELO", "GK_TV", "CB_TV", "CM_TV", "CF_TV", "OUT_TV")]))
  new_age_idx <- age_idx_all[i, ]
  # Number of posterior samples
  n_samples <- length(posterior_samples$beta0)
  # Initialize a vector to store the predictions
  predictions <- numeric(n_samples)
  # Compute the linear predictor (eta) for each posterior sample
  for (j in 1:n_samples) {
    # Fixed effects
    eta <- posterior_samples$beta0[j] + sum(new_x * posterior_samples$beta[j, ])
    # Random slopes: posterior_samples$u has dimensions [sample, position, age group]
    for (p in 1:5) {
      eta <- eta + posterior_samples$u[j, p, new_age_idx[p]] * new_x[p + 1]
    }
    predictions[j] <- 1 / (1 + exp(-eta))  # inverse logit to get a probability
  }
  # Store the mean prediction for the ith observation
  predicted_probs[i] <- mean(predictions)
  cat("Finished LOOCV for observation", i, "with predicted probability:", predicted_probs[i], "\n")
}
# Print all predicted probabilities
predicted_probs
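To turn these LOOCV probabilities into an actual accuracy figure (plus a log loss, which also rewards calibrated probabilities), I am planning something like:

# Classification accuracy at a 0.5 threshold
loocv_accuracy <- mean((predicted_probs > 0.5) == soccer$Outcome)

# Mean log loss; clamp probabilities to guard against log(0)
eps <- 1e-12
p <- pmin(pmax(predicted_probs, eps), 1 - eps)
loocv_logloss <- -mean(soccer$Outcome * log(p) + (1 - soccer$Outcome) * log(1 - p))

loocv_accuracy
loocv_logloss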