Predicting New Values in State Space Model Using Only Prior Data

Hi all,

I am attempting to make a model similar to what Milad Kharratzadeh did for the English Premier League. I can fit the model without issue but would like to simulate game outcomes using only the information available at the beginning of the season.

The code below simulates game outcomes based on team ability. x_prior provides some information about team ability going into the first game and ability is changing over time. I would like to return predictions for each game knowing only the value of x_prior for the two teams involved but haven’t been able to figure out how to do this in the generated quantities section.

library(purrr)
library(dplyr)
library(rstan)

n_teams <- 30
n_weeks <- 16
alpha <- 3
sigma <- 14
sigma_start = 3
sigma_state = 1
x_prior <- rnorm(n_teams, 0, 1)
x_beta <- .5
mu_start <- rnorm(n_teams, x_prior * x_beta, sigma_start)


gen_mu_ts <- function(mu_start, sigma_state) {
  mu_team <- 0
  for (i in 1:16){
    if (i == 1){
      mu_team[i] <- mu_start
    } else {
      mu_team[i] <- mu_team[i-1] + rnorm(1, 0, sigma_state)
    }
  }
  return(mu_team)
} 

create_matchups <- function(n_teams, n_weeks){
  weeklist <- list()
  for (i in 1:n_weeks){
    teams <- 1:n_teams
    matchlist <- list()
    for (k in 1:(n_teams/2)){
      match <- sample(teams, 2)
      teams <- teams[!teams %in% match]
      matchlist[[k]] <- match
    }
    df <- data.frame(team_1 = sapply(matchlist, '[', 1),
                     team_2 = sapply(matchlist, '[', 2),
                     week = i
    )
    weeklist[[i]] <- df 
  }
  return_df <- dplyr::bind_rows(weeklist)
  return(return_df)
}

games <- create_matchups(n_teams, n_weeks)

##Generate team abilities over time
team_abilities <- unlist(map(mu_start, gen_mu_ts, sigma_state))
team_id <- unlist(lapply(1:n_teams, rep, n_weeks))
week <- rep(1:n_weeks, 30)
abilities <- data.frame(ability = team_abilities,
                        team_id = team_id,
                        week = week)

##Join team abilities

games <- games %>%
  left_join(abilities, by = c('team_1' = 'team_id',
                              'week' = 'week')) %>%
  rename(team_1_mu = ability) %>%
  left_join(abilities, by = c('team_2' = 'team_id',
                              'week' = 'week')) %>%
  rename(team_2_mu = ability)

## Generate Score
games$score_diff <- rnorm(nrow(games), alpha + games$team_1_mu - games$team_2_mu, sigma)

standat <- list(T = n_weeks,
                N = nrow(games),
                K = n_teams,
                week = games$week,
                team_1 = games$team_1,
                team_2 = games$team_2,
                y = games$score_diff,
                x = mu_start)

Below is the stan code with simulated game outcomes generated using the updated information about team ability.

data {
  int<lower = 1> T;       // Number of time periods
  int<lower=0> N;         // number of observations
  int<lower=1> K;         //number of teams
  int<lower=1> week[N];
  int<lower=1> team_1[N];
  int<lower=1> team_2[N];
  row_vector[K] x;              // Independent variable for prior
  real y[N];              // final score difference
}
parameters {
  real alpha;                    // standard home field advantage
  real beta_x;                   // coefficient on prior
  real<lower = 0> sigma_state;   // week to week variation
  real<lower = 0> sigma_ability; // team ability variation
  real<lower = 0> sigma;         // residual standard error
  matrix[T,K] eta_mu; //Matrix of team abilities
}
transformed parameters {
  matrix[T,K] mu; // Team abilities
  mu[1] = x * beta_x + sigma_ability * eta_mu[1];
  for (t in 2:T){
    mu[t] = mu[t-1] + sigma_state * eta_mu[t];
  }
}

model {
  vector[N] mu_diff;

  alpha ~ normal(3,1);
  sigma_state ~ normal(1,1);
  sigma_ability ~ normal(3,1);
  sigma ~ normal(14,3);
  beta_x ~ normal(0,1);
  to_vector(eta_mu) ~ normal(0,1);
  for (n in 1:N){
    mu_diff[n] = mu[week[n],team_1[n]] - mu[week[n], team_2[n]];
  }  
  
  //likelihood  
  y ~ normal(alpha + mu_diff, sigma);
}
generated quantities{
  vector[N] yrep;
  for (n in 1:N){
    yrep[n] = normal_rng(alpha + mu[week[n],team_1[n]] - mu[week[n], team_2[n]], sigma);
  }
}
 
} 

Hi,
sorry for nto getting to you earlier.

I admit I don’t fully understand the problem: are you trying to replicate the R code you provided in your generated quantities block and have trouble doing that? Or are you trying to use the model “with no data”, just as a prior simulator? (The simplest way to do this is to introduce a binary data parameter to toggle using likelihood on/off, i.e.:

data {
  ...
  int<lower=0,1> prior_only;
}


model {
   ..
   if(!prior_only) {
     //likelihood  
     y ~ normal(alpha + mu_diff, sigma);
   }
}

Or is the problem with something else?