Hello MC Stanites,
I’ve been banging my head on this for a while. Any tips or guidance on specification would be very much appreciated.
I’m fitting an MMM model based on the google white paper. google white paper
The original model specification is on the bottom of that paper. I’m attempting to introduce a year grouping to get a yearly read on the parameters as well as the intercept tau (varying intercept and slopes, if i’m not mistaken on terminology).
functions {
real Hill(real t, real ec, real slope) {
return 1 / (1 + (t / ec)^(-slope));
}
real Adstock(row_vector t, row_vector weights) {
return dot_product(t, weights) / sum(weights);
}
}
data {
int<lower=1> N; // the total number of observations -- weekly
real<lower=0> Y[N]; // the vector of sales
int<lower=1> max_lag; // the maximum duration of lag effect, in weeks
int<lower=1> num_media; // the number of media channels
row_vector[max_lag] lag_vec; // a vector of 0 to max_lag - 1
row_vector[max_lag] X_media[N, num_media]; //media variables
int<lower=1> num_ctrl; // the number of control variables
row_vector[num_ctrl] X_ctrl[N]; // control variables
int<lower=1> year_index[N]; // vector of year index -- 1 : 2019; 2 : 2020; etc
int<lower=1> num_years; // number of years
}
parameters {
real<lower=0> noise_var; // residual variance
vector<lower=0>[num_years] tau; // the intercept by year
vector[num_ctrl] gamma_ctrl; // coefficients for control variables
vector<lower=0>[num_media] mu_beta_medias_year; // year-group vector for media vars
vector<lower=0>[num_media] sigma_beta_medias_year; // year-group vector for media var sd
vector<lower=0>[num_media] beta_medias_year[num_years];// coeff for media vars varying slopes
real mu_alpha_year;
real<lower=0> sigma_alpha_year;
vector<lower=0,upper=1>[num_media] retain_rate;
vector<lower=0,upper=max_lag-1>[num_media] delay;
vector<lower=0,upper=1>[num_media] ec;
vector<lower=0>[num_media] slope;
}
transformed parameters {
real mu[N]; // a vector of the mean response
real baseline[N]; // the cumulative baseline effect
real cum_effect; // the cumulative media effect after adstock
row_vector[num_media] cum_effects_hill[N]; // the cumulative media effect after adstock, and then Hill transformation
row_vector[max_lag] lag_weights;
for (nn in 1:N) {
for (media in 1 : num_media) {
for (lag in 1 : max_lag) {
lag_weights[lag] = pow(retain_rate[media], (lag - 1 - delay[media]) ^ 2);
}
cum_effect = Adstock(X_media[nn, media], lag_weights);
cum_effects_hill[nn, media] = Hill(cum_effect, ec[media], slope[media]);
}
mu[nn] = tau[year_index[nn]] +
dot_product(cum_effects_hill[nn], beta_medias_year[year_index[nn],]) +
dot_product(X_ctrl[nn], gamma_ctrl);
baseline[nn] = tau[year_index[nn]] + dot_product(X_ctrl[nn], gamma_ctrl);
}
}
model {
retain_rate ~ beta(3, 3);
delay ~ uniform(0, max_lag - 1);
slope ~ gamma(3, 1);
ec ~ beta(2,2);
mu_alpha_year ~ normal(0, 1);
sigma_alpha_year ~ cauchy(0, 1);
for (year in 1 : num_years) {
tau[year] ~ normal(mu_alpha_year, sigma_alpha_year);
}
for (media_index in 1 : num_media) {
mu_beta_medias_year[media_index] ~ normal(0, 0.5);
sigma_beta_medias_year[media_index] ~ cauchy(0, 0.1);
for (year in 1 : num_years) {
beta_medias_year[year, media_index] ~ normal(mu_beta_medias_year[media_index], sigma_beta_medias_year[media_index]);
}
}
for (ctrl_index in 1 : num_ctrl) {
gamma_ctrl[ctrl_index] ~ normal(0,1);
}
noise_var ~ inv_gamma(0.05, 0.05 * 0.01);
Y ~ normal(mu, sqrt(noise_var));
}
generated quantities {
real y_rep[N];
y_rep = normal_rng(mu, sqrt(noise_var));
}
However I’m ending up with a significant amount of divergences (between 3% and up to 13% depending on prior tweaks, adapt_delta tweaks, n_iters, etc). Given I’m working with 3 years worth of weekly data, 30+ parameters I am dealing with low N and several thousand parameters.
I’ve reviewed a few sources online, including:
https://dev.to/martinmodrak/taming-divergences-in-stan-models-5762
https://mc-stan.org/docs/2_27/stan-users-guide/reparameterization-section.html
(one other solid article on a “funnel of hell” that I can’t seem to find again)
The model specification seems ok, posterior predictive check also looks solid. I’ve tried running with adapt_delta 0.8 to 0.99). Given the low N it seems like non-centered parameterization would be the best possible fix. I attempted to make these updates to the model based on the mc-stan doc section on reparameterization linked above and this is where I netted out, but I’m still seeing a decent amount of divergence.
Updated model specification:
functions {
real Hill(real t, real ec, real slope) {
return 1 / (1 + (t / ec)^(-slope));
}
real Adstock(row_vector t, row_vector weights) {
return dot_product(t, weights) / sum(weights);
}
}
data {
int<lower=1> N; // the total number of observations -- weekly
real<lower=0> Y[N]; // the vector of sales
int<lower=1> max_lag; // the maximum duration of lag effect, in weeks
int<lower=1> num_media; // the number of media channels
row_vector[max_lag] lag_vec; // a vector of 0 to max_lag - 1
row_vector[max_lag] X_media[N, num_media]; // media variables
int<lower=1> num_ctrl; // the number of other control variables
row_vector[num_ctrl] X_ctrl[N]; // control variables
int<lower=1> year_index[N]; // vector of year index -- 1 : 2019; 2 : 2020; etc
int<lower=1> num_years; // number of years
}
parameters {
real<lower=0> noise_var; // residual variance
vector<lower=0>[num_years] tau; // the intercept by year
vector[num_ctrl] gamma_ctrl; // coefficients for other control variables
vector<lower=0, upper=0.5>[num_media] mu_beta_medias_year; // year-group vector for media vars
vector<lower=0, upper=0.2>[num_media] sigma_beta_medias_year; // year-group vector for media var sd
vector<lower=0, upper=0.3>[num_media] beta_medias_year_raw[num_years];// media vars varying slopes
real mu_alpha_year;
real<lower=0> sigma_alpha_year;
vector<lower=0,upper=1>[num_media] retain_rate;
vector<lower=0,upper=max_lag-1>[num_media] delay;
vector<lower=0,upper=1>[num_media] ec;
vector<lower=0>[num_media] slope;
}
transformed parameters {
real mu[N]; // a vector of the mean response
real baseline[N]; // the cumulative baseline effect
real cum_effect; // the cumulative media effect after adstock
row_vector[num_media] cum_effects_hill[N]; // the cumulative media effect after adstock, and then Hill transformation
row_vector[max_lag] lag_weights;
vector<lower=0>[num_media] beta_medias_year[num_years];
// implies: beta_medias_year ~ normal(mu_beta_medias_year, sigma_beta_medias_year)
for (media_index in 1 : num_media) {
for (year in 1 : num_years) {
beta_medias_year[year, media_index] = mu_beta_medias_year[media_index] + sigma_beta_medias_year[media_index] * beta_medias_year_raw[year, media_index];
}
}
for (nn in 1:N) {
for (media in 1 : num_media) {
for (lag in 1 : max_lag) {
lag_weights[lag] = pow(retain_rate[media], (lag - 1 - delay[media]) ^ 2);
}
cum_effect = Adstock(X_media[nn, media], lag_weights);
cum_effects_hill[nn, media] = Hill(cum_effect, ec[media], slope[media]);
}
mu[nn] = tau[year_index[nn]] +
dot_product(cum_effects_hill[nn], beta_medias_year[year_index[nn],]) +
dot_product(X_ctrl[nn], gamma_ctrl);
baseline[nn] = tau[year_index[nn]] + dot_product(X_ctrl[nn], gamma_ctrl);
}
}
model {
retain_rate ~ beta(1, 3); // used to be 3, 3
delay ~ normal(1, 1); // was delay ~ uniform(0, max_lag - 1);
slope ~ gamma(3, 1);
ec ~ beta(2,2);
mu_alpha_year ~ normal(0, 1);
sigma_alpha_year ~ cauchy(0, 1);
for (year in 1 : num_years) {
tau[year] ~ normal(mu_alpha_year, sigma_alpha_year);
beta_medias_year_raw[,num_years] ~ normal(0, 0.2); //normal(mu_beta_medias_year[media_index], sigma_beta_medias_year[media_index]);
}
for (media_index in 1 : num_media) {
mu_beta_medias_year[media_index] ~ normal(0, 0.2);
sigma_beta_medias_year[media_index] ~ cauchy(0, 0.1);
}
for (ctrl_index in 1 : num_ctrl) {
gamma_ctrl[ctrl_index] ~ normal(0,1);
}
noise_var ~ inv_gamma(0.05, 0.05 * 0.01);
Y ~ normal(mu, sqrt(noise_var));
}
generated quantities {
real y_rep[N];
y_rep = normal_rng(mu, sqrt(noise_var));
}
I read somewhere you shouldn’t introduce soft prior knowledge as a hard constraint for:
vector<lower=0, upper=0.5>[num_media] mu_beta_medias_year; // year-group vector for media vars
vector<lower=0, upper=0.2>[num_media] sigma_beta_medias_year; // year-group vector for media var sd
vector<lower=0, upper=0.3>[num_media] beta_medias_year_raw[num_years];// media vars varying slopes
Not sure if that’s also problematic. Thank you for your time!!
Cheers,
Simon