I have built a hierarchical negative binomial GLM to estimate the effect of a number of factors (e.g. age, met location etc.) on the immune cell count from a series of primary/metastatic tumours. By eye, the model seems to do a pretty good job of predicting the y_hat values (see posterior predictive plot below), but calculating the pareto \hat{k} values indicates that ~80% of the data points have a \hat{k}>0.7. Please could somebody help me understand why this model appears to be misspecified?
For context, the calculated p_loo=299.18
and there are 9 independent variables + 3 different met locations for fitting 150 data points coming from 16 different individuals. There is, however, a wide degree of heterogeneity between how many samples per individual are available, ranging from 1-33 samples per individual.
functions {
int neg_binomial_2_log_safe_rng(real eta, real phi) {
real gamma_rate = gamma_rng(phi, phi / exp(eta));
if (gamma_rate > exp(20.7)){
gamma_rate = exp(20.7); // i think this is the max value before overflow but haven't double checked
}
return poisson_rng(gamma_rate);
}
}
data {
int<lower=0> N; // Number of samples
int<lower=0> S; // Number of patients
int<lower=0> K; // Number of met groups
int<lower=0> B; // Number of control coeffs to control for
int<lower=1,upper=S> patient[N]; // Which patient a sample belongs to
int<lower=0,upper=K> group[N]; // Met location (0 colon)
matrix[B, N] control_factors;
int<lower=0> y[N];
int<lower=0> tumour[N];
}
transformed data{
real log_tumour[N];
for (i in 1:N){
log_tumour[i] = log(tumour[i]);
}
}
parameters {
real mean_a;
real<lower=0> sigma_a;
row_vector[B] control_coeffs;
vector[K] coeffs;
real<lower=0> sqrt_phi_inv;
vector[S] a_raw;
}
transformed parameters{
real<lower=0> phi;
vector[S] a;
vector[N] mu;
vector[N] mu_scaled;
phi = 1 / sqrt_phi_inv^2;
a = mean_a + sigma_a * a_raw;
for (i in 1:N){
if (group[i] == 0){
mu[i] = (a[patient[i]] +
control_coeffs * control_factors[:, i]);
}
else {
mu[i] = (a[patient[i]] +
control_coeffs * control_factors[:, i] +
coeffs[group[i]]);
}
mu_scaled[i] = mu[i] + log_tumour[i];
}
}
model {
sigma_a ~ normal(0, 0.1);
mean_a ~ normal(0, 1);
sqrt_phi_inv ~ normal(0, 1);
a_raw ~ normal(0, 1);
control_coeffs ~ normal(0, 1);
coeffs ~ normal(0, 1);
y ~ neg_binomial_2_log(mu_scaled, phi);
}
generated quantities {
vector[S] lp_a;
vector[N] lp;
vector[S] a_hat;
vector[N] mu_hat;
vector[N] y_hat;
vector[N] ratio_hat;
for (s in 1:S){
a_hat[s] = normal_rng(mean_a, sigma_a);
lp_a[s] = normal_lpdf(a[s] | mean_a, sigma_a);
}
for (i in 1:N){
if (group[i] == 0){
mu_hat[i] = (a_hat[patient[i]] +
control_coeffs * control_factors[:, i]);
}
else {
mu_hat[i] = (a_hat[patient[i]] +
control_coeffs * control_factors[:, i] +
coeffs[group[i]]);
}
y_hat[i] = neg_binomial_2_log_safe_rng(mu_hat[i] + log_tumour[i], phi);
ratio_hat[i] = y_hat[i] / tumour[i];
lp[i] = lp_a[patient[i]];
lp[i] += neg_binomial_2_log_lpmf(y[i] | mu_scaled[i], phi);
}
}