Hi everyone. I’m trying to write a phylogenetic generalized linear mixed model for use with microbial strain data. I’m trying to use LOO to do a model comparison against a model with the phylogenetic component removed, but I’m running into some issues.
Using cmdstanr’s loo() method with log_lik[i] = normal_id_glm_lpdf(...)
was producing many bad Pareto k diagnostic values, particularly with some of my real-world datasets which are sometimes >50% “bad”. I saw this tweet and I’ve been trying the integration-based method but I have a couple questions and a problem:
- Question: Is it impactful to disregard the covariance of the elements of
phylo_effect
terms in the integrand function? It seems like it would be pretty tricky to pack the covariance matrix into thearray[] real x_r
argument. Or should I just pass in the appropriate element ofz_1
and usestd_normal_lpdf()
? - Problem: The code below has some sort of bug when calling
normal_id_glm_lpdf()
in the integrand that I can’t figure out. The error message shows up on every chain, saying it detects infinite values in the independent variable matrix but I never see any.
Chain 4 Exception: Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'C:/Users/aghazi/AppData/Local/Temp/RtmpIpsjz3/model-347421127513.stan', line 19, column 4 to line 23, column 53) (in 'C:/Users/aghazi/AppData/Local/Temp/RtmpIpsjz3/model-347421127513.stan', line 83, column 4 to line 92, column 41)
I started out with a model from following along with the brms PGLMM vignette but I’ve made some simplifications to make it more readable. The code blocks below give the model and some R code to generate synthetic data.
Thanks for any help on this!
The model:
functions {
real integrand(real phylo_effect, real notused,
array[] real theta,
array[] real yXi, array[] int Kc) {
real sigma_phylo = theta[1];
real sigma_resid = theta[2];
real Intercept = theta[3];
int ntheta = num_elements(theta);
vector[ntheta-3] b = to_vector(theta[4:ntheta]);
real yi = yXi[1];
row_vector[Kc[1]] Xci = to_row_vector(yXi[2:(Kc[1]+1)]);
// V These never show any infinite values
// print(theta);
// print(to_matrix(to_vector(yXi)));
return exp(normal_lpdf(phylo_effect | 0, sigma_phylo) + // is this right given the covariance of phylogenetic effects?
normal_id_glm_lpdf(yi | to_matrix(Xci), // there are supposedly inf values here
phylo_effect + Intercept,
b,
sigma_resid));
}
}
data {
int<lower=1> N; // total number of observations
vector[N] Y; // response variable
int<lower=1> K; // number of population-level effects
matrix[N, K] X; // population-level design matrix
matrix[N, N] Lcov; // cholesky factor of known correlation matrix
real<lower=0> resid_scale;
}
transformed data {
int Kc = K - 1;
matrix[N, Kc] Xc; // centered version of X without an intercept
vector[Kc] means_X; // column means of X before centering
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}
parameters {
vector[Kc] b; // population-level effects
real Intercept; // temporary intercept for centered predictors
real<lower=0> sigma_resid; // residual noise
real<lower=0> sigma_phylo; // phylogenetic noise
vector[N] z_1;
}
transformed parameters {
vector[N] phylo_effect;
phylo_effect = (sigma_phylo * (Lcov * z_1));
}
model {
// likelihood including constants
vector[N] mu = Intercept + phylo_effect;
target += normal_id_glm_lpdf(Y | Xc, mu, b, sigma_resid);
// priors including constants
// target += gamma_lpdf(sigma_phylo / sigma_resid | 1, 2);
// ^ this can help avoid divergences
target += student_t_lpdf(sigma_resid | 3, 0, resid_scale)
- 1 * student_t_lccdf(0 | 3, 0, resid_scale);
target += student_t_lpdf(sigma_phylo | 3, 0, resid_scale)
- 1 * student_t_lccdf(0 | 3, 0, resid_scale);
target += std_normal_lpdf(z_1);
}
generated quantities {
// actual population-level intercept
real b_Intercept = Intercept - dot_product(means_X, b);
array[N] real yrep;
vector[N] log_lik;
for (i in 1:N){
// V This log_lik value gives bad pareto k diagnostics
// log_lik[i] = normal_id_glm_lpdf(Y[i] | to_matrix(Xc[i]), Intercept + phylo_effect[i], b, sigma_resid);
yrep[i] = normal_rng(Intercept + phylo_effect[i] + Xc[i]*b, sigma_resid);
log_lik[i] = log(integrate_1d(integrand,
negative_infinity(),
positive_infinity(),
append_array({sigma_phylo},
append_array({sigma_resid},
append_array({Intercept},
to_array_1d(b)))),
append_array({Y[i]},
to_array_1d(Xc[i])),
{Kc}));
}
}
And the R script:
library(ape)
library(cmdstanr)
n = 100
sigma_phylo = 1
sigma_resid = 1
tr = rtree(n)
vcv = vcv.phylo(tr)
s = sqrt(diag(vcv))
cor_mat = diag(1/s) %*% vcv %*% diag(1/s)
rownames(cor_mat) = rownames(vcv)
Lcov = chol(cor_mat)
dat = data.frame(sample_id = colnames(vcv),
x1 = rnorm(n),
x2 = rnorm(n))
# intercept = -2, beta[1] = 1, beta[2] = -.5
dat$outcome = sigma_phylo * MASS::mvrnorm(1, mu = rep(0, n), Sigma = cor_mat) +
sigma_resid * rnorm(n, mean = dat$x1 - .5*dat$x2 - 2, sd = 1)
pglmm_model = cmdstanr::cmdstan_model(stan_file = "C:/Users/aghazi/strain_stats/src/stan_files/cont_pglmm_int_loo.stan",
quiet = TRUE)
data_list = list(N = n,
Y = dat$outcome,
K = 3,
X = model.matrix(outcome ~ x1 + x2, data = dat),
Lcov = Lcov,
resid_scale = sd(dat$outcome))
pglmm_fit = pglmm_model$sample(data = data_list,
chains = 4,
parallel_chains = 4,
iter_warmup = 500,
iter_sampling = 500,
refresh = 50)