I’ve recently gone back to working model for cophylogenetic regression marginalising out the uncertainty in both phylogenies, which I originally posted about on here a while ago. It seems to return simulated values, but approximately 5% of the post warmup draws show divergences (in the lower section of the pairs plot, so I can’t just adapt-delta them away) *, and chains occasionally give low E-BFMIs. Obviously, it’s not currently useable. I have another version of this model that doesn’t use the simplex parameterisation, so priors are placed over each individual variance component rather than the total variance, and while I’m still continuing tests with that version it doesn’t seem to show the same issues. That would be fine, but for reasons I can’t quite grok, it runs about 10x slower than this implementation, which makes it impractically slow to actually use on real data.
*the manual says that simplexes are particularly nasty to sample from, so I suspect that’s the cause?
The simplex implementation (the one I’ve done considerably more testing on) is shown below. Basically, I was wondering if anyone could see any simple reparameterisations of this model that might help with the divergences? If not, I’ll put up the other version of the model after I’m 100% convinced it really is passing all the tests, and see if anyone can see any obvious speed ups I could use there.
Thanks for all the help I’ve had with this model so far.
//a comparative model looking at the associations between hosts and viruses
//per Hadfield et al - A Tale of Two Phylogenies marginalising
//phylogenetic uncertainty -- see that paper for descriptions of all terms
//and formulas for calculation of covariance matrices
//code based on and inspired by that of Diogo Melo - originally licensed
//under the MIT Licence
//github.com/diogro/stanAnimal
//marginalisation of phylogenetic uncertainty implemented per solution
//on Stan forums and latent discrete parameters section of the manual
//discourse.mc-stan.org/t/sampling-over-an-uncertain-variance-covariance-matrix
//pooled binomial likelihood implemented per Thompson 1971
data {
int<lower=1> datapoints;
int<lower=1> virusspecies;
int<lower=1> hostspecies;
int<lower=1> virushostcombinations;
int<lower=1> pools;
int<lower=1> spatialcompositions;
int<lower=1> matrices;
int<lower=0, upper=1> run_estimation;
int<lower=0, upper=1> y[datapoints];
int<lower=1, upper=hostspecies> host[datapoints];
int<lower=1, upper=virusspecies> virus[datapoints];
int<lower=1, upper=virushostcombinations> combination[datapoints];
int<lower=1, upper=pools> pool[datapoints];
int<lower=1, upper=spatialcompositions> spatialcomposition[datapoints];
//for OLRE - additive overdispersion
int<lower=1, upper=datapoints> ID[datapoints];
int<lower=1> samples[datapoints];
matrix[hostspecies, hostspecies] HostPhy[matrices];
matrix[virusspecies, virusspecies] VirusPhy[matrices];
matrix[virushostcombinations, virushostcombinations] HostInter[matrices];
matrix[virushostcombinations, virushostcombinations] VirusInter[matrices];
matrix[virushostcombinations, virushostcombinations] CoevoInter[matrices];
}
transformed data{
real log_unif;
matrix[hostspecies, hostspecies] CD_HostPhy[matrices];
matrix[virusspecies, virusspecies] CD_VirusPhy[matrices];
matrix[virushostcombinations, virushostcombinations] CD_HostInter[matrices];
matrix[virushostcombinations, virushostcombinations] CD_VirusInter[matrices];
matrix[virushostcombinations, virushostcombinations] CD_CoevoInter[matrices];
//for constant correction to posterior density
log_unif = -log(matrices);
//for speed cholesky decompose all covariance matrices
for (i in 1:matrices) {
CD_HostPhy[i] = cholesky_decompose(HostPhy[i]);
CD_VirusPhy[i] = cholesky_decompose(VirusPhy[i]);
CD_HostInter[i] = cholesky_decompose(HostInter[i]);
CD_VirusInter[i] = cholesky_decompose(VirusInter[i]);
CD_CoevoInter[i] = cholesky_decompose(CoevoInter[i]);
}
}
parameters{
//total variance
real<lower=0> sigma;
//for partitioning of total variance
simplex[11] sim;
//non-centering
vector[virusspecies] virus_tilde;
vector[hostspecies] host_tilde;
vector[virushostcombinations] inter_tilde;
vector[pools] pool_tilde;
vector[spatialcompositions] spatial_tilde;
vector[datapoints] residual_tilde;
vector[virusspecies] vphy_tilde;
vector[hostspecies] hphy_tilde;
vector[virushostcombinations] vinter_tilde;
vector[virushostcombinations] hinter_tilde;
vector[virushostcombinations] coevointer_tilde;
//posterior latent mean
real alpha;
}
transformed parameters{
//predicted probability for each datapoint for each matrix
vector[datapoints] predprob[matrices];
vector[datapoints] theta[matrices];
//predicted means for each partially pooled effect
vector[virusspecies] mean_virus;
vector[hostspecies] mean_host;
vector[virushostcombinations] mean_inter;
vector[pools] mean_pool;
vector[spatialcompositions] mean_spatial;
vector[datapoints] mean_residual;
vector[virusspecies] mean_virusphy[matrices];
vector[hostspecies] mean_hostphy[matrices];
vector[virushostcombinations] mean_virusinter[matrices];
vector[virushostcombinations] mean_hostinter[matrices];
vector[virushostcombinations] mean_coevointer[matrices];
//vector for calculating marginalised log likelihoods
vector[matrices] lp;
//add exact correction for marginalisation
lp = rep_vector(log_unif, matrices);
//non-centred calculation of means for non-phylogenetically associated effects
mean_virus = sqrt(sigma * sim[1]) * virus_tilde;
mean_host = sqrt(sigma * sim[2]) * host_tilde;
mean_inter = sqrt(sigma * sim[3]) * inter_tilde;
mean_pool = sqrt(sigma * sim[4]) * pool_tilde;
mean_spatial = sqrt(sigma * sim[5]) * spatial_tilde;
mean_residual = sqrt(sigma * sim[6]) * residual_tilde;
//non-centered calculation of means for phylogenetically associated effects
for (i in 1:matrices) {
mean_virusphy[i] = sqrt(sigma * sim[7]) * (CD_VirusPhy[i] * vphy_tilde);
mean_hostphy[i] = sqrt(sigma * sim[8]) * (CD_HostPhy[i] * hphy_tilde);
mean_virusinter[i] = sqrt(sigma * sim[9]) * (CD_VirusInter[i] * vinter_tilde);
mean_hostinter[i] = sqrt(sigma * sim[10]) * (CD_HostInter[i] * hinter_tilde);
mean_coevointer[i] = sqrt(sigma * sim[11]) * (CD_CoevoInter[i] * coevointer_tilde);
}
//precalculate expected probability for each data point
for (i in 1:datapoints) {
for (j in 1:matrices) {
//predprob saved for later posterior predictive simulations
predprob[j,i] = inv_logit(alpha + mean_virus[virus[i]] +
mean_host[host[i]] + mean_inter[combination[i]] +
mean_virusphy[j,virus[i]] + mean_hostphy[j,host[i]] +
mean_virusinter[j,combination[i]] + mean_hostinter[j,combination[i]] +
mean_coevointer[j,combination[i]] + mean_pool[pool[i]] +
mean_spatial[spatialcomposition[i]] + mean_residual[ID[i]]);
theta[j,i] = 1 - (1 - predprob[j,i])^samples[i];
}
}
//calculate model likelihood marginalising over the phylogenetic uncertainty
if (run_estimation==1) {
for (i in 1:matrices) {
//likelihood of at least 1 success in n trials with underlying
//probability of success theta is modified Bernoulli with success
//probability = 1-(1-p)^n -- quantity precalculated above in theta
//see Thompson 1971 Biometrics
lp[i] = lp[i] + bernoulli_lpmf(y | theta[i]);
}
}
}
model {
//logistic prior over global mean - uniform on the probability scale
target += logistic_lpdf(alpha | 0,1);
//normal(0,1) priors over variables used for for non-centreing
target += normal_lpdf(vphy_tilde | 0,1);
target += normal_lpdf(hphy_tilde | 0,1);
target += normal_lpdf(vinter_tilde | 0,1);
target += normal_lpdf(hinter_tilde | 0,1);
target += normal_lpdf(coevointer_tilde | 0,1);
target += normal_lpdf(virus_tilde | 0,1);
target += normal_lpdf(host_tilde | 0,1);
target += normal_lpdf(inter_tilde | 0,1);
target += normal_lpdf(pool_tilde | 0,1);
target += normal_lpdf(spatial_tilde | 0,1);
target += normal_lpdf(residual_tilde | 0,1);
//prior over global variance - prior mean 6.25 (implying prior mean standard
//deviation of 2.5) - approximately 99% of pdf below 25 (implying 99% of pdf
//below standard deviation of 5)
target += gamma_lpdf(sigma | 1.33,0.2128);
//dirichlet prior over simplex
target += dirichlet_lpdf(sim | rep_vector(1, 11));
//increment the log posterior by the summed likelihood
if (run_estimation==1) {
target += log_sum_exp(lp);
}
}
generated quantities {
//define variance parameters
real<lower=0> sigma_virus;
real<lower=0> sigma_virusphy;
real<lower=0> sigma_host;
real<lower=0> sigma_hostphy;
real<lower=0> sigma_virusinter;
real<lower=0> sigma_hostinter;
real<lower=0> sigma_inter;
real<lower=0> sigma_coevointer;
real<lower=0> sigma_pool;
real<lower=0> sigma_spatial;
real<lower=0> sigma_residual;
real<lower=0> denominator;
//define ICC parameters
real<lower=0, upper=1> ICC_virus;
real<lower=0, upper=1> ICC_virusphy;
real<lower=0, upper=1> ICC_host;
real<lower=0, upper=1> ICC_hostphy;
real<lower=0, upper=1> ICC_virusinter;
real<lower=0, upper=1> ICC_hostinter;
real<lower=0, upper=1> ICC_inter;
real<lower=0, upper=1> ICC_coevointer;
real<lower=0, upper=1> ICC_pool;
real<lower=0, upper=1> ICC_spatial;
real<lower=0, upper=1> ICC_residual;
real<lower=0, upper=1> ICC_nonphylogenetic;
real<lower=0, upper=1> ICC_phylogenetic;
//define parameters for data simulation
int<lower=0, upper=1> y_sim[datapoints];
//define parameters for model comparision
vector[matrices] uncertainty_log_lik;
vector[datapoints] log_lik;
real<lower=0> calc;
//define for the case where matrices == 1 and calc
//would otherwise be undefined
calc = 0;
//generate implied variance explained by each model component
sigma_virus = sigma * sim[1];
sigma_host = sigma * sim[2];
sigma_inter = sigma * sim[3];
sigma_pool = sigma * sim[4];
sigma_spatial = sigma * sim[5];
sigma_residual = sigma * sim[6];
sigma_virusphy = sigma * sim[7];
sigma_hostphy = sigma * sim[8];
sigma_virusinter = sigma * sim[9];
sigma_hostinter = sigma * sim[10];
sigma_coevointer = sigma * sim[11];
//denominator of ICC calculation equal to total variance plus
//variance of the logistic distribution
denominator = sigma + (pi()^2)/3;
//generate proportion of variance explained by each model component as well as
//all phylogenetically and all non-phylogenetically associated effects
ICC_virus = sigma_virus/denominator;
ICC_virusphy = sigma_virusphy/denominator;
ICC_host = sigma_host/denominator;
ICC_hostphy = sigma_hostphy/denominator;
ICC_virusinter = sigma_virusinter/denominator;
ICC_hostinter = sigma_hostinter/denominator;
ICC_inter = sigma_inter/denominator;
ICC_coevointer = sigma_coevointer/denominator;
ICC_pool = sigma_pool/denominator;
ICC_spatial = sigma_spatial/denominator;
ICC_residual = sigma_residual/denominator;
ICC_nonphylogenetic = (sigma_virus + sigma_host + sigma_inter +
sigma_spatial + sigma_pool)/denominator;
ICC_phylogenetic = (sigma_virusphy + sigma_hostphy + sigma_virusinter +
sigma_hostinter + sigma_coevointer)/denominator;
//generate pointwise log-likelihoods for PSIS-LOO model comparison
//see Vehtari, Gelman and Gabry 2017
//as total model likelihood equal
//sum from 1:K -log uniform density + log likelihood
//log likelihood of a single data point equals sum of log likelihoods
//of that point over each matrix combination
//see case with a single data point
for (i in 1:datapoints) {
for (j in 1:matrices) {
//likelihood of at least 1 success in n trials with underlying
//probability of success theta is modified Bernoulli with success
//probability = 1-(1-p)^n -- quantity precalculated above in theta
//see Thompson 1971 Biometrics
//calculate log likelihood for ith datapoint on each matrix
uncertainty_log_lik[j] = log(((1 - theta[j,i])^samples[i])^(1 - y[i]) *
(1 - (1 - theta[j,i])^samples[i])^y[i]);
}
//log_sum_exp to get log-likelihood
log_lik[i] = log_sum_exp(uncertainty_log_lik);
}
//generate data simulations, using infection probability weighted by posterior
//probability of each tree (VCV) in the phylogenetic uncertainty case
if (matrices == 1) {
for(i in 1:datapoints) {
y_sim[i] = bernoulli_rng(theta[1,i]);
}
} else {
calc = 0;
for(i in 1:datapoints) {
for (j in 1:matrices) {
calc = calc + predprob[j,i] * exp(lp[i] - log_sum_exp(lp));
}
y_sim[i] = bernoulli_rng(1 - (1 - calc)^samples[i]);
}
}
}