Hello,
I have a spatial population genetic clustering method in which samples are assigned to, or admixed between, a userspecified number of discrete groups within which relatedness decays with distance. Each group is modeled as a spatial Gaussian process, so the full model is a mixture of spatial Gaussian processes. The data can consist of between tens of thousands to millions of independent observations, so, for efficiency, I summarize with the observed covariance between samples across observations, and model that covariance as a draw from a Wishart with a parametric scale matrix (detailed in the model block below). Implemented in STAN, the model works great for a smallish number of samples (fewer than ~100), but itâ€™s quite slow for larger datasets.
I have two questions:

Are there any lowhanging fruit speedups that you can see, either in reworking the syntax or reparameterizations? I know the STAN crew is generally antiWishart as a prior, but, because there are so many observations (potentially into the millions), Iâ€™m think that the Wishart is more efficient than the multivariate Gaussian for calculating the likelihood.

Are there ways to implement rankone updates to the inverse of the Wishartâ€™s parametric scale matrix in STAN? For some parameters, if I were to update only that parameter, I could use the ShermanMorrisonWoodbury trick to update a single row/column of the inverse matrix. I know the point of STANâ€™s sampler is to move in many directions at once in parameter space, but Iâ€™m curious if thereâ€™s any way to work in this kind of efficiency (or if itâ€™s already happening under the hood).
Thanks, I appreciate the feedback!
functions {
matrix spCov(int N, real aD, matrix D, real mu){
matrix[N,N] cov;
for(i in 1:N){
for(j in i:N){
cov[i,j] = exp( (aD* D[i,j])) + mu;
cov[j,i] = cov[i,j];
}
}
return cov;
}
matrix admixed_covariance(int N, int K, vector alphaD, matrix geoDist, matrix w_mat, vector nugget, vector mu, real gamma) {
matrix[N,N] parCov;
matrix[N,N] Nug_mat;
parCov = rep_matrix(0,N,N);
Nug_mat = diag_matrix(nugget);
for(k in 1:K){
parCov = parCov + tcrossprod(to_matrix(w_mat[,k])) .* spCov(N,alphaD[k],,geoDist,mu[k]);
}
parCov = gamma + parCov + Nug_mat;
return parCov;
}
matrix make_w_matrix(int N, int K, vector[] w){
matrix[N,K] w_mat;
for(i in 1:N){
w_mat[i] = to_row_vector(w[i]);
}
return w_mat;
}
}
data {
int<lower=1> K; // number of layers
int<lower=2> N; // number of samples
int<lower=N+1> L; // number of loci
matrix[N,N] obsCov; // observed sample covariance
matrix[N, N] geoDist; // matrix of pairwise geographic distance
real varMeanFreqs; // variance in mean allele frequencies across loci
real<lower=0,upper=1> temp; // temperature parameter for estimating marginal likelihood
}
parameters {
vector<lower=0>[K] alphaD; // effect of geographic distance in the parametric covariance in layer k
positive_ordered[K] mu; // shared drift effect in layer k
vector<lower=0>[N] nugget; // samplespecific variance (allele sampling error + samplespecific drift)
simplex[K] w[N]; // every sample (N in total) has a K simplex (i.e. K layers)
real<lower=0> gamma; // baseline covariance between all layers
}
transformed parameters {
matrix[N,N] parCov; // this specifies the parametric, admixed covariance matrix
vector[K] dirConPar; // concentration parameters of dirichlet prior on w
matrix[N,K] w_mat; // converting w simplex to matrix for indexing
dirConPar = rep_vector(0.1,K);
w_mat = make_w_matrix(N,K,w);
parCov = admixed_covariance(N, K, alphaD, geoDist, w_mat, nugget, mu, gamma);
}
model {
alphaD ~ normal(0,1); // prior on alphaD
nugget ~ normal(0,1); // prior on nugget
mu ~ normal(0,1); // prior on nugget
gamma ~ normal(varMeanFreqs,0.5); // prior on nugget
for(i in 1:N) w[i] ~ dirichlet(dirConPar); // prior on admixture proportions
(L*obsCov) ~ wishart(L,parCov); // likelihood function
}