I have tried implementing a so-called independent factor analysis model which is a standard factor model (K factors) with an independent univariate mixture (C components) on the weights of each factor. I have not registered anything catastrophic yet, but it is quite slow.
Both mixtures and factor models are … problematic in the Bayesian setting, so right now I just want to perform a sanity check by doing cross-validation on some heldout data and the MAP solution.
The speed issues are likely just a consequence of applying a multi-normal likelihood on vectors with lengths in the 200s, but is there anything obvious I could do to speed this up further? Perhaps some pre-computation or vectorization to simplify the multi-normal likelihood? The test block is also made costlier by having to marginalize over a product of mixtures, generating a combinatorial number of components; should I maybe switch to computing that via sampling instead?
obs: there are some includes in the definition, but it is mostly just a lot of array manipulation necessary to form the features
matrix, which is a combination of parameters (missing values) and data.
functions {
int[] num2baseB(int num, int bits, int B) {
/* convert a number `num` to a base `B` number using `bits` digits. */
int base[bits];
int factor = num;
for (brev in 1:bits) {
int b = bits + 1 - brev;
base[b] = factor % B;
factor = factor / B;
}
return base;
}
}
data {
#include data_template.stan
int include_prior;
int include_likelihood;
int K; // number of factors
int C; // number of clusters per factor
}
transformed data {
int P = P_cog + P_eeg + P_mri + P_dti;
int combinations = 1;
int holdoutsize = 0;
for (k in 1:K) {
combinations *= C; //count codes of length K with C symbols
}
for (n in 1:N) {
holdoutsize += heldout[n]; //increment if true
}
}
parameters {
#include parameter_template.stan
matrix[K,C] mu;
matrix<lower=0>[K,C] sigma;
simplex[C] weight[K];
matrix[N,K] W;
matrix[K, P] B;
vector<lower=0>[P] noise_std;
}
transformed parameters {
matrix[N,P] features;
real testllk = 0;
{
#include transformed_parameters_declaration_template.stan
#include transformed_parameters_template.stan
features = append_col(cog,append_col(eeg,append_col(mri,dti))); //concatenation of feature blocks
}
//calculate marginal probability of heldout rows, integrating over W
if (holdoutsize>0) {
vector[combinations] lps[holdoutsize];
for (index in 1:combinations) {
int cluster_indices[K] = num2baseB(index - 1, K, C);
row_vector[K] mu_index;
row_vector[K] sigma_index;
real log_weight_index = 0;
vector[P] mu_combination;
matrix[P,P] cov_combination;
matrix[P,P] cov_chol_combination;
int test_index = 1;
for (k in 1:K) {
mu_index[k] = mu[k, cluster_indices[k] + 1];
sigma_index[k] = sigma[k, cluster_indices[k] + 1];
log_weight_index += log(weight[k, cluster_indices[k] + 1]);
}
mu_combination = to_vector(mu_index * B);
cov_combination = crossprod(diag_pre_multiply(sigma_index, B)) + diag_matrix(noise_std);
cov_chol_combination = cholesky_decompose(cov_combination);
for (n in 1:N) {
if (heldout[n]) {
lps[test_index, index] = log_weight_index + multi_normal_cholesky_lpdf(to_vector(features[n]) | mu_combination, cov_chol_combination);
test_index += 1;
}
}
}
//add all mixture likelihoods together
for (ntest in 1:holdoutsize) {
testllk += log_sum_exp(lps[ntest]);
}
}
}
model {
if (include_prior) {
noise_std ~ normal(0,1);
to_vector(B) ~ normal(0,1);
to_vector(mu) ~ normal(0,1);
to_vector(sigma) ~ normal(0,1);
for (k in 1:K) {
weight[k] ~ dirichlet(rep_vector(1,C));
}
}
//generate each column of W[,k] from univariate mixture with means mu[k,] and std-devs sigma[k,]
for (n in 1:N) {
for (k in 1:K) {
vector[C] lps;
for (c in 1:C) {
lps[c] = log(weight[k,c]) + normal_lpdf(W[n,k] | mu[k,c], sigma[k,c]);
}
target += log_sum_exp(lps);
}
}
if (include_likelihood) {
for (n in 1:N) {
if (!heldout[n]) {
target += normal_lpdf(features[n] | W[n] * B, to_row_vector(noise_std));
}
}
}
}
edit: to be slightly more specific, the model is X_{features}={W}{B}+E where each element of W is drawn from a mixture specific to each column, and B is either without prior or with a simple Gaussian prior. E is Gaussian noise.
edit2: okay, one obvious thing to do is of course to not do testing in the transformed parameters block… Is there any way to run something only after optimizing? Or do I have to use different code for testing in the sampling and the MAP regime?