If I do the imputation, is there still an advantage to vectorizing it by using e.g. `segment` on the `y_obs` and `y_miss` in @aornugent's script? My gut says it might be better than doing everything element-wise, but Stan's loop vs. vectorization performance has confounded me before.

At the moment the imputation strategy incurs a significant computational cost: sampling goes from 3 seconds at N=35 to 138 seconds at N=35 with N_{corrupt}=31. Simply doubling the dataset to N=70 only costs 7 seconds, so the extra cost is definitely due to the imputation.

edit: using `segment` takes the cost from the absurd 100+ seconds to a much more manageable 30 seconds.

Here is my current script:

```
functions {
  real multi_normal_lowrank_cholesky_lpdf(vector x, vector mu, vector sigma, matrix B, matrix L) {
    int lowrank = cols(B);
    vector[lowrank] v = mdivide_left_tri_low(L, B' * ((x - mu) ./ square(sigma)));
    return normal_lpdf(x | mu, sigma) + dot_self(v) / 2. - sum(log(diagonal(L)));
  }
  real multi_normal_lowrank_lpdf(vector x, vector mu, vector sigma, matrix B) {
    int lowrank = cols(B);
    matrix[rows(B),lowrank] Bscaled = diag_pre_multiply(inv(sigma), B);
    matrix[lowrank,lowrank] L = cholesky_decompose(diag_matrix(rep_vector(1,lowrank)) + crossprod(Bscaled));
    return multi_normal_lowrank_cholesky_lpdf(x | mu, sigma, B, L);
  }
}
data {
  int include_prior;
  int include_likelihood;
  int N; // observed rows
  int K; // number of factors
  int P; // number of features
  vector[P] features[N];
  int N_corrupt;
  vector[P] corrupt_features[N_corrupt];
  int<lower=0,upper=1> iscorrupt[N_corrupt,P];
}
transformed data {
  int P_corrupt[N_corrupt];
  int total_corrupt = 0;
  for (n in 1:N_corrupt) {
    P_corrupt[n] = 0;
    for (p in 1:P) {
      P_corrupt[n] += iscorrupt[n,p];
    }
    total_corrupt += P_corrupt[n];
  }
}
parameters {
  cholesky_factor_cov[K,K] B_tri;
  matrix[P-K,K] B_tilde;
  vector<lower=0>[P] sigma;
  vector[total_corrupt] missing_features;
}
transformed parameters {
  matrix[P,K] B = append_row(B_tri, B_tilde);
  matrix[P,K] Bscaled = diag_pre_multiply(inv(sigma), B);
  matrix[K,K] L = cholesky_decompose(diag_matrix(rep_vector(1,K)) + crossprod(Bscaled));
  vector[P] imputed_features[N_corrupt];
  {
    // pos must persist across rows: missing_features is one flat vector over
    // all corrupt entries, so resetting it inside the row loop would reuse
    // the same leading entries for every row.
    int pos = 1;
    for (n in 1:N_corrupt) {
      for (p in 1:P) {
        if (iscorrupt[n,p]) {
          imputed_features[n,p] = missing_features[pos];
          pos += 1;
        } else {
          imputed_features[n,p] = corrupt_features[n,p];
        }
      }
    }
  }
}
model {
  if (include_prior) {
    // Leung and Drton prior on B_tri
    for (l in 1:K) {
      for (k in l:K) {
        if (k == l) {
          B_tri[k,l] ~ normal(0., 1.) T[0,];
          target += (K - k) * log(B_tri[k,l]);
        } else {
          B_tri[k,l] ~ normal(0, 1);
        }
      }
    }
    to_vector(B_tilde) ~ normal(0, 1);
    sigma ~ normal(0, 1);
  }
  if (include_likelihood) {
    vector[N] llk; // local accumulator; cannot be a parameter, since parameters are read-only
    for (n in 1:N) {
      llk[n] = multi_normal_lowrank_cholesky_lpdf(features[n] | rep_vector(0,P), sigma, B, L);
    }
    target += sum(llk);
    for (n in 1:N_corrupt) {
      target += multi_normal_lowrank_cholesky_lpdf(imputed_features[n] | rep_vector(0,P), sigma, B, L);
    }
  }
}
```
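The two `multi_normal_lowrank_*` functions evaluate the density of N(mu, diag(sigma^2) + B Bᵀ) via the Woodbury identity and the matrix determinant lemma, so each evaluation costs O(PK²) rather than O(P³). As a sanity check (not part of the model), here is a NumPy/SciPy sketch of the same identity compared against a dense multivariate normal; the function name and test dimensions are made up for illustration:

```python
import numpy as np
from scipy.stats import multivariate_normal

def lowrank_lpdf(x, mu, sigma, B):
    """Log density of N(mu, diag(sigma^2) + B @ B.T), mirroring the
    Woodbury / determinant-lemma structure of the Stan functions."""
    K = B.shape[1]
    Bscaled = B / sigma[:, None]  # diag(1/sigma) @ B
    L = np.linalg.cholesky(np.eye(K) + Bscaled.T @ Bscaled)
    # Diagonal (independent-normal) part of the log density
    z = (x - mu) / sigma
    base = np.sum(-0.5 * z**2 - np.log(sigma) - 0.5 * np.log(2 * np.pi))
    # Low-rank correction: v = L^{-1} B^T D^{-1} (x - mu)
    v = np.linalg.solve(L, B.T @ ((x - mu) / sigma**2))
    return base + 0.5 * (v @ v) - np.sum(np.log(np.diag(L)))

rng = np.random.default_rng(0)
P, K = 6, 2
B = rng.normal(size=(P, K))
sigma = np.abs(rng.normal(size=P)) + 0.5
mu = np.zeros(P)
x = rng.normal(size=P)

dense = multivariate_normal(mu, np.diag(sigma**2) + B @ B.T).logpdf(x)
print(np.allclose(lowrank_lpdf(x, mu, sigma, B), dense))  # → True
```

Matching the dense log density this way is a quick way to convince yourself the signs on `dot_self(v)/2.` and `sum(log(diagonal(L)))` are right.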

It is basically a marginalized factor model. The weird prior on `B_tri` is designed so that `B` has the distribution of the triangular factor of a Gaussian matrix, so just ignore it.

edit: here is the `segment`-based imputation strategy:

```
vector[P] imputed_features[N_corrupt];
if (N_corrupt > 0) {
  int pos = 1;
  for (n in 1:N_corrupt) {
    int observed[P - P_corrupt[n]];
    int corrupted[P_corrupt[n]];
    int pos_corrupt = 1;
    int pos_obs = 1;
    for (p in 1:P) {
      if (iscorrupt[n,p]) {
        corrupted[pos_corrupt] = p;
        pos_corrupt += 1;
      } else {
        observed[pos_obs] = p;
        pos_obs += 1;
      }
    }
    // copy the uncorrupted entries through, then fill the corrupted slots
    // with the next contiguous slice of missing_features
    imputed_features[n, observed] = corrupt_features[n, observed];
    imputed_features[n, corrupted] = segment(missing_features, pos, P_corrupt[n]);
    pos += P_corrupt[n];
  }
}
```
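For anyone following along, the scatter logic above is easy to mock up outside Stan. This NumPy sketch (with made-up toy data; `-9.0` just marks a corrupt slot) shows the intent: copy observed entries through, and fill each row's corrupted slots with the next contiguous slice of the flat `missing_features` vector:

```python
import numpy as np

corrupt_features = np.array([[1.0, -9.0, 3.0],
                             [-9.0, 5.0, -9.0]])  # -9.0 marks corrupt slots
iscorrupt = np.array([[0, 1, 0],
                      [1, 0, 1]], dtype=bool)
# flat vector over all corrupt entries, row-major, like the Stan parameter
missing_features = np.array([2.0, 4.0, 6.0])

imputed = np.empty_like(corrupt_features)
pos = 0
for n in range(corrupt_features.shape[0]):
    corrupted = np.flatnonzero(iscorrupt[n])   # column indices of corrupt slots
    observed = np.flatnonzero(~iscorrupt[n])   # column indices of observed slots
    imputed[n, observed] = corrupt_features[n, observed]
    imputed[n, corrupted] = missing_features[pos:pos + corrupted.size]
    pos += corrupted.size

print(imputed)  # rows fully reconstructed: [[1, 2, 3], [4, 5, 6]]
```

Note that `pos` advances across rows, which is exactly why the running index has to be declared outside the row loop in the Stan version too.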