# Time Varying Factor Analysis

Hi all,

I’m trying to reimplement Lopes and Carvalho 2007. I’ve cross-referenced Rick Farouni’s very helpful post and ldeschamps’s thread on the forum, and have arrived at a simplified working implementation. However, I am getting various Stan warnings about divergent transitions, maximum treedepth, largest R-hat is NA etc. Could someone enlighten me about what I’m doing wrong in terms of code/efficiency/priors as it’s the first time I’m using Stan?

`````` data {
int <lower=1> P; // number of dimensions
int <lower=1> TS; // number of time steps
int <lower=1> D; // number of latent factors
vector[P] y[TS];
}
transformed data {
int<lower=1> beta_l; // number of lower-triangular, non-zero loadings
beta_l = P * (P - D) + D * (D - 1) / 2;
}
parameters {
// Parameters
vector<lower=0>[D] f_sd[TS]; // latent factor standard deviations
vector<lower=0>[beta_l] L_lower_sd; // lower diagonal standard deviations
vector[P] y_mu; // y means
vector<lower=0>[P] y_sd; // y standard deviations
// Hyperparameters
vector<lower=0>[D] tau; // scale
cholesky_factor_corr[D] sigma_f_sd; //prior correlation
}
transformed parameters {
cholesky_factor_cov[P, D] beta [TS];
cov_matrix[P] Q[TS];
// beta[T] and Q[T]
for (t in 1:TS){
int idx;
idx = 1;
for (j in 1:D) {
beta[t][j, j] = L_diag[t][j]; // set positive diagonal loadings
for (k in (j+1):D){
beta[t][j, k] = 0; // set upper triangle values to 0
}
for (i in (j+1):P){
beta[t][i, j] = L_lower[t][idx]; // set lower diagonal loadings
idx = idx + 1;
}
}
Q[t] = beta[t] * diag_matrix(f_sd[t]) * beta[t]' + diag_matrix(y_sd); // specify covariance of y
}
}
model {
// priors
f_sd[1] ~ gamma(2, 0.1);
L_lower[1] ~ normal(0, 1);
L_diag[1] ~ gamma(1, 1);
tau ~ cauchy(0, 2.5);
sigma_f_sd ~ lkj_corr_cholesky(2);
L_lower_sd ~ gamma(2, 0.1);

for (t in 2:TS){
f_sd[t] ~ multi_normal_cholesky(f_sd[t-1], quad_form_diag(sigma_f_sd, tau));
L_lower[t] ~ normal(L_lower[t-1], L_lower_sd);
y[t] ~ multi_normal(y_mu, Q[t]);
}
}
``````

Cheers

Whether R-hat=NA is a problem or not depends on which parameters are causing it. The upper halves of those Cholesky factors are by definition zero so their R-hat is undefined. It’s safe to ignore them.
Look at the summary and see if any non-fixed parameters with high R-hat.

That should be

``````f_sd[t] ~ multi_normal_cholesky(f_sd[t-1],
diag_pre_multiply(tau, sigma_f_sd));
``````

The covariance matrix diagonal is variances, not standard deviations.

``````Q[t] = tcrossprod(diag_post_multiply(beta[t], f_sd[t]))
+ diag_matrix(square(y_sd));
``````

Cauchy distribution has a long tail. The prior expects that one `tau` will be much larger than the others.
A better prior would be `tau ~ normal(0, max_tau);` where `max_tau` is the largest value `tau` could reasonably be.

Thanks for all the advice. Currently, my code looks like:

``````data {
int <lower=1> P; // number of dimensions
int <lower=1> TS; // number of time steps
int <lower=1> D; // number of latent factors
vector[P] y[TS];
}
transformed data {
int<lower=1> beta_l; // number of lower-triangular, non-zero loadings
beta_l = P * (P - D) + D * (D - 1) / 2;
}
parameters {
// Parameters
vector[D] log_f_sd[TS]; // log latent factor standard deviations
vector[D] log_f_sd_beta; // log latent factor standard deviation transform
vector[D] log_f_sd_bias; // log latent factor standard deviation bias
vector[beta_l] L_lower_beta; // lower diagonal transform
vector[beta_l] L_lower_bias; // lower diagonal bias
vector<lower=0>[beta_l] L_lower_sd; // lower diagonal standard deviations
vector[P] y_mu; // y means
vector<lower=0>[P] y_sd; // y standard deviations
// Hyperparameters
vector<lower=0>[D] tau; // scale
cholesky_factor_corr[D] sigma_log_f_sd; //prior correlation
}
transformed parameters {
cholesky_factor_cov[P, D] beta [TS];
cov_matrix[P] Q[TS];
// beta[T] and Q[T]
for (t in 1:TS){
int idx;
idx = 1;
for (j in 1:D) {
beta[t][j, j] = L_diag[t][j]; // set positive diagonal loadings
for (k in (j+1):D){
beta[t][j, k] = 0; // set upper triangle values to 0
}
for (i in (j+1):P){
beta[t][i, j] = L_lower[t][idx]; // set lower diagonal loadings
idx = idx + 1;
}
}
Q[t] = tcrossprod(diag_post_multiply(beta[t], exp(log_f_sd[t]))) + diag_matrix(square(y_sd)); // specify covariance of y
}
}
model {
// priors
log_f_sd[1] ~ normal(0, 1);
L_lower[1] ~ normal(0, 1);
L_diag[1] ~ gamma(1, 1);
tau ~ normal(0, 1);
sigma_log_f_sd ~ lkj_corr_cholesky(1);
L_lower_sd ~ gamma(2, 0.1);

for (t in 2:TS){
// log_f_sd[t] ~ multi_normal_cholesky(log_f_sd[t-1], diag_pre_multiply(tau, sigma_f_sd));
log_f_sd[t] ~ multi_normal_cholesky(log_f_sd_bias + (log_f_sd_beta .* log_f_sd[t-1]), diag_pre_multiply(tau, sigma_log_f_sd));
// L_lower[t] ~ normal(L_lower[t-1], L_lower_sd);
L_lower[t] ~ normal(L_lower_bias + (L_lower_beta .* L_lower[t-1]), L_lower_sd);
y[t] ~ multi_normal(y_mu, Q[t]);
}
}
``````

The DGP is

``````# Simulate data according to Lopes and Carvalho (2007) without switching
n_T <- 100
n_X <- 5
n_Y <- 10
n_D <- 1000

# Create data strucutures
Y <- array(0, c(n_T, n_Y))
B <- array(0, c(n_T, n_Y, n_X))
log_X_sigma <- array(0, c(n_T, n_X))

# y hyperparameters
# y_sigma
y_sigma <-matrix(rep(abs(rnorm(1)), n_Y), c(n_Y, 1))

# x hyperparameters
# Covariance
log_x_sigma <- rnorm(n_X)
chol_x_sigma <- matrix(rnorm(n_X * n_X, sd=0.25), c(n_X, n_X))
diag(chol_x_sigma) <- abs(diag(chol_x_sigma))

# B hyperparameters
# Want B to be 'almost sparse' where B contains mostly values close to 0.1
base <- 0.1
p_connect <- 0.4
n_connect <- p_connect * n_Y * n_X
add <- (2 * rbinom(n_connect, 1, 0.5) - 1) * (0.5 + abs(rnorm(n_connect)))
B_ <- base * rnorm(n_Y * n_X)
B_[1:n_connect] <- B_[1:n_connect] + add
B_ <- matrix(sample(B_), c(n_Y, n_X))
# To ensure identifiability, set upper triangle to be 0 and diagonal to be 1
B_ [upper.tri(B_)] <- 0
diag(B_) <- 1

# Initialise
log_X_sigma[1, ] <- log_x_sigma + exp(chol_x_sigma) %*% rnorm(n_X)
B[1, , ] <- B_ + base * lower.tri(matrix(rnorm(n_Y * n_X), c(n_Y, n_X)))
Y[1,  ] <- (B[1, , ] %*% exp(log_X_sigma[c(1), ]) + y_sigma) * rnorm(n_Y)

# Generate data
for (i in 2:n_T){
log_X_sigma[i, ] <- log_X_sigma[i-1, ] + chol_x_sigma %*% rnorm(n_X)
B[i, , ] <- B[i-1, , ] + base * lower.tri(B[i, , ]) * matrix(rnorm(n_Y * n_X), c(n_Y, n_X))
Y[i, ] <- (B[i, , ] %*% exp(log_X_sigma[c(i), ]) + y_sigma) * rnorm(n_Y)
}
``````

I’m getting a ton of divergences and I’ve referred to Michael’s example to try and eliminate them but it doesn’t seem to be working. I’m also trying to sort out issues with Bayesian fraction of missing information and both bulk and tail ESSs.
Also, is there a way to speed it up?