Time-Varying Factor Analysis

Hi all,

I’m trying to reimplement Lopes and Carvalho (2007). I’ve cross-referenced Rick Farouni’s very helpful post and ldeschamps’s thread on the forum, and have arrived at a simplified working implementation. However, I’m getting various Stan warnings: divergent transitions, maximum treedepth exceeded, “the largest R-hat is NA”, and so on. Could someone enlighten me as to what I’m doing wrong in terms of code, efficiency, or priors? It’s the first time I’m using Stan.

data {
  int<lower=1> P; // number of dimensions
  int<lower=1> TS; // number of time steps
  int<lower=1> D; // number of latent factors
  vector[P] y[TS];
}
transformed data {
  int<lower=1> beta_l; // number of lower-triangular, non-zero loadings
  beta_l = D * (P - D) + D * (D - 1) / 2; // one loading per entry (i, j) with j <= D and i > j
}
parameters {
  // Parameters
  vector<lower=0>[D] f_sd[TS]; // latent factor standard deviations
  vector[beta_l] L_lower[TS]; // lower diagonal loadings
  vector<lower=0>[beta_l] L_lower_sd; // lower diagonal standard deviations
  vector<lower=0>[D] L_diag[TS]; // positive diagonal loadings
  vector[P] y_mu; // y means
  vector<lower=0>[P] y_sd; // y standard deviations
  // Hyperparameters
  vector<lower=0>[D] tau; // scale
  cholesky_factor_corr[D] sigma_f_sd; //prior correlation
}
transformed parameters {
  cholesky_factor_cov[P, D] beta[TS];
  cov_matrix[P] Q[TS];
  // beta[T] and Q[T]
  for (t in 1:TS){ 
    int idx;
    idx = 1;
    for (j in 1:D) {
      beta[t][j, j] = L_diag[t][j]; // set positive diagonal loadings
      for (k in (j+1):D){
        beta[t][j, k] = 0; // set upper triangle values to 0
      }
      for (i in (j+1):P){
        beta[t][i, j] = L_lower[t][idx]; // set lower diagonal loadings
        idx = idx + 1;
      }
    }
    Q[t] = beta[t] * diag_matrix(f_sd[t]) * beta[t]' + diag_matrix(y_sd); // specify covariance of y
  }
}
model {
  // priors
  f_sd[1] ~ gamma(2, 0.1);  
  L_lower[1] ~ normal(0, 1);
  L_diag[1] ~ gamma(1, 1);
  tau ~ cauchy(0, 2.5);
  sigma_f_sd ~ lkj_corr_cholesky(2);
  L_lower_sd ~ gamma(2, 0.1);
  
  for (t in 2:TS){
    f_sd[t] ~ multi_normal_cholesky(f_sd[t-1], quad_form_diag(sigma_f_sd, tau));
    L_lower[t] ~ normal(L_lower[t-1], L_lower_sd);
    y[t] ~ multi_normal(y_mu, Q[t]);
  }
}

Cheers

Whether R-hat = NA is a problem or not depends on which parameters are causing it. The upper triangles of those Cholesky factors are by definition zero, so their R-hat is undefined, and it’s safe to ignore them.
Look at the summary and see whether any non-fixed parameters have high R-hat.

That should be

f_sd[t] ~ multi_normal_cholesky(f_sd[t-1],
                                diag_pre_multiply(tau, sigma_f_sd));

multi_normal_cholesky expects the Cholesky factor of the covariance matrix; since sigma_f_sd is the Cholesky factor of a correlation matrix, diag_pre_multiply(tau, sigma_f_sd) is exactly that factor, whereas quad_form_diag(sigma_f_sd, tau) is not a Cholesky factor at all. Separately, the covariance matrix diagonal is variances, not standard deviations, so the standard deviations need squaring:

Q[t] = tcrossprod(diag_post_multiply(beta[t], f_sd[t]))
       + diag_matrix(square(y_sd));

The Cauchy distribution has a long tail, so this prior expects that one tau will be much larger than the others.
A better prior would be tau ~ normal(0, max_tau); where max_tau is the largest value tau could reasonably be.
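
For example, max_tau could be passed in as data. A minimal sketch of just this piece (max_tau is only a placeholder name):

data {
  int<lower=1> D;
  real<lower=0> max_tau; // largest value tau could reasonably be
}
parameters {
  vector<lower=0>[D] tau; // scale
}
model {
  tau ~ normal(0, max_tau); // half-normal, since tau is declared <lower=0>
}

Because the truncation point is a constant, no extra adjustment is needed; the <lower=0> declaration makes this a proper half-normal prior.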

Thanks for all the advice. Currently, my code looks like:

data {
  int<lower=1> P; // number of dimensions
  int<lower=1> TS; // number of time steps
  int<lower=1> D; // number of latent factors
  vector[P] y[TS];
}
transformed data {
  int<lower=1> beta_l; // number of lower-triangular, non-zero loadings
  beta_l = D * (P - D) + D * (D - 1) / 2; // one loading per entry (i, j) with j <= D and i > j
}
parameters {
  // Parameters
  vector[D] log_f_sd[TS]; // log latent factor standard deviations
  vector[D] log_f_sd_beta; // log latent factor standard deviation transform
  vector[D] log_f_sd_bias; // log latent factor standard deviation bias
  vector[beta_l] L_lower[TS]; // lower diagonal loadings
  vector[beta_l] L_lower_beta; // lower diagonal transform
  vector[beta_l] L_lower_bias; // lower diagonal bias
  vector<lower=0>[beta_l] L_lower_sd; // lower diagonal standard deviations
  vector<lower=0>[D] L_diag[TS]; // positive diagonal loadings
  vector[P] y_mu; // y means
  vector<lower=0>[P] y_sd; // y standard deviations
  // Hyperparameters
  vector<lower=0>[D] tau; // scale
  cholesky_factor_corr[D] sigma_log_f_sd; //prior correlation
}
transformed parameters {
  cholesky_factor_cov[P, D] beta[TS];
  cov_matrix[P] Q[TS];
  // beta[T] and Q[T]
  for (t in 1:TS){ 
    int idx;
    idx = 1;
    for (j in 1:D) {
      beta[t][j, j] = L_diag[t][j]; // set positive diagonal loadings
      for (k in (j+1):D){
        beta[t][j, k] = 0; // set upper triangle values to 0
      }
      for (i in (j+1):P){
        beta[t][i, j] = L_lower[t][idx]; // set lower diagonal loadings
        idx = idx + 1;
      }
    }
    Q[t] = tcrossprod(diag_post_multiply(beta[t], exp(log_f_sd[t]))) + diag_matrix(square(y_sd)); // specify covariance of y
  }
}
model {
  // priors
  log_f_sd[1] ~ normal(0, 1);  
  L_lower[1] ~ normal(0, 1);
  L_diag[1] ~ gamma(1, 1);
  tau ~ normal(0, 1);
  sigma_log_f_sd ~ lkj_corr_cholesky(1);
  L_lower_sd ~ gamma(2, 0.1);
  
  for (t in 2:TS){
    // log_f_sd[t] ~ multi_normal_cholesky(log_f_sd[t-1], diag_pre_multiply(tau, sigma_f_sd));
    log_f_sd[t] ~ multi_normal_cholesky(log_f_sd_bias + (log_f_sd_beta .* log_f_sd[t-1]), diag_pre_multiply(tau, sigma_log_f_sd));
    // L_lower[t] ~ normal(L_lower[t-1], L_lower_sd);
    L_lower[t] ~ normal(L_lower_bias + (L_lower_beta .* L_lower[t-1]), L_lower_sd);
    y[t] ~ multi_normal(y_mu, Q[t]);
  }
}

The DGP is

# Simulate data according to Lopes and Carvalho (2007) without switching
n_T <- 100
n_X <- 5
n_Y <- 10
n_D <- 1000

# Create data structures
Y <- array(0, c(n_T, n_Y))
B <- array(0, c(n_T, n_Y, n_X))
log_X_sigma <- array(0, c(n_T, n_X))

# y hyperparameters 
# y_sigma
y_sigma <- matrix(rep(abs(rnorm(1)), n_Y), n_Y, 1)

# x hyperparameters
# Covariance
log_x_sigma <- rnorm(n_X)
chol_x_sigma <- matrix(rnorm(n_X * n_X, sd = 0.25), n_X, n_X)
diag(chol_x_sigma) <- abs(diag(chol_x_sigma))

# B hyperparameters
# Want B to be 'almost sparse' where B contains mostly values close to 0.1
base <- 0.1
p_connect <- 0.4
n_connect <- p_connect * n_Y * n_X 
add <- (2 * rbinom(n_connect, 1, 0.5) - 1) * (0.5 + abs(rnorm(n_connect)))
B_ <- base * rnorm(n_Y * n_X)
B_[1:n_connect] <- B_[1:n_connect] + add
B_ <- matrix(sample(B_), n_Y, n_X)
# To ensure identifiability, set upper triangle to be 0 and diagonal to be 1
B_ [upper.tri(B_)] <- 0
diag(B_) <- 1

# Initialise
log_X_sigma[1, ] <- log_x_sigma + chol_x_sigma %*% rnorm(n_X) # match the update step below
B[1, , ] <- B_ + base * lower.tri(B_) * matrix(rnorm(n_Y * n_X), n_Y, n_X)
# y = B f + e, with factor sds exp(log_X_sigma) and observation sds y_sigma
Y[1, ] <- B[1, , ] %*% (exp(log_X_sigma[1, ]) * rnorm(n_X)) + y_sigma * rnorm(n_Y)

# Generate data
for (i in 2:n_T){
  log_X_sigma[i, ] <- log_X_sigma[i-1, ] + chol_x_sigma %*% rnorm(n_X)
  B[i, , ] <- B[i-1, , ] + base * lower.tri(B[i-1, , ]) * matrix(rnorm(n_Y * n_X), n_Y, n_X)
  Y[i, ] <- B[i, , ] %*% (exp(log_X_sigma[i, ]) * rnorm(n_X)) + y_sigma * rnorm(n_Y)
}

I’m getting a ton of divergences, and I’ve referred to Michael’s example to try to eliminate them, but it doesn’t seem to be working. I’m also trying to sort out warnings about the Bayesian fraction of missing information and low bulk and tail ESS.
Also, is there a way to speed it up?
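
For reference, here is roughly the non-centered parameterization of the two state equations I’ve been attempting, following Michael’s example. The names log_f_sd_z and L_lower_z are mine, for the auxiliary standardized innovations; every block and line not shown stays exactly as above:

parameters {
  // ... everything else as above, with log_f_sd and L_lower removed ...
  vector[D] log_f_sd_z[TS]; // standardized innovations for log_f_sd
  vector[beta_l] L_lower_z[TS]; // standardized innovations for L_lower
}
transformed parameters {
  vector[D] log_f_sd[TS];
  vector[beta_l] L_lower[TS];
  // rebuild the states from the innovations; this has to come before
  // beta[t] and Q[t], which are built from L_lower and log_f_sd
  log_f_sd[1] = log_f_sd_z[1];
  L_lower[1] = L_lower_z[1];
  for (t in 2:TS){
    log_f_sd[t] = log_f_sd_bias + (log_f_sd_beta .* log_f_sd[t-1])
                  + diag_pre_multiply(tau, sigma_log_f_sd) * log_f_sd_z[t];
    L_lower[t] = L_lower_bias + (L_lower_beta .* L_lower[t-1])
                 + L_lower_sd .* L_lower_z[t];
  }
  // ... beta[t] and Q[t] exactly as above ...
}
model {
  // standard normal innovations imply the same random walks as before;
  // the explicit priors on log_f_sd[1] and L_lower[1] are now implied
  // by the t = 1 innovations
  for (t in 1:TS){
    log_f_sd_z[t] ~ normal(0, 1);
    L_lower_z[t] ~ normal(0, 1);
  }
  // ... remaining priors and the y likelihood unchanged ...
}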