Help with Factor Analysis

Hi,

I’m using PyStan to fit a factor model. To avoid funnel-like behaviour, I have reparameterised my model according to the manual, and it runs pretty quickly. However, I am getting lots of warnings:

Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
Exception: std_normal_lpdf: Random variable[1] is -nan, but must not be nan!  (in 'infer.stan' at line 76)
If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,
but if this warning occurs often then your model may be either severely ill-conditioned or misspecified.

which refers to

z_y[i] ~ std_normal();

in

data {
  int <lower=1> P; // number of dimensions
  int <lower=1> N; // number of observations
  int <lower=1> D; // number of latent factors
  vector[P] y[N];
}
transformed data {
  vector[D] fac_mu = rep_vector(0.0, D);
  vector[D] fac_sd = rep_vector(1.0, D);
  int<lower=1> beta_l; // number of lower-triangular, non-zero loadings
  beta_l = D * (P - D) + D * (D - 1) / 2; // matches the entries filled by the loop below
}
parameters {
  // Parameters
  vector[D] z_fac[N]; // reparameterised latent factors
  vector[D] z_log_f_sd; // normalised log latent factor standard deviations
  // vector[D] log_f_sd; // log latent factor standard deviations
  // vector[D] log_f_sd_beta; // log latent factor standard deviation transform
  // vector[D] log_f_sd_bias; // log latent factor standard deviation bias
  vector[beta_l] L_lower; // lower diagonal loadings
  // vector[beta_l] L_lower_beta; // lower diagonal transform
  // vector[beta_l] L_lower_bias; // lower diagonal bias
  // real<lower=0> L_lower_sd; // lower diagonal standard deviations
  vector<lower=0>[D] L_diag; // positive diagonal loadings
  real log_y_sd; // y standard deviations
  // Hyperparameters
  vector<lower=0>[D] tau; // scale
  cholesky_factor_corr[D] corr_log_f_sd; //prior correlation
}
transformed parameters {
  matrix[P, D] beta;
  vector[D] log_f_sd;
  vector[D] fac[N]; // latent factors
  // beta[T] and Q[T]
  {
  int idx;
  idx = 1;
  for (j in 1:D) {
    beta[j, j] = L_diag[j]; // set positive diagonal loadings
    for (k in (j+1):D){
      beta[j, k] = 0; // set upper triangle values to 0
    }
    for (i in (j+1):P){
      beta[i, j] = L_lower[idx]; // set lower diagonal loadings
      idx = idx + 1;
      }
    }
  }
  log_f_sd = tau .* (corr_log_f_sd * z_log_f_sd);
  for (i in 1:N){
    fac[i] = exp(log_f_sd) .* z_fac[i];
  }
}
model {
  vector[P] z_y[N]; // reparameterised ys
  
  // priors
  tau ~ normal(0, 1);
  corr_log_f_sd ~ lkj_corr_cholesky(1);
  L_lower ~ normal(0, 1);
  L_diag ~ gamma(2, 0.5);
  // L_lower_sd ~ gamma(2, 0.1);
  log_y_sd ~ normal(0, 1);
  
  // log_f_sd ~ multi_normal_cholesky(log_f_sd, diag_pre_multiply(tau, corr_log_f_sd));
  // L_lower ~ normal(L_lower_bias + (L_lower_beta .* L_lower), L_lower_sd);
  // L_lower ~ normal(L_lower, L_lower_sd);
  z_log_f_sd ~ std_normal();
  
  for (i in 1:N){
    // fac[i] ~ multi_normal_cholesky(fac_mu, diag_matrix(exp(log_f_sd)));
    // z_fac[i] ~ multi_normal_cholesky(fac_mu, diag_matrix(fac_sd));
    z_fac[i] ~ std_normal();
    // y[i] ~ multi_normal_cholesky(beta * fac[i], diag_matrix(exp(rep_vector(log_y_sd, P))));
    z_y[i] = (y[i] - beta * fac[i]) / exp(log_y_sd);
    z_y[i] ~ std_normal();
    // target += z_y[i];
  }
}

My data-generating process and PyStan code are:

import numpy as np
import matplotlib.pyplot as plt
import pystan

import gzip
import os
import pdb
import pickle

seed = 1

np.random.seed(seed)
threads = 4
os.environ['STAN_NUM_THREADS'] = str(threads)

n_N = 100
n_F = 5
n_Y = 10
n_D = 5000
n_W = 1000
n_C = 4
AD = 0.95

Y = np.zeros((n_N, n_Y))
B = np.zeros((n_Y, n_F))
log_F_sigma = np.zeros((n_F))

# y hyperparameters
log_y_sigma = np.random.normal(0, 1) * np.ones(n_Y)

# f hyperparameters
log_f_sigma_ = np.random.normal(-1, 0.25, n_F)
# diag chol
chol_f_sigma = np.diag(np.abs(np.random.normal(0, 0.1, n_F)))
# full chol
# chol_f_sigma = np.random.normal(0, 0.25, n_F * n_F).reshape(n_F, n_F)
# row, col = np.diag_indices(n_F)
# chol_f_sigma[row, col] = np.abs(chol_f_sigma[row, col])

# B
base_order = 0.05
bias_order = 0.5
p_connect = 0.3
n_connect = int(p_connect * n_Y * n_F)
add = (2 * np.random.binomial(1, 0.5, n_connect) - 1) * (bias_order + np.abs(np.random.standard_normal(n_connect)))
B_ = base_order * np.random.standard_normal(n_Y * n_F)
B_[:n_connect] += add
B_ = np.random.permutation(B_).reshape(n_Y, n_F)
row, col = np.triu_indices(n_Y, 0, n_F)
B_[row, col] = 0
np.fill_diagonal(B_, 1)

# Initialise
# log_F_sigma[0] = np.random.multivariate_normal(log_f_sigma_, chol_f_sigma ** 2)
log_F_sigma = log_f_sigma_ + chol_f_sigma @ np.random.standard_normal(n_F) 
B = B_ * base_order * np.tril(np.random.standard_normal(n_Y * n_F).reshape(n_Y, n_F), k=-1)

for i in range(1, n_N):
    Y[i] = B @ np.random.multivariate_normal(np.zeros(n_F), np.diag(np.exp(2 * log_F_sigma))) + np.exp(log_y_sigma) * np.random.standard_normal(n_Y)

dat = {
    'P': n_Y,
    'D': n_F,
    'N': n_N,
    # 'fac_mu': np.zeros(n_F),
    'y': Y
    }

extra_compile_args = ['-pthread', '-DSTAN_THREADS']
model = pystan.StanModel(file='infer.stan', extra_compile_args=extra_compile_args)
fit = model.sampling(data=dat, iter=n_D, warmup=n_W, seed=seed, chains=n_C, verbose=True, control={'adapt_delta':AD}, n_jobs=threads)
with gzip.open('pystan_non_tv_fit_{}_{}_{}_{}_{}.gz'.format(n_D, n_W, seed, n_C, AD), 'wb') as f:
    pickle.dump({'model' : model, 'fit' : fit}, f)

Could someone tell me whether this is normal behaviour, or should I be concerned?

Cheers,
Sky

This is not correct in Stan. It would require a Jacobian adjustment, since it’s a non-linear transform. See the manual for details.
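If it helps, here is roughly what the missing term looks like for this particular transform (a sketch reusing the variable names from the model above, not tested code): because each element of y[i] is divided by exp(log_y_sd), a -P * log_y_sd term per observation is dropped relative to the direct normal likelihood.

model {
  vector[P] z_y[N];
  // ... priors as before ...
  for (i in 1:N) {
    z_y[i] = (y[i] - beta * fac[i]) / exp(log_y_sd);
    z_y[i] ~ std_normal();
    // scaling y by exp(log_y_sd) drops a -P * log_y_sd term from the log density;
    // adding it back makes this match the direct normal likelihood
    target += -P * log_y_sd;
  }
  // equivalently, and more simply:
  // y[i] ~ normal(beta * fac[i], exp(log_y_sd));
}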

Hi Andre,

I realised that after I posted, and I’ve since made changes to further simplify my model. Directly parameterising y seems to be quicker…

data {
  int <lower=1> P; // number of dimensions
  int <lower=1> N; // number of observations
  int <lower=1> D; // number of latent factors
  vector[P] y[N];
}
transformed data {
  // vector[D] fac_mu = rep_vector(0.0, D);
  vector[D] L_diag = rep_vector(1.0, D);
  int<lower=1> beta_l; // number of lower-triangular, non-zero loadings
  beta_l = D * (P - D) + D * (D - 1) / 2; // matches the entries filled by the loop below
}
parameters {
  // Parameters
  vector[D] z_fac[N]; // reparameterised latent factors
  // vector[D] z_log_f_sd; // reparameterised log latent factor standard deviations
  // vector[beta_l] z_L_lower_sd;
  vector[D] log_f_sd; // log latent factor standard deviations
  // vector[D] log_f_sd_raw; // raw latent factor standard deviation transform
  // vector[D] log_f_sd_bias; // log latent factor standard deviation bias
  vector[beta_l] L_lower; // lower diagonal loadings
  // vector[beta_l] L_lower_raw; // raw lower diagonal loadings
  // vector[beta_l] L_lower_beta; // lower diagonal transform
  // vector[beta_l] L_lower_bias; // lower diagonal bias
  // vector<lower=0>[D] L_diag; // positive diagonal loadings
  // real log_L_lower_sd; // lower diagonal standard deviation
  real log_y_sd; // y standard deviation
  // Hyperparameters
  // vector<lower=0>[D] tau; // scale
  // cholesky_factor_corr[D] corr_log_f_sd; //prior correlation
}
transformed parameters {
  matrix[P, D] beta;
  // vector[D] log_f_sd;
  // vector[beta_l] L_lower;
  vector[D] fac[N]; // latent factors
  
  // log_f_sd = diag_pre_multiply(tau, corr_log_f_sd) * z_log_f_sd;
  // L_lower = L_lower_raw + exp(log_L_lower_sd) * z_L_lower_sd;
  
  // beta[T] and Q[T]
  {
  int idx;
  idx = 1;
  for (j in 1:D) {
    beta[j, j] = L_diag[j]; // set positive diagonal loadings
    for (k in (j+1):D){
      beta[j, k] = 0; // set upper triangle values to 0
    }
    for (i in (j+1):P){
      beta[i, j] = L_lower[idx]; // set lower diagonal loadings
      idx = idx + 1;
      }
    }
  }
  for (i in 1:N){
    fac[i] = exp(log_f_sd) .* z_fac[i];
  }
}
model {
  // priors
  log_f_sd ~ normal(0, 1);
  // tau ~ normal(0, 1);
  // corr_log_f_sd ~ lkj_corr_cholesky(1);
  L_lower ~ normal(0, 1);
  // L_diag ~ gamma(2, 0.5);
  // L_lower_sd ~ gamma(2, 0.1);
  // log_L_lower_sd ~ normal(0, 1);
  log_y_sd ~ normal(0, 1);
  
  // log_f_sd ~ multi_normal_cholesky(log_f_sd, diag_pre_multiply(tau, corr_log_f_sd));
  // L_lower ~ normal(L_lower_bias + (L_lower_beta .* L_lower), L_lower_sd);
  // z_log_f_sd ~ std_normal();
  // z_L_lower_sd ~ std_normal();
  
  for (i in 1:N){
    // fac[i] ~ multi_normal_cholesky(fac_mu, diag_matrix(exp(log_f_sd)));
    // z_fac[i] ~ multi_normal_cholesky(fac_mu, diag_matrix(fac_sd));
    z_fac[i] ~ std_normal();
    y[i] ~ normal(beta * fac[i], exp(log_y_sd));
  }
}

and the DGP

import numpy as np
import matplotlib.pyplot as plt
import pystan

import gzip
import os
import pdb
import pickle

seed = 1

np.random.seed(seed)
threads = 4
os.environ['STAN_NUM_THREADS'] = str(threads)

n_N = 100
n_F = 5
n_Y = 10
n_D = 15000
n_W = 5000
n_C = 4
AD = 0.95

Y = np.zeros((n_N, n_Y))
B = np.zeros((n_Y, n_F))
log_F_sigma = np.zeros((n_F))

# y hyperparameters
log_y_sigma = np.random.normal(0, 1) * np.ones(n_Y)

# f hyperparameters
# log_f_sigma = np.random.normal(0, 0.25, n_F)
# diag chol
# chol_log_f_sigma = np.diag(np.abs(np.random.normal(0, 0.1, n_F)))
# full chol
chol_log_f_sigma = np.random.normal(0, 1, n_F * n_F).reshape(n_F, n_F)
row, col = np.diag_indices(n_F)
chol_log_f_sigma[row, col] = np.abs(chol_log_f_sigma[row, col])

# B
base_order = 1
# base_order = 0.05
# bias_order = 0.5
# p_connect = 0.3
# n_connect = np.int(0.3 * n_Y * n_F)
# add = (2 * np.random.binomial(1, 0.5, n_connect) - 1) * (bias_order + np.abs(np.random.standard_normal(n_connect)))
B_ = base_order * np.random.standard_normal(n_Y * n_F)
# B_[:n_connect] += add
B_ = np.random.permutation(B_).reshape(n_Y, n_F)
row, col = np.triu_indices(n_Y, 0, n_F)
B_[row, col] = 0
np.fill_diagonal(B_, 1)

# Initialise
# log_F_sigma[0] = np.random.multivariate_normal(log_f_sigma_, chol_log_f_sigma ** 2)
log_F_sigma = chol_log_f_sigma @ np.random.standard_normal(n_F) 
B = B_ # + base_order * np.tril(np.random.standard_normal(n_Y * n_F).reshape(n_Y, n_F), k=-1)

for i in range(1, n_N):
    Y[i] = B @ np.random.multivariate_normal(np.zeros(n_F), np.diag(np.exp(2 * log_F_sigma))) + np.exp(log_y_sigma) * np.random.standard_normal(n_Y)

dat = {
    'P': n_Y,
    'N': n_N,
    'D': n_F,
    # 'fac_mu': np.zeros(n_F),
    'y': Y
    }

extra_compile_args = ['-pthread', '-DSTAN_THREADS']
model = pystan.StanModel(file='infer.stan', extra_compile_args=extra_compile_args)
fit = model.sampling(data=dat, iter=n_D, warmup=n_W, seed=seed, chains=n_C, verbose=True, control={'adapt_delta':AD}, n_jobs=threads)
print('log_y_sigma', log_y_sigma)
# print('log_f_sigma', log_f_sigma)
print('log_F_sigma', log_F_sigma)
print(fit.stansummary(pars=['log_y_sd', 'log_f_sd'])) 

res = fit.extract(pars=['beta', 'fac'])
Y_hat = np.mean(np.einsum('ijk,ilk->ilj', res['beta'], res['fac']), axis=(0, 1))
print(np.mean(Y - Y_hat, axis=0))

with gzip.open('pystan_non_tv_fit_{}_{}_{}_{}_{}.gz'.format(n_D, n_W, seed, n_C, AD), 'wb') as f:
    pickle.dump({'model' : model, 'fit' : fit}, f)

I’m getting accurate estimates for log_y_sd and some components of log_f_sd. Is there any way to improve the quality of the estimates apart from increasing the number of samples?

It’s unusual to do it like this, but it’s not wrong. Usually we specify something like this:

real<lower=0> y_sd;
y_sd ~ student_t(4, 0, 10);
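i.e. a positively constrained scale with a heavy-tailed prior on it, rather than sampling on the log scale and exponentiating; the <lower=0> constraint makes this effectively a half-Student-t prior. In your model that would look roughly like this (just a sketch, keeping your variable names):

parameters {
  real<lower=0> y_sd; // instead of real log_y_sd
}
model {
  y_sd ~ student_t(4, 0, 10); // half-Student-t via the lower bound
  // ...
  for (i in 1:N) {
    y[i] ~ normal(beta * fac[i], y_sd);
  }
}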

I’m not familiar with factor analysis in Stan; do you have a reference paper? In particular, why did you use a loop to build the covariance matrix?

I’m actually working on a time-varying version (see the Time Varying Factor Analysis thread). The relevant paper is Lopes and Carvalho (2007).

I’m using a loop to build beta to ensure identifiability.
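To be concrete (assuming I’ve traced the index bookkeeping correctly), with P = 4 and D = 2 the loop fills beta as

beta = [ L_diag[1]    0
         L_lower[1]   L_diag[2]
         L_lower[2]   L_lower[4]
         L_lower[3]   L_lower[5] ]

i.e. zeros above the diagonal, a positive diagonal, and free loadings below, which pins down the rotation and sign of the factors.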
What prior should I then be using for y_sd?