I spoke to my collaborators and they said a simpler model would suffice for the moment, and I’ve gotten it to work with an updated DGP and Stan model. Apart from a few divergences here and there, I get decent results. Could I get you to have a look at the model to see whether the divergences can be brought down to zero and/or the runtime optimised further?
We’ve simplified the DGP so that the distributions of f and \epsilon_y no longer vary over time, while still allowing \beta to change across time. In equations, we have f_t \sim N(0, I) and \epsilon_{y,t} \sim N(0, \Sigma_y), where \Sigma_y is a diagonal matrix, and \beta_t = \beta_l + \beta_s \circ (\beta_{t-1} - \beta_l) + \epsilon_{\beta, t}, where \beta_l is the location parameter, \beta_s the (elementwise) scale parameter, and \epsilon_{\beta, t} \sim N(0, \Sigma_\beta) with \Sigma_\beta also diagonal.
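Written out together with the observation equation (which matches the simulation code below; \circ denotes the elementwise product):

x_t = \beta_t f_t + \epsilon_{y,t},    f_t \sim N(0, I),    \epsilon_{y,t} \sim N(0, \Sigma_y)
\beta_t = \beta_l + \beta_s \circ (\beta_{t-1} - \beta_l) + \epsilon_{\beta, t},    \epsilon_{\beta, t} \sim N(0, \Sigma_\beta)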
The weirdest thing is that generating \beta_s \sim Unif(0, 1) was the key to making inference work. I’ve checked that the results are relatively stable across priors. Some of the Rhats for \Sigma_y (x_sd in the model) are larger than 1.01, while for the rest they are between 0.99 and 1.01.
The DGP and Stan call:
library(posterior)
library(tidyverse)
options(tibble.print_max = Inf)
# library(reticulate)
# use_python("/usr/bin/python")
# np <- import("numpy")
library(cmdstanr)
library(rstan)
# options(mc.cores = parallel::detectCores())
seed <- 1
set.seed(seed)
# Simulate data according to Lopes and Carvalho (2007) without switching
n_T <- 100   # number of time steps
n_F <- 3     # number of latent factors
n_X <- 10    # number of observed series
n_D <- 1000  # number of post-warmup draws
n_W <- 1000  # number of warmup iterations
R <- 0.5     # range for the random inits
AD <- 0.999  # adapt_delta
TD <- 15     # max treedepth
# SS <- 0.1  # stepsize
discount <- 1 # multiplier on the beta innovation scale
# Create data structures
X <- array(0, c(n_T, n_X))
B <- array(0, c(n_T, n_X, n_F))
# x hyperparameters
# x_sigma <- abs(rep(rnorm(1, sd=0.25), n_X))
x_sigma <- abs(rnorm(n_X, sd=0.25))
# x_sigma <- rep(1, n_X)
# f hyperparameters
# F_sigma <- abs(rnorm(n_F, sd=0.5))
# B hyperparameters
# Want B to be 'almost sparse': mostly entries near 0 (sd 0.05), with probability p_connect of an extra signed shift centred at bias_order
base_order <- 0.1
bias_order <- 0.5
p_connect <- 0.3
B_loc <- matrix(0, n_X, n_F)
diag(B_loc) <- 1
n_l_t <- sum(lower.tri(B_loc))
B_loc[lower.tri(B_loc)] <- rnorm(n_l_t, 0, .05) +
  rbinom(n_l_t, 1, p_connect) * (2 * rbinom(n_l_t, 1, .5) - 1) *
  rnorm(n_l_t, bias_order, 1)
# (bias_order + abs(rnorm(n_l_t)))
B_scale <- matrix(0, n_X, n_F)
B_scale[lower.tri(B_scale)] <- runif(n_l_t, 0, 1) # abs(rnorm(n_l_t, mean=0.5, sd=0.5)) # rbeta(n_l_t, 2, 2) # rep(0.5, n_l_t)
B_sigma <- matrix(0, n_X, n_F)
B_sigma[lower.tri(B_sigma)] <- abs(rnorm(n_l_t, mean=0, sd=0.2)) # rbeta(n_l_t, 2, 2) # rep(0.5, n_l_t)
# Initialise
# B[1, , ] <- B_loc + base_order * lower.tri(B[1, , ]) * matrix(rnorm(n_X * n_F), c(n_X, n_F))
B[1, , ] <- B_loc + B_sigma * matrix(rnorm(n_X * n_F), n_X, n_F)
# X[1, ] <- (B[1, , ] %*% (F_sigma * rnorm(n_F))) + x_sigma * rnorm(n_X)
X[1, ] <- (B[1, , ] %*% rnorm(n_F)) + x_sigma * rnorm(n_X)
# Generate data
for (i in 2:n_T){
  # B[i, , ] <- B_loc + (B_scale * (B[i-1, , ] - B_loc)) + discount * base_order * lower.tri(B[i, , ]) * matrix(rnorm(n_X * n_F), c(n_X, n_F))
  B[i, , ] <- B_loc + (B_scale * (B[i-1, , ] - B_loc)) + discount * B_sigma * matrix(rnorm(n_X * n_F), n_X, n_F)
  X[i, ] <- (B[i, , ] %*% rnorm(n_F)) + x_sigma * rnorm(n_X)
}
# # Standardise X
# X_mu <- colMeans(X, dims = 1)
# X_sd <- apply(X, 2, sd)
# X_s <- scale(X)
matplot(X, type='l')
matplot(matrix(B, n_T, n_X * n_F), type='l')
model <- cmdstan_model('infer_tv.stan', quiet=FALSE, force_recompile=TRUE)
fit <- model$sample(data=list(P=n_X, F=n_F, TS=n_T, disc=discount, x=X),
                    num_samples=n_D,
                    num_warmup=n_W,
                    refresh=100,
                    init=R,
                    seed=seed,
                    adapt_delta=AD,
                    max_depth=TD,
                    # stepsize=SS,
                    num_chains=1,
                    num_cores=4)
stanfit <- read_stan_csv(fit$output_files())
# print(x_sigma)
# print(chol_F_sigma)
# print(log_F_sigma_loc)
# print(log_F_sigma_scale)
params <- extract(stanfit)
x_sd_hat <- colMeans(params$x_sd, dims=1)
beta_lower_sd_hat <- colMeans(params$beta_lower_sd, dims=1)
beta_lower_loc_hat <- colMeans(params$beta_lower_loc, dims=1)
beta_lower_scale_hat <- colMeans(params$beta_lower_scale, dims=1)
# F_sigma_hat <- colMeans(params$f_sd, dims=1)
pdf("comp.pdf")
par(pty="s")
plot(c(x_sd_hat) ~ c(x_sigma), cex=2, pch=10, col="dark gray", xlab="Simulated X SD", ylab="Estimated X SD")
fit_scale <- lm(c(x_sd_hat) ~ c(x_sigma)) # Fit a linear model
abline(fit_scale)
abline(0,1, lty = 2) # 1:1 line
plot(c(beta_lower_sd_hat) ~ c(B_sigma[lower.tri(B_sigma)]), cex=2, pch=10, col="dark gray", xlab="Simulated Beta SD", ylab="Estimated Beta SD")
fit_scale <- lm(c(beta_lower_sd_hat) ~ c(B_sigma[lower.tri(B_sigma)])) # Fit a linear model
abline(fit_scale)
abline(0,1, lty = 2) # 1:1 line
plot(c(beta_lower_loc_hat) ~ c(B_loc[lower.tri(B_loc)]), cex=2, pch=10, col="dark gray", xlab="Simulated Beta Loc", ylab="Estimated Beta Loc")
fit_scale <- lm(c(beta_lower_loc_hat) ~ c(B_loc[lower.tri(B_loc)])) # Fit a linear model
abline(fit_scale)
abline(0,1, lty = 2) # 1:1 line
plot(c(beta_lower_scale_hat) ~ c(B_scale[lower.tri(B_scale)]), cex=2, pch=10, col="dark gray", xlab="Simulated Beta Scale", ylab="Estimated Beta Scale")
fit_scale <- lm(c(beta_lower_scale_hat) ~ c(B_scale[lower.tri(B_scale)])) # Fit a linear model
abline(fit_scale)
abline(0,1, lty = 2) # 1:1 line
# plot(c(F_sigma_hat) ~ c(F_sigma), cex=2, pch=10, col="dark gray", xlab="Simulated F SD", ylab="Estimated F_SD")
# fit_scale <- lm(c(F_sigma_hat) ~ c(F_sigma)) # Fit a linear model
# abline(fit_scale)
# abline(0,1, lty = 2) # 1:1 line
dev.off()
print(fit$draws() %>%
subset_draws(variable = c('lp__', 'x_sd', 'beta_lower_scale'), regex = TRUE) %>%
summarise_draws())
# saveRDS(fit, "draws.rds")
#
# # Save draws as Python pickle
# py_save_object(r_to_py(params), 'draws.pkl')
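For reference, this is roughly how I count the divergences and flag the large Rhats mentioned above (a minimal check using the rstan/posterior objects already created in the script):
# Post-warmup divergences per chain
sapply(get_sampler_params(stanfit, inc_warmup = FALSE),
       function(x) sum(x[, "divergent__"]))
# Parameters whose Rhat exceeds 1.01
fit$draws() %>%
  summarise_draws() %>%
  filter(rhat > 1.01)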
The Stan model is largely the same:
data {
  int <lower=1> P; // number of dimensions
  int <lower=1> F; // number of latent factors
  int <lower=1> TS; // number of time steps
  real disc; // discount
  vector[P] x[TS];
}
transformed data {
  vector[F] beta_diag = rep_vector(1.0, F);
  int beta_l = F * (P - F) + F * (F - 1) / 2; // number of lower-triangular, non-zero loadings
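  // e.g. for the simulated data above (P = 10, F = 3): beta_l = 3 * 7 + 3 = 24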
}
parameters {
  // nuisance parameters
  vector[F] z_fac_sd[TS]; // reparameterised latent factors
  vector[beta_l] z_beta_lower_sd[TS]; // reparameterised lower diagonal loading standard deviation
  // parameters
  vector[beta_l] beta_lower_loc; // lower diagonal loadings location
  vector<lower=0.0, upper=1.0>[beta_l] beta_lower_scale; // lower diagonal loadings scale
  // real<lower=0.0, upper=1.0> raw_beta_lower_scale; // lower diagonal loadings scale
  // vector<lower=0>[F] beta_diag; // positive diagonal loadings
  // priors
  // real<lower=0> raw_beta_lower_sd;
  vector<lower=0>[beta_l] beta_lower_sd;
  // real<lower=0> x_sd; // x standard deviation
  vector<lower=0>[P] x_sd;
}
transformed parameters {
  vector[beta_l] beta_lower[TS];
  matrix[P, F] beta[TS];
  // vector[beta_l] beta_lower_scale = rep_vector(raw_beta_lower_scale, beta_l);
  // vector[beta_l] beta_lower_sd = rep_vector(raw_beta_lower_sd, beta_l);
  for (t in 1:TS){
    beta_lower[t] = beta_lower_loc + beta_lower_sd .* z_beta_lower_sd[t];
    // beta_lower[t] = beta_lower_loc + beta_lower_sd * z_beta_lower_sd[t];
    if (t > 1){
      beta_lower[t] += beta_lower_scale .* (beta_lower[t-1] - beta_lower_loc);
    }
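    // combined with the assignment above, for t > 1 this gives
    // beta_lower[t] = beta_lower_loc + beta_lower_scale .* (beta_lower[t-1] - beta_lower_loc) + beta_lower_sd .* z_beta_lower_sd[t],
    // i.e. a non-centred version of the beta recursion in the DGP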
    {
      int idx;
      idx = 1;
      for (j in 1:F) {
        beta[t][j, j] = beta_diag[j]; // set positive diagonal loadings
        for (k in (j+1):F){
          beta[t][j, k] = 0; // set upper triangle values to 0
        }
        for (i in (j+1):P){
          beta[t][i, j] = beta_lower[t][idx]; // set lower diagonal loadings
          idx = idx + 1;
        }
      }
    }
  }
}
model {
  vector[P] mu;
  // priors
  beta_lower_loc ~ std_normal();
  // beta_lower_scale ~ std_normal();
  beta_lower_scale ~ cauchy(0, 0.5);
  // beta_lower_scale ~ lognormal(-2, 1);
  // beta_lower_scale ~ beta(2, 5);
  // beta_lower_scale ~ gamma(2, 1./10.);
  // beta_lower_scale ~ beta(2., 2.);
  // beta_lower_sd ~ std_normal();
  beta_lower_sd ~ normal(0, 0.1);
  // x_sd ~ gamma (2, 1./10.);
  // x_sd ~ std_normal();
  x_sd ~ normal(0, 0.1);
  for (t in 1:TS){
    z_fac_sd[t] ~ std_normal();
    z_beta_lower_sd[t] ~ std_normal();
    // x[t] ~ normal(beta[t] * (f_sd .* z_fac_sd[t]), x_sd);
    // x[t] ~ normal(beta[t] * z_fac_sd[t], x_sd);
    mu = beta[t] * z_fac_sd[t];
    for (p in 1:P){
      x[t][p] ~ normal(mu[p], x_sd[p]);
    }
  }
}
Let me know if you have any thoughts.