Hi,
So I have been working on estimating a linear time-varying parameter system (the details aren't necessary for the question that follows). The test case I am exploring is a relatively simple hidden Markov model. The really odd thing is that when I include two additional parameters that are completely unused by the model, sampling happens about 5 times faster (or more). I discovered this by accident: while updating how I define the noise, I accidentally left these parameters in.
The model without the extra parameters:
functions {
  real matrix_normal_lpdf(matrix y, matrix mu, matrix LSigma) {
    int pdims[2] = dims(y);
    matrix[pdims[1], pdims[2]] error_sc = mdivide_left_tri_low(LSigma, y - mu);
    real p1 = -0.5 * pdims[2] * (pdims[1] * log(2 * pi()) + 2 * sum(log(diagonal(LSigma))));
    real p2 = -0.5 * sum(error_sc .* error_sc);
    return p1 + p2;
  }
}
data {
  int<lower=0> N;
  row_vector[N] u;
  row_vector[N] y;
  real x0;
  real<lower=1e-8> P0_diag;
  matrix[2, N] p;
  matrix[2, N] q;
}
parameters {
  row_vector[N+1] x;
  row_vector[2] A;
  row_vector[2] B;
  row_vector[2] C;
  row_vector[2] D;
  cholesky_factor_cov[2] LPI;
}
transformed parameters {
  row_vector[N] mu;
  row_vector[N] mu_y;
  matrix[2, N] mu_c;
  // TODO: Actually implement a kronecker product in stan
  mu = (A * p) .* x[1:N] + (B * q) .* u;
  mu_y = (C * p) .* x[1:N] + (D * q) .* u;
  // combine
  mu_c[1, :] = mu;
  mu_c[2, :] = mu_y;
}
model {
  // initial state prior
  x[1] ~ normal(x0, P0_diag);
  // combined distribution
  target += matrix_normal_lpdf(append_row(x[2:N+1], y) | mu_c, LPI);
}
generated quantities {
  cov_matrix[2] PI;
  row_vector[N] yhat;
  PI = LPI * LPI';
  yhat = mu_y + normal_rng(0, sqrt(PI[2,2]));
}
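As a quick sanity check on the custom matrix normal density (just a verification snippet, not part of the model): since the columns are independent given the shared covariance LSigma * LSigma', the lpdf should match the sum of column-wise multivariate normal log densities.

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(0)
d, n = 2, 5
L = np.tril(rng.standard_normal((d, d)))
np.fill_diagonal(L, np.abs(np.diag(L)) + 0.5)  # make a valid lower Cholesky factor
y = rng.standard_normal((d, n))
mu = rng.standard_normal((d, n))

# reference: sum of column-wise multivariate normal log densities
Sigma = L @ L.T
ref = sum(multivariate_normal(mu[:, k], Sigma).logpdf(y[:, k]) for k in range(n))

# direct translation of matrix_normal_lpdf (mdivide_left_tri_low is a triangular solve)
err = np.linalg.solve(L, y - mu)
p1 = -0.5 * n * (d * np.log(2 * np.pi) + 2 * np.sum(np.log(np.diag(L))))
p2 = -0.5 * np.sum(err * err)
print(np.isclose(ref, p1 + p2))  # True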
The model with the two additional unused parameters:
functions {
  real matrix_normal_lpdf(matrix y, matrix mu, matrix LSigma) {
    int pdims[2] = dims(y);
    matrix[pdims[1], pdims[2]] error_sc = mdivide_left_tri_low(LSigma, y - mu);
    real p1 = -0.5 * pdims[2] * (pdims[1] * log(2 * pi()) + 2 * sum(log(diagonal(LSigma))));
    real p2 = -0.5 * sum(error_sc .* error_sc);
    return p1 + p2;
  }
}
data {
  int<lower=0> N;
  row_vector[N] u;
  row_vector[N] y;
  real x0;
  real<lower=1e-8> P0_diag;
  matrix[2, N] p;
  matrix[2, N] q;
}
parameters {
  row_vector[N+1] x;
  row_vector[2] A;
  row_vector[2] B;
  row_vector[2] C;
  row_vector[2] D;
  real<lower=1e-8> sigma_v;
  real<lower=1e-8> sigma_w;
  cholesky_factor_cov[2] LPI;
}
transformed parameters {
  row_vector[N] mu;
  row_vector[N] mu_y;
  matrix[2, N] mu_c;
  // TODO: Actually implement a kronecker product in stan
  mu = (A * p) .* x[1:N] + (B * q) .* u;
  mu_y = (C * p) .* x[1:N] + (D * q) .* u;
  // combine
  mu_c[1, :] = mu;
  mu_c[2, :] = mu_y;
}
model {
  // initial state prior
  x[1] ~ normal(x0, P0_diag);
  // combined distribution
  target += matrix_normal_lpdf(append_row(x[2:N+1], y) | mu_c, LPI);
}
generated quantities {
  cov_matrix[2] PI;
  row_vector[N] yhat;
  PI = LPI * LPI';
  yhat = mu_y + normal_rng(0, sqrt(PI[2,2]));
}
I have been using the same data and initializing all chains to the true values to keep results as consistent as possible. I have also experimented with placing various priors on the covariance matrix (including transforming it and using an LKJ prior), with the same result in every case. In all cases the resulting estimates look good.
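To keep the comparison fair I run both models on the same data and inits. A minimal sketch of the timing harness (this assumes cmdstanpy, and the two .stan file names are just placeholders for wherever the models above are saved):

import time
from cmdstanpy import CmdStanModel

# placeholder file names for the two models shown above
for stan_file in ['models/lpv_v01.stan', 'models/lpv_v01_extra.stan']:
    model = CmdStanModel(stan_file=stan_file)
    t0 = time.perf_counter()
    fit = model.sample(data='models/lpv_v01.data.json',
                       inits='models/lpv_init.json',
                       chains=4, seed=1234)
    print(stan_file, f'{time.perf_counter() - t0:.1f} s')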
Since I'm not sure how to upload my data (I save it as JSON in a .data.json file), I will include a short Python script below that generates the data and the initial values.
import numpy as np
import scipy.linalg as linalg
import json

np.random.seed(0)  # fix the seed so repeated runs generate identical data

N = 100  # number of data samples
nx = 1   # state dim
nu = 1   # input dim
ny = 1   # output dim
nv = 2   # dimension of the time-varying scheduling signal

# true system
A1 = 0.9
B1 = 1
C1 = 0.5
D1 = 0
A2 = 0.6
B2 = 1
C2 = 0.7
D2 = 0
A = np.hstack([A1, A2])
B = np.hstack([B1, B2])
C = np.hstack([C1, C2])
D = np.hstack([D1, D2])

Q = np.eye(nx) / 10
R = np.eye(ny) / 10

# exogenous signals
u = np.random.randn(nu, N)
p = (np.sin(2 * np.pi * np.linspace(0, 3, N)) + 1) / 2
p = np.vstack([p, 1 - p])
q = p

# process and measurement noise
w = linalg.sqrtm(Q) @ np.random.randn(nx, N)
v = linalg.sqrtm(R) @ np.random.randn(ny, N)

# simulate
x = np.zeros((nx, N + 1))
y = np.zeros((ny, N))
for k in range(N):
    x[:, k + 1] = A @ np.kron(p[:, k], x[:, k]) + B @ np.kron(q[:, k], u[:, k]) + w[:, k]
    y[:, k] = C @ np.kron(p[:, k], x[:, k]) + D @ np.kron(q[:, k], u[:, k]) + v[:, k]

PI = np.array([[Q[0, 0], 0], [0, R[0, 0]]])

# initial values for all chains (set to the true system values)
stan_init = {
    'x': x.flatten().tolist(),
    'A': A.tolist(),
    'B': B.tolist(),
    'C': C.tolist(),
    'D': D.tolist(),
    # Stan's cholesky_factor_cov is lower triangular; scipy defaults to upper
    'LPI': linalg.cholesky(PI, lower=True).tolist()
}
with open('models/lpv_init.json', 'w') as outfile:
    json.dump(stan_init, outfile)

# save data for inference in stan
stan_data = {
    'N': int(N),
    'u': u.flatten().tolist(),
    'y': y.flatten().tolist(),
    'x0': 0.0,
    'P0_diag': 0.01,
    'p': p.tolist(),
    'q': q.tolist(),
}
with open('models/lpv_v01.data.json', 'w') as outfile:
    json.dump(stan_data, outfile)
This behavior is pretty confusing to me; my only guess is that the extra parameters somehow change the geometry of the posterior manifold that the sampler is exploring.
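If that guess is right, it should show up in the sampler diagnostics: the slower model should adapt to a smaller step size and/or take more leapfrog steps per iteration (more work per draw, not more time per gradient evaluation). A sketch of how I'd check this, assuming the cmdstanpy fit objects from the timing snippet above:

import numpy as np

# 'fit' is the CmdStanMCMC object for whichever of the two models is being inspected
mv = fit.method_variables()  # per-draw diagnostics: stepsize__, treedepth__, n_leapfrog__, ...
print('adapted step size per chain:', fit.step_size)
print('mean leapfrog steps per draw:', np.mean(mv['n_leapfrog__']))
print('mean tree depth per draw:', np.mean(mv['treedepth__']))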