Hello Everyone,
I am trying to implement a model that combines a Gaussian process (GP) with a hidden Markov model (HMM). Specifically, it assumes that the observations are generated as follows:
y_t = f(x_t) + \alpha_{s_t} + \epsilon_t
where \alpha_{s_t} is a state-dependent intercept emitted by a Gaussian HMM and f is a Gaussian process with a squared-exponential (SE) kernel.
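Written out in full, using the names from the Stan code below, the generative process I am assuming is:
s_1 ~ Categorical(pi1), s_t | s_{t-1} ~ Categorical(A[s_{t-1}])
alphaH_t | s_t ~ Normal(muH[s_t], sigmaH[s_t])
f ~ GP(0, k_SE(x, x'))
y_t ~ Normal(f(x_t) + alphaH_t, sigma)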
I am trying to implement it in Stan. When I check the model against simulated data, the estimated HMM means all come out shifted by the same random amount. For example, in the following sample output each estimated mean is shifted by approximately -0.74:
Warning messages:
1: There were 1000 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
True Transition Matrix
[,1] [,2] [,3]
[1,] 0.64 0.05 0.30
[2,] 0.10 0.06 0.83
[3,] 0.14 0.20 0.64
Estimated Transition Matrix
[,1] [,2] [,3]
[1,] 0.67 0.06 0.26
[2,] 0.11 0.22 0.65
[3,] 0.24 0.15 0.59
True State Means
[1] 1.30 1.94 2.05
Estimated State Means
[1] 0.56 1.16 1.28
The size of the shift differs randomly every time I run the code under different test conditions, and the mixing in the trace plots is very poor. It looks like an identifiability issue, and I am not sure how to resolve it.
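My suspicion is that the likelihood is invariant under a constant shift between the GP and the state means: for any constant c, the substitution
f'(x) = f(x) - c,   muH'[k] = muH[k] + c   (k = 1, ..., S)
leaves f(x_t) + alphaH_t unchanged, so only the sum of the GP level and the state means is identified. The GP prior centres f at zero, but a zero-mean GP can still absorb a roughly constant offset, so this constraint seems to be weak in practice.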
I have imposed an ordering constraint on the HMM means (sketched below) and tried non-centered parameterizations for both the GP and the HMM parameters. I have even tried pinning the initial value of the GP latent function to zero. Nothing seems to work.
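For reference, this is roughly what I mean by the ordering constraint, a minimal sketch using Stan's built-in ordered type rather than the sort_asc construction in the full model below (the priors here are placeholders, not the ones I actually use):
parameters {
ordered[S] muH; // state means, constrained to be increasing
vector<lower=0>[S] sigmaH; // state emission scales
// ... remaining GP and HMM parameters as in the full model
}
model {
muH ~ normal(0, 1); // placeholder prior for this sketch
sigmaH ~ gamma(2, 2);
// ... forward algorithm and observation model as in the full model
}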
data {
int<lower=1> N;
real x[N];
vector[N] y;
int<lower=1> S;
}
transformed data {
real delta = 1e-9;
}
parameters {
vector[N] eta; // standard normal draws for the non-centered GP
real rho_tilde; // non-centered (log-scale) GP length-scale
real<lower=0> rho_m;
real<lower=0> rho_s;
real alpha_tilde; // non-centered (log-scale) GP marginal sd
real<lower=0> alpha_m;
real<lower=0> alpha_s;
real sigma_tilde; // non-centered (log-scale) observation noise
real<lower=0> sigma_m;
real<lower=0> sigma_s;
vector[S] muH_tilde; // non-centered HMM state means
vector<lower=0>[S] muH_s;
vector<lower=0>[S] muH_m;
simplex[S] A[S]; // transition matrix rows
simplex[S] pi1; // initial state distribution
real<lower=0> sigmaH[S]; // HMM emission sds
real alphaH[N]; // state-dependent intercepts emitted by the HMM
vector[N-1] free_pnum; // free values for the f[1] = 0 pinning attempt
}
transformed parameters {
vector[N] f;
vector[1] z;
matrix[N, N] L_K;
real<lower=0> rho;
real<lower=0> alpha;
real<lower=0> sigma;
vector[S] muH;
vector[S] muTemp;
rho = exp(log(rho_m) + rho_s * rho_tilde);
alpha = exp(log(alpha_m) + alpha_s * alpha_tilde);
sigma = exp(log(sigma_m) + sigma_s * sigma_tilde);
for (i in 1:S)
muTemp[i] = exp(log(muH_m[i]) + muH_s[i] * muH_tilde[i]);
muH = sort_asc(muTemp); // ordering constraint on the state means
z[1] = 0;
f = append_row(z, free_pnum); // attempt to pin f[1] = 0; note this is overwritten by the Cholesky draw below
{
matrix[N, N] K = cov_exp_quad(x, alpha, rho);
for (n in 1:N)
K[n, n] = K[n, n] + delta;
L_K = cholesky_decompose(K);
f = L_K * eta;
}
}
model {
vector[N] theta;
vector[S] logalpha[N];
target += inv_gamma_lpdf(rho_m | 2, 0.5);
target += normal_lpdf(alpha_m | 0, 2.0);
target += normal_lpdf(sigma_m | 0, 1.0);
target += normal_lpdf(muH_m | 0, 1.0);
target += normal_lpdf(rho_s | 0, 0.5);
target += normal_lpdf(alpha_s | 0, 0.5);
target += normal_lpdf(sigma_s | 0, 0.5);
target += normal_lpdf(muH_s | 0, 0.5);
target += normal_lpdf(rho_tilde | 0, 1.0);
target += normal_lpdf(alpha_tilde | 0, 1.0);
target += normal_lpdf(sigma_tilde | 0, 1.0);
target += normal_lpdf(muH_tilde | 0, 1.0);
target += normal_lpdf(eta | 0, 1.0);
target += gamma_lpdf(sigmaH | 2, 2.0);
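// A and pi1 are left with their implicit uniform priors over the simplex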
// Forward algorithm
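// logalpha[t, j] = log p(alphaH[1:t], state at t = j | HMM parameters)
// recursion: logalpha[t, j] = log_sum_exp over i of (logalpha[t-1, i] + log A[i, j]) + log N(alphaH[t] | muH[j], sigmaH[j])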
{
real accumulator[S];
for (j in 1:S)
logalpha[1, j] = log(pi1[j]) + normal_lpdf(alphaH[1] | muH[j], sigmaH[j]); // initialise per state; the vectorised lpdf would sum over all states
for (t in 2:N) {
for (j in 1:S) {
real emission = normal_lpdf(alphaH[t] | muH[j], sigmaH[j]); // emission term does not depend on i
for (i in 1:S) {
accumulator[i] = logalpha[t-1, i] + log(A[i, j]) + emission;
}
logalpha[t, j] = log_sum_exp(accumulator);
}
}
}
target += log_sum_exp(logalpha[N]);
for (n in 1:N)
theta[n] = f[n] + alphaH[n];
target += normal_lpdf(y | theta, sigma);
}
Data generation code in R:
library(rstan)
library(ggplot2)
library(shinystan)
set.seed(123456789)
inv_logit <- function(x) 1 / (1 + exp(-x)) # logistic (inverse logit); not used below
runif_simplex <- function(T) {
x <- -log(runif(T))
x / sum(x)
}
hmm_generate <- function(K, T) {
# 1. Parameters
pi1 <- runif_simplex(K)
A <- t(replicate(K, runif_simplex(K)))
mu <- sort(rnorm(K, 1.0 * 1:K, 1))
sigma <- 0.01
# 2. Hidden path
z <- vector("numeric", T)
z[1] <- sample(1:K, size = 1, prob = pi1)
for (t in 2:T)
z[t] <- sample(1:K, size = 1, prob = A[z[t - 1], ])
# 3. Observations
y <- vector("numeric", T)
for (t in 1:T)
y[t] <- rnorm(1, mu[z[t]], sigma)
list(y = y, z = z,
theta = list(pi1 = pi1, A = A, mu = mu, sigma = sigma))
}
K <- 3
N <- 100
dataset <- hmm_generate(K, N)
dataset
f <- function(x) x^2 + x # true latent function for the GP part
x <- rnorm(N, mean = 0, sd = 0.5)
yhmm <- rnorm(length(x), mean = f(x) + dataset$y, sd = 0.1) # observation = latent function + HMM intercept + noise
plot(x,yhmm)
stan_data = list(N=N, x=x, y=yhmm, S=K)
fit_hmm <- stan(file = "HMMGp.stan", data=stan_data, verbose = F, chains = 1, control = list(adapt_delta = 0.99))
muH <- extract(fit_hmm, pars = 'muH')[[1]]
A <- extract(fit_hmm, pars = 'A')[[1]]
sigmaH <- extract(fit_hmm, pars = 'sigmaH')[[1]]
A <- apply(A, c(2, 3), mean) # posterior mean of each transition probability
dataset$theta$A
A
mH <- colMeans(muH) # posterior means of the HMM state means
mH
dataset$theta$mu
sH <- colMeans(sigmaH) # posterior means of the state emission sds
sH
dataset$theta$sigma
Is it even possible to estimate this model? Please have a look at the code and kindly guide me.
Thank you so much.