Please share your Stan program and accompanying data if possible.
Good morning everyone,
I have a question, and I hope you can help me.
Based on the BayesHMM package (An Introduction to BayesHMM • BayesHMM), I developed a simple extension for changepoint detection using the Viterbi algorithm, where the target distribution is a mixture of Skew Normal distributions.
The Stan model works; however, when using simulated data, the estimates do not seem optimal. Specifically, it seems the model struggles to capture the asymmetry and mixture parameters.
Do you have any suggestions?
Here is the mathematical interpretation of the model along with the corresponding R and Stan code.
Model
\begin{equation}
p\left(\mathbf{z}_{1: T}, \mathbf{y}_{1: T}\right)=p\left(\mathbf{z}_{1: T}\right) p\left(\mathbf{y}_{1: T} \mid \mathbf{z}_{1: T}\right)=\left[p\left(z_1\right) \prod_{t=2}^T p\left(z_t \mid z_{t-1}\right)\right]\left[\prod_{t=1}^T p\left(\mathbf{y}_t \mid z_t\right)\right]
\end{equation}
where (p(\mathbf{z}_1)) represents the initial state distribution, specifying the probabilities of the initial hidden state. The transitions (p(z_t | z_{t-1})) capture the dynamics of the hidden states over time, while the probabilities (p(\mathbf{y}_t | z_t)) define the likelihood of observing the data (\mathbf{y}_t) given the hidden state (z_t), which are modeled as a mixture of skew-normal distributions.
Thus, formula can be hence rewritten in terms of probabilities conditioned on the observed states as
[
p\left(\mathbf{y}_t \mid z_t, \boldsymbol{\Theta}\right)=\sum_{k=1}^K \pi_k g_k\left(\mathbf{y}_t \mid \boldsymbol{\Theta}\right)
]
where g_k is the k-th density function of a skew-normal distribution (\mathcal{SN}\left(\xi_k, \omega_k^2, \lambda_k\right)) with (\boldsymbol{\Theta} \in \mathbb{R}^{k\times Z}), while the general transition for matrix \mathbf{A} with dimensions Z \times Z defined by:
which is subject to the row-sum constraint (i.e., \sum_{z=1}^Z a_{i z}=1 for each i \in\{1, \ldots, Z\}). Following this notation, a first-order Markov process will therefore be subject to the following conditional probability: (a_{i j}=p\left(z_t=j \mid z_{t-1}=i\right)).
Stan
functions {
vector normalize(vector x) {
return x / sum(x);
}
}
data {
int<lower=1> T; // number of observations (length)
int<lower=1> K; // number of hidden states
real y[T]; // observations
}
parameters {
// Discrete state model
simplex[K] pi1; // initial state probabilities
simplex[K] A[K]; // transition probabilities
// A[i][j] = p(z_t = j | z_{t-1} = i)
// Continuous observation model
ordered[K] mu1; // observation means for component 1
ordered[K] mu2; // observation means for component 2
//real<lower=0.01, upper=3> sigma1[K]; // observation standard deviations for component 1
//real<lower=0.01, upper=3> sigma2[K]; // observation standard deviations for component 2
real<lower=0> sigma1[K]; // observation standard deviations for component 1
real<lower=0> sigma2[K]; // observation standard deviations for component 2
real<lower=-1, upper=1> skewness1[K]; // skewness parameters for component 1
real<lower=-1, upper=1> skewness2[K]; // skewness parameters for component 2
// Mixture parameters
simplex[K] theta; // mixing proportions
}
transformed parameters {
vector[K] logalpha[T];
{ // Forward algorithm log p(z_t = j | x_{1:t})
real accumulator[K];
logalpha[1] = log(pi1) + log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[1] | mu1, sigma1, skewness1),
log(theta[2]) + skew_normal_lpdf(y[1] | mu2, sigma2, skewness2));
for (t in 2:T) {
for (j in 1:K) { // j = current (t)
for (i in 1:K) { // i = previous (t-1)
// Murphy (2012) Eq. 17.48
// belief state + transition prob + local evidence at t
accumulator[i] = logalpha[t-1, i] + log(A[i, j]) +
log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[t] | mu1[j], sigma1[j], skewness1[j]),
log(theta[2]) + skew_normal_lpdf(y[t] | mu2[j], sigma2[j], skewness2[j]));
}
logalpha[t, j] = log_sum_exp(accumulator);
}
}
} // Forward
}
model {
target += log_sum_exp(logalpha[T]); // Note: update based only on last logalpha
}
generated quantities {
vector[K] logbeta[T];
vector[K] loggamma[T];
vector[K] alpha[T];
vector[K] beta[T];
vector[K] gamma[T];
int<lower=1, upper=K> zstar[T];
real logp_zstar;
{ // Forward algorithm
for (t in 1:T)
alpha[t] = softmax(logalpha[t]);
} // Forward
{ // Backward algorithm log p(x_{t+1:T} | z_t = j)
real accumulator[K];
for (j in 1:K)
logbeta[T, j] = 1;
for (tforward in 0:(T-2)) {
int t;
t = T - tforward;
for (j in 1:K) { // j = previous (t-1)
for (i in 1:K) { // i = next (t)
// Murphy (2012) Eq. 17.58
// backwards t + transition prob + local evidence at t
accumulator[i] = logbeta[t, i] + log(A[j, i]) +
log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[t] | mu1[i], sigma1[i], skewness1[i]),
log(theta[2]) + skew_normal_lpdf(y[t] | mu2[i], sigma2[i], skewness2[i]));
}
logbeta[t-1, j] = log_sum_exp(accumulator);
}
}
for (t in 1:T)
beta[t] = softmax(logbeta[t]);
} // Backward
{ // Forwards-backwards algorithm log p(z_t = j | x_{1:T})
for(t in 1:T) {
loggamma[t] = alpha[t] .* beta[t];
}
for(t in 1:T)
gamma[t] = normalize(loggamma[t]);
} // Forwards-backwards
{ // Viterbi algorithm
int bpointer[T, K]; // backpointer to the most likely previous state on the most probable path
real delta[T, K]; // max prob for the seq up to t
// with final output from state k for time t
for (j in 1:K)
delta[1, K] = skew_normal_lpdf(y[1] | mu1[j], sigma1[j], skewness1[j]) +
skew_normal_lpdf(y[1] | mu2[j], sigma2[j], skewness2[j]);
for (t in 2:T) {
for (j in 1:K) { // j = current (t)
delta[t, j] = negative_infinity();
for (i in 1:K) { // i = previous (t-1)
real logp;
logp = delta[t-1, i] + log(A[i, j]) +
log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[t] | mu1[j], sigma1[j], skewness1[j]),
log(theta[2]) + skew_normal_lpdf(y[t] | mu2[j], sigma2[j], skewness2[j]));
if (logp > delta[t, j]) {
bpointer[t, j] = i;
delta[t, j] = logp;
}
}
}
}
logp_zstar = max(delta[T]);
for (j in 1:K)
if (delta[T, j] == logp_zstar)
zstar[T] = j;
for (t in 1:(T - 1)) {
zstar[T - t] = bpointer[T - t + 1, zstar[T - t + 1]];
}
}
}
R Simulation
set.seed(1)
library(brms)
library(devtools)
library(rstan)
library(BayesHMM)
library(brms)
library(MASS)
# Function to generate random values from a simplex distribution
runif_simplex <- function(T) {
x <- -log(runif(T))
x / sum(x)
}
hmm_generate <- function(K, T) {
# 1. Parameters
pi1 <- round(runif_simplex(K),3)
A <- round(t(replicate(K, runif_simplex(K))),3)
mu1 <- round(sort(rnorm(K, 5 * 1:K, 1)),3)
mu2 <- round(sort(rnorm(K, 5 * 1:K, 1)),3)
sigma1 <- round(abs(rnorm(K)),3)
sigma2 <- round(abs(rnorm(K)),3)
alpha1 <- round(runif(K, -1, 1),3)
alpha2 <- round(runif(K, -1, 1),3)
lambda <- round(runif_simplex(K),3)
# 2. Hidden path
z <- vector("numeric", T)
z[1] <- sample(1:K, size = 1, prob = pi1)
for (t in 2:T)
z[t] <- sample(1:K, size = 1, prob = A[z[t - 1], ])
# 3. Observations
y <- vector("numeric", T)
for (t in 1:T) {
if (runif(1) < lambda[z[t]]) {
y[t] <- rskew_normal(1, xi = mu1[z[t]], omega = sigma1[z[t]], alpha = alpha1[z[t]])
} else {
y[t] <- rskew_normal(1, xi = mu2[z[t]], omega = sigma2[z[t]], alpha = alpha2[z[t]])
}
}
list(y = y, z = z, theta = list(pi1 = pi1, A = A, mu1 = mu1, mu2 = mu2, sigma1 = sigma1, sigma2 = sigma2, alpha1 = alpha1, alpha2 = alpha2, lambda = lambda))
}
hmm_init <- function(K, y) {
clasif <- kmeans(y, K)
# Initialize parameters for mixture of skewed normal distributions
init.mu1 <- by(y, clasif$cluster, mean)
init.mu2 <- by(y, clasif$cluster, mean)
init.sigma1 <- by(y, clasif$cluster, sd)
init.sigma2 <- by(y, clasif$cluster, sd)
init.alpha1 <- runif(K, -1, 1)
init.alpha2 <- runif(K, -1, 1)
init.lambda <- runif(K)
# Sort parameters based on means
init.order <- order(init.mu1)
list(mu1 = init.mu1[init.order],
mu2 = init.mu2[init.order],
sigma1 = init.sigma1[init.order],
sigma2 = init.sigma2[init.order],
alpha1 = init.alpha1[init.order],
alpha2 = init.alpha2[init.order],
lambda = init.lambda[init.order])
}
set.seed(5)
K <- 3
T_length <- 600
# Mixture weight for the first component
# Generate HMM data
dataset <- hmm_generate(K, T_length)
plot(dataset$y)
plot(sample(dataset$y))
# get parameter
dataset$theta
sum(dataset$theta$pi1)
dataset$z
hmm_fit <- function(K, y) {
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
stan.model <- 'stan/new_27_11_24.stan'
stan.data <- list(T = length(y), K = K, y = y)
stan(file = stan.model, data = stan.data, verbose = TRUE,
iter = 800, warmup = 400, thin = 1, chains = 2,
cores = 2,
seed = 10,init = function() { hmm_init(K, y) })
}
fit <- hmm_fit(K, dataset$y)
summary(fit)
# Get estimated parameters
mu1_samples <- extract(fit, pars = 'mu1')[[1]]
mu2_samples <- extract(fit, pars = 'mu2')[[1]]
sigma1_samples <- extract(fit, pars = 'sigma1')[[1]]
sigma2_samples <- extract(fit, pars = 'sigma2')[[1]]
alpha1_samples <- extract(fit, pars = 'skewness1')[[1]]
alpha2_samples <- extract(fit, pars = 'skewness2')[[1]]
lambda_samples <- extract(fit, pars = 'theta')[[1]]
# compute means
mu1_mean <- apply(mu1_samples, 2, mean)
mu2_mean <- apply(mu2_samples, 2, mean)
sigma1_mean <- apply(sigma1_samples, 2, mean)
sigma2_mean <- apply(sigma2_samples, 2, mean)
alpha1_mean <- apply(alpha1_samples, 2, mean)
alpha2_mean <- apply(alpha2_samples, 2, mean)
lambda_mean <- apply(lambda_samples, 2, mean)
# Simulated values
Sim_Mu1 <- dataset$theta$mu1
Sim_Mu2 <- dataset$theta$mu2
Sim_Sigma1 <- dataset$theta$sigma1
Sim_Sigma2 <- dataset$theta$sigma2
Sim_Alpha1 <- dataset$theta$alpha1
Sim_Alpha2 <- dataset$theta$alpha2
Sim_Lambda <- dataset$theta$lambda
# results
cat("\ mu1:\n")
print(data.frame(Estimated = mu1_mean, Simulated = Sim_Mu1))
cat("\ mu2:\n")
print(data.frame(Estimated = mu2_mean, Simulated = Sim_Mu2))
cat("\ sigma1:\n")
print(data.frame(Estimated = sigma1_mean, Simulated = Sim_Sigma1))
cat("\ sigma2:\n")
print(data.frame(Estimated = sigma2_mean, Simulated = Sim_Sigma2))
cat("\ alpha1 (skewness1):\n")
print(data.frame(Estimated = alpha1_mean, Simulated = Sim_Alpha1))
cat("\alpha2 (skewness2):\n")
print(data.frame(Estimated = alpha2_mean, Simulated = Sim_Alpha2))
cat("\ lambda:\n")
print(data.frame(Estimated = lambda_mean, Simulated = Sim_Lambda))