This is the Stan code. I’ll also include an R script to generate fake data and reproduce the problem (although you’ll have to change the path to the Stan file). I know it’s a lot of code - hopefully it’s still possible to find something useful.
Thanks for your help!
// Gaussian Hidden Markov Model
data {
int<lower=1> T; // number of periods
int<lower=1> N_states; // number of potential states
vector[T] y; // observations
}
transformed data {
vector<lower=0>[N_states] theta_prior = rep_vector(5.0, N_states); // prior for transition probs
}
parameters {
// transition parameters
simplex[N_states] theta[N_states]; // transition probabilities
// theta[r, s] is the probability of a transition from state r to state s
// emission parameters (observation model)
vector[N_states] mu;
positive_ordered[N_states] sigma; // ordered to break multi-modality
}
model {
vector[N_states] alpha[T]; // posterior state probabilities:
// alpha[t] is the vector of log unnormalized joint probabilities of
// the states at time t and the observations UP TO time t
// so alpha[t, n] = P(states[t] == n AND observations[1:t] == y[1:t])
// priors
// priors for emission parameters
mu ~ normal(0, 1);
sigma ~ normal(0, 1);
// initial state probability
for (n in 1:N_states) {
alpha[1, n] = normal_lpdf(y[1] | mu[n], sigma[n]);
}
// transition probability priors
for (n in 1:N_states) {
theta[n] ~ dirichlet(theta_prior);
}
//likelihood
// marginalize state probabilities
{
real accum[N_states];
for (t in 2:T) {
for (s in 1:N_states) {
for (r in 1:N_states) {
accum[r] = alpha[t - 1, r] + log(theta[r, s]);
}
alpha[t, s] = normal_lpdf(y[t] | mu[s], sigma[s]) + log_sum_exp(accum);
}
}
}
target += log_sum_exp(alpha[T]);
}
generated quantities {
vector[N_states] naive_probs[T]; // probabilities only from observations, not from time-dependence
vector[N_states] forward_probs[T]; // alpha in 'standard notation'
vector[N_states] backward_probs[T]; // backward probabilities; beta in 'standard notation'
// backward_probs[t, s] is the probability
// of state s given all observations (t+1):T
vector[N_states] smoothed_probs[T]; // product of alpha and beta in 'standard notation'
int<lower=1, upper=N_states> best_path[T]; // Viterbi path - highest probability path
real logp_best_path; // log probability of best path
for (t in 1:T) {
for (n in 1:N_states) {
naive_probs[t, n] = normal_lpdf(y[t] | mu[n], sigma[n]);
}
naive_probs[t] -= log_sum_exp(naive_probs[t]);
}
// infered state probabilities using forward algorithm
for (n in 1:N_states) {
forward_probs[1, n] = normal_lpdf(y[1] | mu[n], sigma[n]);
}
forward_probs[1] -= log_sum_exp(forward_probs[1]); // normalize immediately
{
real accum[N_states];
for (t in 2:T) {
for (s in 1:N_states) {
for (r in 1:N_states) {
accum[r] = forward_probs[t - 1, r] + log(theta[r, s]);
}
forward_probs[t, s] = normal_lpdf(y[t] | mu[s], sigma[s]) + log_sum_exp(accum);
}
forward_probs[t] -= log_sum_exp(forward_probs[t]); // normalize after every step
}
}
// forward-backward algorithm
{
real accum[N_states];
backward_probs[T] = rep_vector(- log(N_states), N_states); // initialization
for (t in 1:(T - 1)) {
for (s in 1:N_states) {
for (r in 1:N_states) {
accum[r] = backward_probs[T - t + 1, r] + log(theta[s, r]) + normal_lpdf(y[T - t + 1] | mu[r], sigma[r]);
}
backward_probs[T - t, s] = log_sum_exp(accum);
}
backward_probs[T - t + 1] -= log_sum_exp(backward_probs[T - t + 1]); // normalize after every step
}
backward_probs[1] -= log_sum_exp(backward_probs[1]);
for (t in 1:T) {
smoothed_probs[t] = forward_probs[t] + backward_probs[t];
smoothed_probs[t] -= log_sum_exp(smoothed_probs[t]);
}
}
//viterbi i.e. max-sum algorithm
{
vector[N_states] omega[T]; // omega[t, i] is the log probability of states up to t - 1,
// state t being equal to i, and observations y[1:t]
int backtrack_states[T, N_states]; // keeping track of best path so far
// backtrack_states[t, i] is the state at time t - 1 that is part of the highest probability
// path to state i at time t.
for (i in 1:N_states) {
omega[1, i] = normal_lpdf(y[1] | mu[i], sigma[i]);
}
omega[1] -= log_sum_exp(omega[1]);
for (t in 2:T) {
for (i in 1:N_states) {
omega[t, i] = negative_infinity();
for (j in 1:N_states) {
real logp;
logp = omega[t - 1, j] + log(theta[j, i]) + normal_lpdf(y[t] | mu[i], sigma[i]);
if (logp > omega[t, i]) {
backtrack_states[t, i] = j;
omega[t, i] = logp;
}
}
}
omega[t] -= log_sum_exp(omega[t]);
}
logp_best_path = max(omega[T]);
for (n in 1:N_states){
if (omega[T, n] == logp_best_path) {
best_path[T] = n;
}
}
for (t in 1:(T - 1)) {
best_path[T - t] = backtrack_states[T - t + 1, best_path[T - t + 1]];
}
}
// finally, transform to probabilities
naive_probs = exp(naive_probs);
forward_probs = exp(forward_probs);
backward_probs = exp(backward_probs);
smoothed_probs = exp(smoothed_probs);
}
R script
library(rstan)
transition_process <- function(current_state,
possible_states = 1:2,
transition_probs = matrix(0.5,
nrow = length(possible_states),
ncol = length(possible_states))) {
stopifnot(nrow(transition_probs) == length(possible_states))
stopifnot(nrow(transition_probs) == ncol(transition_probs)) # must be square matrix
stopifnot(any(rowSums(transition_probs) == 1)) # transition matrix rows must sum to 1
new_state <- base::sample(possible_states, size = 1, prob = transition_probs[current_state, ])
return(new_state)
}
observation_process <- function(current_state, state_mean, state_sd) {
observation <- rnorm(1, mean = state_mean[current_state], sd = state_sd[current_state])
}
possible_states <- 1:2
transition_probs <- matrix(c(0.8, 0.2, 0.7, 0.3), nrow = 2, ncol = 2, byrow = TRUE)
state_mean <- c(0, 3)
state_sd <- c(0.5, 1)
# generate fake data
set.seed(12)
T <- 1000
states <- numeric(T)
states[1] <- possible_states[1]
observations <- numeric(T)
observations[1] <- observation_process(current_state = states[1],
state_mean = state_mean,
state_sd = state_sd)
for (t in 2:T) {
states[t] = transition_process(current_state = states[t - 1],
possible_states = possible_states,
transition_probs = transition_probs)
observations[t] <- observation_process(current_state = states[t],
state_mean = state_mean,
state_sd = state_sd)
}
stan_data_hmm <- list(T = T,
N_states = length(possible_states),
y = observations)
hmm <- stan_model("path/to/stanmodel")
fit_hmm <- sampling(hmm, data = stan_data_hmm, iter = 10000, chains = 4, cores = 1)