HMM and Skew Normal mixture

Hello everyone,

Using the BayesHMM package (library(BayesHMM)), I want to fit a combination of an HMM and a mixture of skew-normal distributions in a regression setting.
But I am running into several errors, for example:

Error in stanc(file = file, model_code = model_code, model_name = model_name, :
0

Semantic error in 'string', line 36, column 6 to column 31:

Ill-typed arguments supplied to infix operator .*. Available signatures:
(int, int) => int
(real, real) => real
(vector, vector) => vector
(row_vector, row_vector) => row_vector
(matrix, matrix) => matrix
Instead supplied arguments of incompatible type: real[], matrix.

This is the theoretical model.

HMMs involve two interconnected models. The state model consists of a discrete-time, discrete-state, first-order Markov chain z_t \in\{1, \ldots, K\} that transitions according to p\left(z_t \mid z_{t-1}\right). In turn, the observation model is governed by p\left(\mathbf{y}_t \mid z_t\right), where \mathbf{y}_t are the observations, emissions, or outputs. The corresponding joint distribution is

p\left(\mathbf{z}_{1: T}, \mathbf{y}_{1: T}\right)=p\left(\mathbf{z}_{1: T}\right) p\left(\mathbf{y}_{1: T} \mid \mathbf{z}_{1: T}\right)=\left[p\left(z_1\right) \prod_{t=2}^T p\left(z_t \mid z_{t-1}\right)\right]\left[\prod_{t=1}^T p\left(\mathbf{y}_t \mid z_t\right)\right]

Each time slice corresponds to a mixture distribution with component densities given by p\left(\mathbf{y}_t \mid z_t\right), where in my case

\begin{equation} p\left(\mathbf{y}_t \mid z_t=k, \theta\right) = m_1 \operatorname{SN}_a\left(y_a ; \xi_a, \omega_a^2, \lambda_a\right) + m_2 \operatorname{SN}_b\left(y_b ; \xi_b, \omega_b^2, \lambda_b\right) \end{equation}

where \xi_a=\beta_a X, \xi_b=\beta_b X, and m_1, m_2 \in[0,1] are the mixture weights with m_1+m_2=1, which guarantees that the resulting expression remains a proper PDF.
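To connect this to the Stan code below, the same emission density can be restated in log space, with the mixing proportions \theta_1, \theta_2 playing the role of m_1, m_2 (as in the code; this is a restatement, not new model structure):

\log p\left(\mathbf{y}_t \mid z_t=k, \theta\right)=\operatorname{logsumexp}\left(\log \theta_1+\log \operatorname{SN}_a\left(y_t ; \xi_{a,k}, \omega_{a,k}^2, \lambda_{a,k}\right),\ \log \theta_2+\log \operatorname{SN}_b\left(y_t ; \xi_{b,k}, \omega_{b,k}^2, \lambda_{b,k}\right)\right)

and the forward recursion (Murphy 2012, Eq. 17.48) in log space is

\log \alpha_t(j)=\log p\left(\mathbf{y}_t \mid z_t=j, \theta\right)+\operatorname{logsumexp}_{i}\left(\log \alpha_{t-1}(i)+\log A_{ij}\right)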


Here is the simulation in R.

library(brms)
library(rstan)
library(BayesHMM)

# Function to generate random values from a simplex distribution
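# -log(U) with U ~ Uniform(0, 1) is an Exponential(1) draw, and normalizing
# i.i.d. Exponential(1) draws gives a Dirichlet(1, ..., 1) vector, i.e. a
# uniformly distributed point on the simplex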
runif_simplex <- function(T) {
  x <- -log(runif(T))
  x / sum(x)
}

hmm_generate <- function(K, T, lambda) {
  # 1. Parameters
  pi1 <- runif_simplex(K)
  A <- t(replicate(K, runif_simplex(K)))
  mu1 <- sort(rnorm(K, 10 * 1:K, 1))
  mu2 <- sort(rnorm(K, 10 * 1:K, 1))
  sigma1 <- abs(rnorm(K))
  sigma2 <- abs(rnorm(K))
  alpha1 <- runif(K, -1, 1)
  alpha2 <- runif(K, -1, 1)
  beta1 <- runif(K, -1, 1)  # Regression coefficient for mixture 1
  beta2 <- runif(K, -1, 1)  # Regression coefficient for mixture 2
  
  # 2. Hidden path
  z <- vector("numeric", T)
  z[1] <- sample(1:K, size = 1, prob = pi1)
  for (t in 2:T)
    z[t] <- sample(1:K, size = 1, prob = A[z[t - 1], ])
  
  # 3. Covariate
  x <- rnorm(T)
  
  # 4. Observations
  y <- vector("numeric", T)
  for (t in 1:T) {
    if (runif(1) < lambda) {
      y[t] <- rskew_normal(1, xi = mu1[z[t]] + beta1[z[t]] * x[t], omega = sigma1[z[t]], alpha = alpha1[z[t]])
    } else {
      y[t] <- rskew_normal(1, xi = mu2[z[t]] + beta2[z[t]] * x[t], omega = sigma2[z[t]], alpha = alpha2[z[t]])
    }
  }
  
  list(y = y, x = x, z = z, theta = list(pi1 = pi1, A = A, mu1 = mu1, mu2 = mu2, sigma1 = sigma1, sigma2 = sigma2, alpha1 = alpha1, alpha2 = alpha2, lambda = lambda, beta1 = beta1, beta2 = beta2))
}

# Set the seed for reproducibility
set.seed(900)

K <- 3
T_length <- 1500
lambda <- 0.7  # Mixture weight for the first component

# Generate HMM data
dataset <- hmm_generate(K, T_length, lambda)

# Create a data frame with covariate and response
data <- data.frame(x = dataset$x, y = dataset$y)

plot(data$y)
plot(data$x, data$y)

hmm_init <- function(K, y) {
  clasif <- kmeans(y, K)
  
  # Initialize parameters for the mixture of skew-normal distributions
  # (coerce the by() results to plain numeric vectors before using them as inits)
  init.mu1 <- as.numeric(by(y, clasif$cluster, mean))
  init.mu2 <- as.numeric(by(y, clasif$cluster, mean))
  init.sigma1 <- as.numeric(by(y, clasif$cluster, sd))
  init.sigma2 <- as.numeric(by(y, clasif$cluster, sd))
  init.alpha1 <- runif(K, -1, 1)
  init.alpha2 <- runif(K, -1, 1)
  init.lambda <- runif(K)
  
  # Sort parameters based on means
  init.order <- order(init.mu1)
  
  list(mu1 = init.mu1[init.order],
       mu2 = init.mu2[init.order],
       sigma1 = init.sigma1[init.order],
       sigma2 = init.sigma2[init.order],
       alpha1 = init.alpha1[init.order],
       alpha2 = init.alpha2[init.order],
       lambda = init.lambda[init.order],
       beta = rep(1, K))
}

The Stan code:

functions {
  vector normalize(vector x) {
    return x / sum(x);
  }
}

data {
  int<lower=1> T;                   // number of observations (length)
  int<lower=1> K;                   // number of hidden states
  real y[T];                        // observations
  real x[T]; 
}

parameters {
  // Discrete state model
  simplex[K] pi1;                   // initial state probabilities
  simplex[K] A[K];                  // transition probabilities
  // A[i][j] = p(z_t = j | z_{t-1} = i)
  
  // Continuous observation model
  vector[K] beta1;                  // regression coefficients for component 1
  vector[K] beta2;                  // regression coefficients for component 2
  real<lower=0> sigma1[K];          // observation standard deviations for component 1
  real<lower=0> sigma2[K];          // observation standard deviations for component 2
  real<lower=-10, upper=10> skewness1[K];  // skewness parameters for component 1
  real<lower=-10, upper=10> skewness2[K];  // skewness parameters for component 2
  
  // Mixture parameters
  simplex[K] theta;                 // mixing proportions
}

transformed parameters {
  vector[K] logalpha[T];
  
  vector[K] mu1;
  vector[K] mu2;

  // Compute predicted means based on linear regression
  mu1 = to_vector(x) .* beta1;
  mu2 = to_vector(x) .* beta2;
  
  { // Forward algorithm log p(z_t = j | x_{1:t})
    real accumulator[K];
    
    logalpha[1] = log(pi1) + log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[1] | mu1, sigma1, skewness1),
                                         log(theta[2]) + skew_normal_lpdf(y[1] | mu2, sigma2, skewness2));
    
    for (t in 2:T) {
      for (j in 1:K) { // j = current (t)
        for (i in 1:K) { // i = previous (t-1)
          // Murphy (2012) Eq. 17.48
          // belief state + transition prob + local evidence at t
          accumulator[i] = logalpha[t-1, i] + log(A[i, j]) +
            log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[t] | mu1[j], sigma1[j], skewness1[j]),
                        log(theta[2]) + skew_normal_lpdf(y[t] | mu2[j], sigma2[j], skewness2[j]));
        }
        logalpha[t, j] = log_sum_exp(accumulator);
      }
    }
  } // Forward
}

model {
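  // The forward algorithm marginalizes out z_{1:T}:
  // p(y_{1:T}) = sum_j alpha_T(j), so log p(y_{1:T}) = log_sum_exp(logalpha[T]).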
  target += log_sum_exp(logalpha[T]); // Note: update based only on last logalpha
}

generated quantities {
  vector[K] logbeta[T];
  vector[K] loggamma[T];
  
  vector[K] alpha[T];
  vector[K] beta[T];
  vector[K] gamma[T];
  
  int<lower=1, upper=K> zstar[T];
  real logp_zstar;
  
  { // Forward algorithm
    for (t in 1:T)
      alpha[t] = softmax(logalpha[t]);
  } // Forward
  
  { // Backward algorithm log p(x_{t+1:T} | z_t = j)
    real accumulator[K];
    
    for (j in 1:K)
      logbeta[T, j] = 0;            // beta_T(j) = 1 by definition, so log beta_T(j) = 0
    
    for (tforward in 0:(T-2)) {
      int t;
      t = T - tforward;

      for (j in 1:K) { // j = previous (t-1)
        for (i in 1:K) { // i = next (t)
          // Murphy (2012) Eq. 17.58
          // backwards t + transition prob + local evidence at t
          accumulator[i] = logbeta[t, i] + log(A[j, i]) +
            log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[t] | mu1[i], sigma1[i], skewness1[i]),
                        log(theta[2]) + skew_normal_lpdf(y[t] | mu2[i], sigma2[i], skewness2[i]));
        }
        logbeta[t-1, j] = log_sum_exp(accumulator);
      }
    }
    }
    
    for (t in 1:T)
      beta[t] = softmax(logbeta[t]);
  } // Backward
  
  { // Forwards-backwards algorithm log p(z_t = j | x_{1:T})
    for(t in 1:T) {
      loggamma[t] = alpha[t] .* beta[t];
    }
    
    for(t in 1:T)
      gamma[t] = normalize(loggamma[t]);
  } // Forwards-backwards
  
  { // Viterbi algorithm
    int bpointer[T, K];             // backpointer to the most likely previous state on the most probable path
    real delta[T, K];               // max prob for the seq up to t
    // with final output from state k for time t
    
    for (j in 1:K)
      delta[1, j] = log(pi1[j]) +
        log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[1] | mu1[j], sigma1[j], skewness1[j]),
                    log(theta[2]) + skew_normal_lpdf(y[1] | mu2[j], sigma2[j], skewness2[j]));
    
    for (t in 2:T) {
      for (j in 1:K) { // j = current (t)
        delta[t, j] = negative_infinity();
        for (i in 1:K) { // i = previous (t-1)
          real logp;
          logp = delta[t-1, i] + log(A[i, j]) +
            log_sum_exp(log(theta[1]) + skew_normal_lpdf(y[t] | mu1[j], sigma1[j], skewness1[j]),
                        log(theta[2]) + skew_normal_lpdf(y[t] | mu2[j], sigma2[j], skewness2[j]));
          if (logp > delta[t, j]) {
            bpointer[t, j] = i;
            delta[t, j] = logp;
          }
        }
      }
    }
    
    logp_zstar = max(delta[T]);
    
    for (j in 1:K)
      if (delta[T, j] == logp_zstar)
        zstar[T] = j;
    
    for (t in 1:(T - 1)) {
      zstar[T - t] = bpointer[T - t + 1, zstar[T - t + 1]];
    }
  }
}

Back in R, the function to fit the model:

hmm_fit <- function(K, y, x) {
  rstan_options(auto_write = TRUE)
  options(mc.cores = parallel::detectCores())
  stan.model <- 'code.stan'
   
  stan.data <- list(T = length(y), K = K, y = y, x = x)
  stan(file = stan.model, data = stan.data, verbose = TRUE,
       iter = 600, warmup = 150, thin = 1, chains = 1,
       cores = 3,
       seed = 900, init = function() { hmm_init(K, y) })
}

fit <- hmm_fit(K, dataset$y, dataset$x)

The model compiles for me, so I suspect the error you saw comes from a difference in Stan version and front-end. I would recommend moving to cmdstanr for the most up-to-date version of Stan.

That said, it still throws an error when sampling. Note that x and beta1 have different dimensions (x has length T, beta1 has length K), so the elementwise product is not possible. I don't know enough about the model to resolve this for you.

data {
   ...
   real x[T];
}
parameters {
   ...
   vector[K] beta1;
}
transformed parameters {
   ...
   vector[K] mu1;
   mu1 = to_vector(x) .* beta1;   // length-T vector .* length-K vector
}
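In case it helps, here is a minimal sketch of one shape-consistent way to write that block. It assumes the intent is a per-state regression mean for every observation, as in the R simulation (which uses xi = mu1[z[t]] + beta1[z[t]] * x[t]); the intercept1 and intercept2 parameters are hypothetical additions here, since the simulation has per-state intercepts that the posted Stan model drops:

parameters {
  // ... as before, plus hypothetical per-state intercepts to mirror the simulation
  vector[K] intercept1;
  vector[K] intercept2;
}
transformed parameters {
  // T x K matrices: entry [t, k] is the mean of observation t under state k
  matrix[T, K] mu1;
  matrix[T, K] mu2;
  for (t in 1:T) {
    for (k in 1:K) {
      mu1[t, k] = intercept1[k] + beta1[k] * x[t];
      mu2[t, k] = intercept2[k] + beta2[k] * x[t];
    }
  }
}

The emission terms in the forward, backward, and Viterbi blocks would then index both dimensions, e.g. skew_normal_lpdf(y[t] | mu1[t, j], sigma1[j], skewness1[j]). Note also that theta is declared simplex[K] but only theta[1] and theta[2] are used; for a two-component mixture, simplex[2] may be what you intend.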