Reducing correlation between parameters of the posterior distribution

I am still trying to get my MSM-TVTP model to work properly (previous Stan and R code can be found here, but the current question may be of general interest) and just looked at the posterior correlations of the parameters. Focusing only on gamma (the intercepts of the transition probabilities), alpha (the intercepts of the linear model), and phi (the AR(1) parameters), I get the correlation matrix below. I am wondering whether (and to what degree) this affects a) the efficiency of the sampling and b) the model's ability to recover the parameters. The correlation between alpha[1] and phi[1] especially worries me. The most recent set of simulations showed that the parameters are "approximately" recovered, but the chains seem to get stuck in the same "wrong" region.

Do you have any suggestions for reducing this dependency? Reparameterizing the model in terms of its unconditional mean seems to conflict with the ordering on the intercepts that prevents label switching.

           gamma[1] gamma[2] alpha[1] alpha[2]  phi[1]  phi[2]
gamma[1]      1.000    0.221    0.250    0.330  -0.267  -0.179
gamma[2]      0.221    1.000   -0.140    0.127   0.072   0.052
alpha[1]      0.250   -0.140    1.000   -0.360  -0.939   0.502
alpha[2]      0.330    0.127   -0.360    1.000   0.299  -0.812
phi[1]       -0.267    0.072   -0.939    0.299   1.000  -0.569
phi[2]       -0.179    0.052    0.502   -0.812  -0.569   1.000
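
For reference, the matrix above comes straight from the posterior draws; a minimal sketch of how to compute it and to eyeball the dependence directly (assuming fit is the stanfit object from the run below):

## correlation of the posterior draws for gamma, alpha, and phi
draws <- as.matrix(fit, pars = c("gamma", "alpha", "phi"))
round(cor(draws), 3)
## pairs plot to see the alpha[1]-phi[1] ridge directly
pairs(fit, pars = c("alpha", "phi"))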

The R and Stan code again, for reference:

#####################################
### MC simulation                 ###
### for MSM-TVTP                  ###
#####################################

## setup
{
## libraries and options
library(rstan)
#options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
library(shinystan)

#################################################################

## seed
set.seed(12345)
}

## simulation
dgp <- function(N, T, M){

  ## covariates
  z <- array(rnorm(N * M * T, 0, 1), dim = c(N, T, M))

  # state matrix
  s <- matrix(NA, nrow = N, ncol = T) # an N by T matrix; cell [i, t] holds the state individual i is in at time t

  # transition probabilities
  p <- array(NA, dim = c(2, 2, T, N))

  for (i in 1:N){
    for (t in 1:T){
      s_star <- gamma
      #+ t(z[i, t, ]) %*% lambda  # [t] because the TP are lagged when determining y
      # s_star <- gamma[1] + t(z[i, t, ]) %*% lambda[, 1]
      p_s <- matrix(NA, ncol = 2, nrow = 2)
      diag(p_s) <- pnorm(s_star) # probit link with unit variance: staying probabilities p_11 and p_22
      p_s[1, 2] <- 1 - p_s[1, 1]
      p_s[2, 1] <- 1 - p_s[2, 2]
      # p_11 + p_12 = 1 and p_21 + p_22 = 1
      p[ , , t, i] <- p_s                    # takes into consideration that lambda should have the same effect
      # in each state i.e. increasing the probability of staying
    }
    # initial state drawn from the stationary distribution of the chain:
    # state 1 with probability (1 - p_22) / (2 - p_11 - p_22)
    s[i, 1] <- 2 - rbinom(1, 1, (1 - p[2, 2, 1, i]) / (2 - p[2, 2, 1, i] - p[1, 1, 1, i]))
    # s[i, 1] <- rbinom(1, 1, 0.5)
    for (t in 2:T){
      if        (s[i, t - 1] == 1){
        s[i, t] <- rbinom(n = 1, size = 1,prob = p[1, 2, t - 1, i]) + 1
      } else if (s[i, t - 1] == 2){
        s[i, t] <- rbinom(n = 1, size = 1,prob = p[2, 2, t - 1, i]) + 1
      }
    }
  }

  # outcome
  # individual time series: one latent AR(1) series per regime
  y.pre <- array(NA, dim = c(N, T, 2))
  for (i in 1:N){
    y.pre[i, 1,] <- rnorm(2, alpha, sigma)
    for (t in 2:T){
      y.pre[i, t,] <- rnorm(2, alpha + phi * y.pre[i, (t - 1),], sigma)
    }
  }
  # switching time series
  y <- matrix(NA, nrow = N, ncol = T)
  for (i in 1:N){
    for (t in 1:T){
      y[i, t] <- y.pre[i ,t , s[i, t]]
    }
  }

  ## set up data for stan
  data_list <- list(T = T,
                    N = N,
                    M = M,
                    y = y,
                    z = z)
  ## out
  out <- list(data = data_list)
  return(out)
}
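
As a side note on the initial-state draw above: the first state comes from the stationary distribution implied by the transition matrix. A quick sketch of the implied quantities for gamma = c(0.5, 0.5) (probit link, unit variance):

## transition matrix implied by gamma (staying probabilities on the diagonal)
gamma <- c(0.5, 0.5)
P <- matrix(NA, 2, 2)
diag(P) <- pnorm(gamma) # p_11 and p_22, both about 0.69
P[1, 2] <- 1 - P[1, 1]
P[2, 1] <- 1 - P[2, 2]
(1 - P[2, 2]) / (2 - P[1, 1] - P[2, 2]) # stationary P(state 1); 0.5 here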

####################################################################################################
####################################################################################################

## Sourcing current directory
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

## parameters
M <- 2
alpha <- c(2, 6)
gamma <- c(0.5, 0.5) # intercept in TPs
#lambda <- matrix(c(-0.5, 0.3, 0.9, -0.2), byrow = T, ncol = 2, nrow = M)
phi <- c(0.8, 0.2)
sigma <- 1
n_parameters <- length(alpha) + length(gamma) + (length(phi) * 2) + length(sigma)
                # + length(lambda)


## Start time
orig_time <- Sys.time()
file <- file("progress.txt")


## Global parameters
n_chains <- 2L # number of chains
n_iter <- 2000L # total iterations per chain (the first half is warmup)
n_warmup <- 1000L # number of warmup iterations


## simulations over
T.seq <- c(300)
N.seq <- c(1)
N.sim <- seq(1, 4)
total.sim <- length(T.seq) * length(N.seq) * length(N.sim)


## container
out_par <- matrix(ncol = 9, nrow = 0)
colnames(out_par) <- c("N", "T", "sim", "name", "mean", "sd", "q025", "q50", "q975")
data.fit.list <- list()


## Simulation
sim_counter <- 0
for (t1 in T.seq){
  for (n1 in N.seq){
    out <- dgp(n1, t1, M)
    for (i_sim in N.sim){
      sim_counter <- sim_counter + 1
      fit <- stan("05_msm_simulation.stan",
                data = out[["data"]],
                chains = n_chains,
                iter = n_iter,
                warmup = n_warmup,
                init_r = 0.5,
                control = list(adapt_delta = .99))
      ## output
      fit.ext <- as.matrix(fit)
      for (ii in 1:n_parameters){
        i_mean <- mean(fit.ext[, ii])
        i_sd <- sd(fit.ext[, ii])
        i_quan <- quantile(fit.ext[, ii], probs = c(0.025, 0.50, 0.975), names = F)
        out_par <- rbind(out_par, c(n1, t1, sim_counter, colnames(fit.ext)[ii], i_mean, i_sd, i_quan ))
      }
      data.fit.list[[sim_counter]] <- list(fit.mat = fit,
                                           data    = out[["data"]])
      progress <- paste0("I've done ", sim_counter, " of ", total.sim, " simulations so far. ",
                         "The last size of N was ", n1, " and the last size of T was ", t1,
                         ". The current date and time are ", Sys.time(),
                         " and I started at ", orig_time)
      writeLines(progress, file)
    }
  }
}

## clean output
out_par <- as.data.frame(out_par)

i_seq <- c(1, 2, 3, 5, 6, 7, 8, 9)
for (i in i_seq){
  out_par[, i] <- as.numeric(as.character(out_par[, i]))
}

## save output
save(out_par, file = "04_msm_simulation.RData")
save(data.fit.list, file = "02_msm_simulation.RData")
The Stan model (05_msm_simulation.stan):

functions {
  // vectorized normal log-density, with sigma as the standard
  // deviation so that it matches normal_lpdf below
  vector vlnormal_pdf(vector y, vector x, real sigma){
    return -0.5 * log(2 * pi()) - log(sigma)
           - (y - x) .* (y - x) / (2 * square(sigma));
  }
}

data {
  int<lower = 0> T; // number of observed time periods
  int<lower = 0> N; // number of observed units
  matrix[N, T] y; // outcome
  int<lower = 0> M;
  row_vector[M] z[N, T]; // exogenous random variable affecting transition probabilities
}

parameters {
  real gamma[2];                      // intercepts of the transition probabilities
  ordered[2] alpha;                   // intercepts of the linear model, ordered to prevent label switching
  real<lower = 0, upper = 1> phi[2];  // AR(1) parameters
  real<lower = 0> sigma;              // standard deviation (shared across regimes)
}
transformed parameters {
  real fy[N, T];

  for (i in 1:N) {
    vector[T] s[2];    // per-regime log-densities of y
    vector[2] p[T, 2]; // transition probabilities
    real p_s1[T, 2];   // [t, 1]: predicted P(S_t = 1 | y_{1:t-1}); [t, 2]: filtered P(S_t = 1 | y_{1:t})

    // transition probabilities (probit link, unit variance)
    for (t in 1:T) {
      p[t, 1, 1] = normal_cdf(gamma[1], 0, 1);
      p[t, 2, 2] = normal_cdf(gamma[2], 0, 1);
      p[t, 2, 1] = 1 - p[t, 2, 2];
      // p[t, 1, 2] = 1 - p[t, 1, 1] is never needed below
    }
    for (j in 1:2){
      s[j, 1] =   normal_lpdf(y[i, 1] | alpha[j], sigma);
      s[j, 2:T] = vlnormal_pdf(to_vector(y[i, 2:T]), to_vector(alpha[j] + phi[j] * y[i, 1:(T - 1)]), sigma);
    }

    // t = 1: start from the stationary distribution of the chain
    p_s1[1, 2] = (1 - p[1, 2, 2]) / (2 - p[1, 1, 1] - p[1, 2, 2]); // stationary P(S = 1)

    p_s1[1, 1] =  p[1, 1, 1] *      p_s1[1, 2] +
                  p[1, 2, 1] * (1 - p_s1[1, 2]);
    fy[i, 1] = log_mix(p_s1[1, 1], s[1, 1], s[2, 1]);       // marginal log-density of y[i, 1]
    p_s1[1, 2] = exp(s[1, 1] + log(p_s1[1, 1]) - fy[i, 1]); // filtered P(S_1 = 1 | y_1) via Bayes' rule

    // t = 2:T: Hamilton filter recursion
    for (t in 2:T){
      p_s1[t, 1] =  p[(t - 1), 1, 1] *      p_s1[(t - 1), 2]
                  + p[(t - 1), 2, 1] * (1 - p_s1[(t - 1), 2]); // predicted P(S_t = 1 | y_{1:t-1})
      fy[i, t] = log_mix(p_s1[t, 1], s[1, t], s[2, t]);
      p_s1[t, 2] = exp(s[1, t] + log(p_s1[t, 1]) - fy[i, t]); // filtered P(S_t = 1 | y_{1:t})
    }
  }
}
model {
  // likelihood: sum of the per-observation log-densities
  target += fy;

  // priors
  alpha ~ normal(0, 4);
  phi ~ normal(0.5, 0.25);
  gamma ~ normal(0, 1);
}
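
For checking the likelihood part against the simulation, here is a minimal R mirror of the filter above (one series, constant transition probabilities, sigma as a standard deviation); the function name is mine:

hamilton_loglik <- function(y, gamma, alpha, phi, sigma){
  p11 <- pnorm(gamma[1]); p22 <- pnorm(gamma[2])
  pred1 <- (1 - p22) / (2 - p11 - p22) # stationary P(S = 1)
  ll <- 0
  for (t in seq_along(y)){
    m <- if (t == 1) alpha else alpha + phi * y[t - 1]
    lik <- pred1 * dnorm(y[t], m[1], sigma) + (1 - pred1) * dnorm(y[t], m[2], sigma)
    ll <- ll + log(lik)
    filt1 <- pred1 * dnorm(y[t], m[1], sigma) / lik # filtered P(S_t = 1 | y_1:t)
    pred1 <- p11 * filt1 + (1 - p22) * (1 - filt1)  # predicted P(S_t+1 = 1)
  }
  ll
}
## e.g. hamilton_loglik(out$data$y[1, ], gamma, alpha, phi, sigma) evaluates
## the same log-likelihood that the Stan model accumulates in fy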

You can order the unconditional means.

Yes, but that seems to put restrictions on the AR(1) parameters: with alpha1 < alpha2 and phi1 < phi2, the model recovers parameter values that are far off the true values. Not that it works for alpha1 < alpha2 and phi1 > phi2 either, but it gets closer.

For alpha1 < alpha2 and phi1 < phi2:

alpha <- c(2, 6)
gamma <- c(0.5, 0.5)
phi <- c(0.2, 0.8)

For alpha1 < alpha2 and phi1 > phi2:

alpha <- c(2, 6)
gamma <- c(0.5, 0.5)
phi <- c(0.8, 0.2)
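
To see the conflict concretely: the unconditional mean of each regime is mu = alpha / (1 - phi), and its ordering need not match the ordering of the alphas:

alpha <- c(2, 6)
alpha / (1 - c(0.2, 0.8)) # mu = 2.5, 30.0 -> same order as alpha
alpha / (1 - c(0.8, 0.2)) # mu = 10.0, 7.5 -> order reversed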

The Stan code for the reparameterization:

// MSM-TVTP, AR(1), 2 REGIMES, KNOWN VARIANCE ACROSS BOTH REGIMES
// the only parameters that vary across regimes are the intercepts in the outcome and in the TVTP

functions {
  // vectorized normal log-density, with sigma as the standard
  // deviation so that it matches normal_lpdf below
  vector vlnormal_pdf(vector y, vector x, real sigma){
    return -0.5 * log(2 * pi()) - log(sigma)
           - (y - x) .* (y - x) / (2 * square(sigma));
  }
}


data {
  int<lower = 0> T; // number of observed time periods
  int<lower = 0> N; // number of observed units
  matrix[N, T] y; // outcome
  int<lower = 0> M;
  row_vector[M] z[N, T]; // exogenous random variable affecting transition probabilities
}

parameters {
  real gamma[2];                      // intercepts of the transition probabilities
  ordered[2] mu;                      // unconditional means, ordered to prevent label switching
  real<lower = 0, upper = 1> phi[2];  // AR(1) parameters
  real<lower = 0> sigma;              // standard deviation (shared across regimes)
}
transformed parameters {
  vector[2] alpha;
  real fy[N, T];
  for (j in 1:2){
    alpha[j] = mu[j] * (1 - phi[j]); // intercept implied by the unconditional mean mu = alpha / (1 - phi)
  }
  for (i in 1:N) {
    vector[T] s[2];
    vector[2] p[T, 2];
    real p_s1[T, 2];
    // transition probabilities (probit link, unit variance)
    for (t in 1:T) {
      p[t, 1, 1] = normal_cdf(gamma[1], 0, 1);
      p[t, 2, 2] = normal_cdf(gamma[2], 0, 1);
      p[t, 2, 1] = 1 - p[t, 2, 2];
    }
    for (j in 1:2){
      s[j, 1] =   normal_lpdf(y[i, 1] | alpha[j], sigma);
      s[j, 2:T] = vlnormal_pdf(to_vector(y[i, 2:T]), to_vector(alpha[j] + phi[j] * y[i, 1:(T - 1)]), sigma);
    }

    // t = 1
    // Initial probability
    p_s1[1, 2] = (1 - p[1, 2, 2]) / (2 - p[1, 1, 1] - p[1, 2, 2]);

    p_s1[1, 1] =  p[1, 1, 1] *      p_s1[1, 2] +
                  p[1, 2, 1] * (1 - p_s1[1, 2]);
    fy[i,1] = log_mix(p_s1[1, 1], s[1, 1], s[2, 1]);
    p_s1[1, 2] = exp(s[1, 1] + log(p_s1[1,1]) - fy[i,1]);

    // t = 2:T
    for (t in 2:T){
      p_s1[t, 1] =  p[(t - 1), 1, 1] *      p_s1[(t - 1), 2]
                  + p[(t - 1), 2, 1] * (1 - p_s1[(t - 1), 2]);
      fy[i,t] = log_mix(p_s1[t, 1], s[1, t], s[2, t]);
      p_s1[t, 2] = exp(s[1, t] + log(p_s1[t, 1]) - fy[i,t]);
    }
  }
}
model {
  // likelihood
  target += fy;
  // priors
  mu ~ normal(0, 4);
  phi ~ normal(0.5, 0.25);
  gamma ~ normal(0.5, 1);
}
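
Regarding question a) above, one way to check whether this dependence is actually hurting sampling efficiency is to compare effective sample sizes across parameters (a sketch, assuming fit holds the reparameterized model):

print(fit, pars = c("gamma", "mu", "phi", "sigma"))
## low n_eff (relative to the 2000 post-warmup draws) for mu and phi would
## indicate that the mu-phi dependence is slowing the sampler down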