LFO-CV for multivariate autoregressive model

Hi everyone,

I am trying to implement the approximate LFO-CV from this vignette using Stan directly instead of brms. The model I am using is a multivariate autoregressive model with latent variables to infer the variance-covariance matrix. Also, I defined the log_lik in the generated quantities block.

I am bit stuck translating some of the parts of the vignette to make it work without brms. In the vignette, the line

fit_past <- update(fit, newdata = df_past, recompile = FALSE, cores = 4)

refits the model to the data up to L, and the line

loglik <- log_lik(fit_past, newdata = df_oos, oos = oos)

obtains the the log-likfor the data up to L+1. Is that right?

What would be the equivalent using Stan directly? Should I just refit the model to the time series going from 1 to L+1 and extract the log_lik for L+1?

Here is the Stan model:

data {
  int<lower = 0> N;                   // Number of observations (rows in the data)
  int<lower=1> S;                     // Number of Secies
  int<lower=1> T;                     // Numer of years
  // as the N, indicating which reef each row corresponds to
  int<lower = 1, upper = T> time[N];  // integer vector with reef-time 
  // (not absolute time). Each reef starts at 1
  int <lower=1> D;        // Number of latent dimensions
  int Y[N,S];    // The outcome matrix
}
transformed data{
  int<lower=1> M;
  M  = D*(S-D)+ D*(D-1)/2;  // number of non-zero loadings
}
parameters {
  real beta_mu;
  real alpha_mu;
  vector[S] log_lambda[N];  // latent poisson parameters
  
  // top level, covariance matrix with latent factors
  vector[M] L_t;   // lower diagonal elements of L
  vector[D] L_d;   // diagonal elements of L
  vector<lower=0>[S] psi;         // vector of variances
  
  // species level
  real<lower=0> beta_sigma;
  real<lower=0> alpha_sigma;
  vector[S] z_alpha;
  vector[S] z_beta;

  
}
transformed parameters{
  matrix[S,D] L;  //lower triangular factor loadings Matrix 
  cov_matrix[S] Omega;   // Covariance matrix
  matrix[S,S] L_Omega;
  vector[S] alpha;   // Vector of intrinsic growth rates
  vector[S] beta;   // matrix with competitive parameter

  alpha = alpha_mu + alpha_sigma * z_alpha;
  beta = beta_mu + beta_sigma * z_beta;
  


{
  int idx;
  idx = 0;
  for(i in 1:(D-1)){
    for(j in (i+1):D){
      L[i,j] = 0; //constrain the upper triangular elements to zero 
    }
  }
  for(j in 1:D) {
    L[j,j] = L_d[j];
    for(i in (j+1):S) {
      idx = idx + 1;
      L[i,j] = L_t[idx];
    } 
  }
} 

  Omega = L*L';
  for (i in 1:rows(Omega))
  Omega[i,i] = Omega[i,i] + psi[i];
  L_Omega = cholesky_decompose(Omega);

}
model {

  // The hyperpriors
  alpha_mu ~ normal(0,1);
  alpha_sigma ~ exponential(1);
  z_alpha ~ normal(0,1);
  
  beta_mu ~ normal(0,1);
  beta_sigma ~ exponential(1);
  z_beta ~ normal(0,1);
  
// the priors 
  L_d ~ normal(0,10);
  L_t ~ normal(0,10);
  psi ~ exponential(5);
  
  for(n in 1:N){
    if(time[n]==1){
      log_lambda[n] ~ normal(0,5);
    }
    if(time[n]>1){
      log_lambda[n] ~ multi_normal_cholesky(alpha + diag_matrix(beta)*log_lambda[n-1], L_Omega);
    }
  }
  
  
//The likelihood
  for(s in 1:S){
      target += poisson_log_lpmf(Y[,s] | log_lambda[1:N,s]);
  }
}
generated quantities{
  int<lower = 0> Y_pred[N,S];
  vector[N] log_lik;
  vector[S] log_lambda_pred[N];
  matrix[S,S] Rho; // Correlation matrix
  vector[S] sde;   // species' sd
  matrix[S,D] L_invar = L; // constrained factor loadings

    for(d in 1:D){
      if(L[d,d]<0){
        L_invar[,d] = -1 * L[,d];
    }
  }
  for(i in 1:S)
    for(j in 1:S)
      Rho[i,j] = Omega[i,j]/sqrt(Omega[i,i] * Omega[j,j]);
  
  sde = sqrt(diagonal(Omega));
  
  for(n in 1:N){
    if(time[n]==1){
      log_lambda_pred[n] = log_lambda[n];
    }
    if(time[n]>1){
      log_lambda_pred[n] = multi_normal_cholesky_rng(alpha + diag_matrix(beta)*log_lambda_pred[n-1], L_Omega);
    }
  }
  
  for(s in 1:S){
    for(n in 1:N){
      if(log_lambda_pred[n,s]>20){
        Y_pred[n,s] = poisson_log_rng(20);
      }else{
        Y_pred[n,s] = poisson_log_rng(log_lambda_pred[n,s]);
      }
      log_lik[n] = poisson_log_lpmf(Y[n] | log_lambda[n]);
    }
  }
  
}

And here is R code generating toy data and fitting the model (NB for the data simulated below, the model takes around 20 minutes to fit using my pc)

# simulate time series
Gompertz.sim <- function(init.pop, alpha, beta, time, sp, Rho, sd_noise){
  sim_data <- matrix(NA, nrow = time, ncol = sp+1)
  colnames(sim_data)<-c(c("Year"),as.character(1:sp))
  sim_data<- as.data.frame(sim_data)
  sim_data$Year <- 1:time
  community <- matrix(NA, nrow = time, ncol = sp)
  community[1,] <- init.pop 
  log_lambda <- matrix(NA, nrow = time, ncol = sp)
  log_lambda[1,] <- init.pop
  for (t in 2:time) {
    log_lambda[t,]<- alpha + beta*log_lambda[t-1,] + 
      t(rmvnorm2(1, Mu = rep(0, sp), 
                 sigma = diag(sd_noise), 
                 Rho = Rho))
    for(s in 1:sp){
      community[t,s] <- rpois(1,lambda = exp(log_lambda[t,s]))
    }
  }
  
  for(index in 1:nrow(sim_data)){
    sim_data[index,-(1)]<-community[index,]
  }
  
  return(results = list(abundance.array = community, Simulation = sim_data))
}

set.seed(2021)

P0<- 1
sp <- 20
t <- 50
alpha <- rnorm(sp, 0.4, 0.1)

beta <- rtruncnorm(sp, a = -Inf, b = 0.95, mean = 0.9, sd = 0.1)

sd_noise <- matrix(0, nrow = sp, ncol = sp)
diag(sd_noise) <- rlnorm(sp, -1.5, 0.1)

Rho <- rlkjcorr(1, sp, eta=0.5)
# Rho <- matrix(0, ncol = sp, nrow = sp)
# diag(Rho) <- 1

Sigma <- sd_noise%*%Rho%*%sd_noise
Sim.data <- Gompertz.sim(init.pop = P0, alpha = alpha, beta = beta, time = t, sp = sp, sd_noise = sd_noise, Rho = Rho)
matplot(matrix(rep(1:t, sp), nrow = t, ncol = sp), log(Sim.data$abundance.array), type = rep("l",sp), lty = 1,lwd = 2, col = 1:sp)

sim.data_2 <- Sim.data$Simulation

sim.data_2 <- sim.data_2 %>%
  mutate(time = 1:n())

dat<-list(time = sim.data_2$time,
          T = max(sim.data_2$time),
          N = nrow(sim.data_2),
          Y = sim.data_2[,-(c(1,ncol(sim.data_2)))],
          S = ncol(sim.data_2[,-(c(1,ncol(sim.data_2)))]),
          D = 9
)


MAR_model <- stan(file = "poi_MVLN_model_ppc.stan",
                  data = dat,
                  chains = 4, 
                  cores = 4,
                  iter = 3000,
                  control = list(adapt_delta = 0.9, max_treedepth = 10))


Thanks for your time,
Alfonso

Yes, that’s IMHO what this does.

Do I get correctly that you have no further questions and are just showing the model/data for completeness? Or do you have some additional concerns?

Best of luck with your model!