Hi everyone,
I am trying to implement the approximate LFO-CV from this vignette using Stan directly instead of brms. The model I am using is a multivariate autoregressive model with latent variables to infer the variance-covariance matrix. Also, I defined the log_lik in the generated quantities block.
I am bit stuck translating some of the parts of the vignette to make it work without brms. In the vignette, the line
fit_past <- update(fit, newdata = df_past, recompile = FALSE, cores = 4)
refits the model to the data up to L, and the line
loglik <- log_lik(fit_past, newdata = df_oos, oos = oos)
obtains the the log-likfor the data up to L+1. Is that right?
What would be the equivalent using Stan directly? Should I just refit the model to the time series going from 1 to L+1 and extract the log_lik for L+1?
Here is the Stan model:
data {
int<lower = 0> N; // Number of observations (rows in the data)
int<lower=1> S; // Number of Secies
int<lower=1> T; // Numer of years
// as the N, indicating which reef each row corresponds to
int<lower = 1, upper = T> time[N]; // integer vector with reef-time
// (not absolute time). Each reef starts at 1
int <lower=1> D; // Number of latent dimensions
int Y[N,S]; // The outcome matrix
}
transformed data{
int<lower=1> M;
M = D*(S-D)+ D*(D-1)/2; // number of non-zero loadings
}
parameters {
real beta_mu;
real alpha_mu;
vector[S] log_lambda[N]; // latent poisson parameters
// top level, covariance matrix with latent factors
vector[M] L_t; // lower diagonal elements of L
vector[D] L_d; // diagonal elements of L
vector<lower=0>[S] psi; // vector of variances
// species level
real<lower=0> beta_sigma;
real<lower=0> alpha_sigma;
vector[S] z_alpha;
vector[S] z_beta;
}
transformed parameters{
matrix[S,D] L; //lower triangular factor loadings Matrix
cov_matrix[S] Omega; // Covariance matrix
matrix[S,S] L_Omega;
vector[S] alpha; // Vector of intrinsic growth rates
vector[S] beta; // matrix with competitive parameter
alpha = alpha_mu + alpha_sigma * z_alpha;
beta = beta_mu + beta_sigma * z_beta;
{
int idx;
idx = 0;
for(i in 1:(D-1)){
for(j in (i+1):D){
L[i,j] = 0; //constrain the upper triangular elements to zero
}
}
for(j in 1:D) {
L[j,j] = L_d[j];
for(i in (j+1):S) {
idx = idx + 1;
L[i,j] = L_t[idx];
}
}
}
Omega = L*L';
for (i in 1:rows(Omega))
Omega[i,i] = Omega[i,i] + psi[i];
L_Omega = cholesky_decompose(Omega);
}
model {
// The hyperpriors
alpha_mu ~ normal(0,1);
alpha_sigma ~ exponential(1);
z_alpha ~ normal(0,1);
beta_mu ~ normal(0,1);
beta_sigma ~ exponential(1);
z_beta ~ normal(0,1);
// the priors
L_d ~ normal(0,10);
L_t ~ normal(0,10);
psi ~ exponential(5);
for(n in 1:N){
if(time[n]==1){
log_lambda[n] ~ normal(0,5);
}
if(time[n]>1){
log_lambda[n] ~ multi_normal_cholesky(alpha + diag_matrix(beta)*log_lambda[n-1], L_Omega);
}
}
//The likelihood
for(s in 1:S){
target += poisson_log_lpmf(Y[,s] | log_lambda[1:N,s]);
}
}
generated quantities{
int<lower = 0> Y_pred[N,S];
vector[N] log_lik;
vector[S] log_lambda_pred[N];
matrix[S,S] Rho; // Correlation matrix
vector[S] sde; // species' sd
matrix[S,D] L_invar = L; // constrained factor loadings
for(d in 1:D){
if(L[d,d]<0){
L_invar[,d] = -1 * L[,d];
}
}
for(i in 1:S)
for(j in 1:S)
Rho[i,j] = Omega[i,j]/sqrt(Omega[i,i] * Omega[j,j]);
sde = sqrt(diagonal(Omega));
for(n in 1:N){
if(time[n]==1){
log_lambda_pred[n] = log_lambda[n];
}
if(time[n]>1){
log_lambda_pred[n] = multi_normal_cholesky_rng(alpha + diag_matrix(beta)*log_lambda_pred[n-1], L_Omega);
}
}
for(s in 1:S){
for(n in 1:N){
if(log_lambda_pred[n,s]>20){
Y_pred[n,s] = poisson_log_rng(20);
}else{
Y_pred[n,s] = poisson_log_rng(log_lambda_pred[n,s]);
}
log_lik[n] = poisson_log_lpmf(Y[n] | log_lambda[n]);
}
}
}
And here is R code generating toy data and fitting the model (NB for the data simulated below, the model takes around 20 minutes to fit using my pc)
# simulate time series
Gompertz.sim <- function(init.pop, alpha, beta, time, sp, Rho, sd_noise){
sim_data <- matrix(NA, nrow = time, ncol = sp+1)
colnames(sim_data)<-c(c("Year"),as.character(1:sp))
sim_data<- as.data.frame(sim_data)
sim_data$Year <- 1:time
community <- matrix(NA, nrow = time, ncol = sp)
community[1,] <- init.pop
log_lambda <- matrix(NA, nrow = time, ncol = sp)
log_lambda[1,] <- init.pop
for (t in 2:time) {
log_lambda[t,]<- alpha + beta*log_lambda[t-1,] +
t(rmvnorm2(1, Mu = rep(0, sp),
sigma = diag(sd_noise),
Rho = Rho))
for(s in 1:sp){
community[t,s] <- rpois(1,lambda = exp(log_lambda[t,s]))
}
}
for(index in 1:nrow(sim_data)){
sim_data[index,-(1)]<-community[index,]
}
return(results = list(abundance.array = community, Simulation = sim_data))
}
set.seed(2021)
P0<- 1
sp <- 20
t <- 50
alpha <- rnorm(sp, 0.4, 0.1)
beta <- rtruncnorm(sp, a = -Inf, b = 0.95, mean = 0.9, sd = 0.1)
sd_noise <- matrix(0, nrow = sp, ncol = sp)
diag(sd_noise) <- rlnorm(sp, -1.5, 0.1)
Rho <- rlkjcorr(1, sp, eta=0.5)
# Rho <- matrix(0, ncol = sp, nrow = sp)
# diag(Rho) <- 1
Sigma <- sd_noise%*%Rho%*%sd_noise
Sim.data <- Gompertz.sim(init.pop = P0, alpha = alpha, beta = beta, time = t, sp = sp, sd_noise = sd_noise, Rho = Rho)
matplot(matrix(rep(1:t, sp), nrow = t, ncol = sp), log(Sim.data$abundance.array), type = rep("l",sp), lty = 1,lwd = 2, col = 1:sp)
sim.data_2 <- Sim.data$Simulation
sim.data_2 <- sim.data_2 %>%
mutate(time = 1:n())
dat<-list(time = sim.data_2$time,
T = max(sim.data_2$time),
N = nrow(sim.data_2),
Y = sim.data_2[,-(c(1,ncol(sim.data_2)))],
S = ncol(sim.data_2[,-(c(1,ncol(sim.data_2)))]),
D = 9
)
MAR_model <- stan(file = "poi_MVLN_model_ppc.stan",
data = dat,
chains = 4,
cores = 4,
iter = 3000,
control = list(adapt_delta = 0.9, max_treedepth = 10))
Thanks for your time,
Alfonso