Speeding up hurdle model with latent factors and covariates?

Thank you very much for your answer!

The following code suggests the solutions are not equivalent :

Stan code

functions {
  /* hurdle lognormal log-PDF of a single response 
   * GENERATED BY BRMS v.2.14.4
   * logit parameterization of the hurdle part
   * Args: 
   *   y: the response value 
   *   mu: mean parameter of the lognormal distribution 
   *   sigma: sd parameter of the lognormal distribution
   *   hu: linear predictor for the hurdle part 
   * Returns:  
   *   a scalar to be added to the log posterior 
   */ 
  real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) { 
    if (y == 0) { 
      return bernoulli_logit_lpmf(1 | hu); 
    } else { 
      return bernoulli_logit_lpmf(0 | hu) +  
             lognormal_lpdf(y | mu, sigma); 
    } 
  } 
}
data{
  int N;
  int S;
  int N_nz;
  
  real Pi_wide[N,S];
  real Lambda_wide[N,S];
  
  real Y_wide[N,S];
  int Y_bin_long[N*S];
  real Y_long[N_nz];
  
  int site[N*S];
  int sp[N*S];
}
parameters{
  real a;
}
model{
  a ~ normal(0,1);
}
generated quantities{
  matrix[N,S] lp1;
  matrix[N,S] lp2; 
  matrix[N,S] diff;
  
  // Non-vectorized hurdle
  for(n in 1:N){
    for(s in 1:S){
      lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
    }
  }
  // Vectorized try
  for(n in 1:(N*S)){
    lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
    if(Y_bin_long[n] > 0) 
      lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
  }
  
  diff = lp1 - lp2;
}

Data generation

# Empty the environment
rm(list = ls())

# Load libraries
library(tidyverse)
library(cmdstanr)
library(brms)

# Number of data points
N <- 50

# Create binary values for S species
## Generate proba of presence
Pi_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_bin_wide <- tibble(site = 1:N, S1 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S1))),
                S2 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S2))))
## Generate abundances when present
Lambda_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))

Y_wide <- Y_bin_wide %>% 
  mutate(S1 = S1 * exp(Lambda_wide$S1),
         S2 = S2 * exp(Lambda_wide$S2))

## Generate long format
Y_bin_long <- Y_bin_wide %>% pivot_longer(c(S1, S2), values_to = "bin", 
                                          names_to = "sp") %>% 
  mutate(sp_num = as.numeric(as.factor(sp)))
Y_long <- Y_wide %>% pivot_longer(c(S1, S2), values_to = "abund", 
                                  names_to = "sp") %>% 
  mutate(sp_num = as.numeric(as.factor(sp)))

StanDat <- list(N = nrow(Y_wide), S = 2, N_nz = nrow(Y_long),
                Pi_wide = Pi_wide %>% select(-site),
                Lambda_wide = Lambda_wide %>% select(-site),
                Y_wide = Y_wide %>% select(-site),
                Y_bin_long = Y_bin_long %>% pull(bin),
                Y_long = Y_long %>% pull(abund),
                site = Y_bin_long%>% pull(site),
                sp = Y_bin_long%>% pull(sp_num))

# Compile model
mod <- cmdstan_model("Codes/Results_MossesPV_interactions/Test_Distr_WideToLong.stan")

fit <- mod$sample(data = StanDat, 
                  iter_warmup = 200, iter_sampling = 200,
                  chains = 1)
fit$summary("diff") %>% View

I think this is because the likelihood is “inversed”. In the likelihood generated by brms, a zero in data has the likelihood to observe 1 given hu, and a non-zero has the likelihood to observe 0 given hu + the lognormal likelihood.

Frankly, I do not know how to translate that using logistic regression, as in the vectorized example. I reat @betanalpha evoquing this, maybe you could have a suggestion?

Thank you very much :)
Lucas