Speeding up hurdle model with latent factors and covariates?

Hello!

I want to fit two quite big latent factor model with a lot of zero, with two response matrix having correlated factor scores (N=199xS=32 and N=199xS=28). Data are hurdle-lognormally distributed, just like this :

Site V1   V2  V3
1     0   200 0
2    100  200 0
3     0    0  50

Using the wide-matrix form and the if-else statement for the likelihood, the model takes a prohibitively long-time to run. I thought about using a long format to benefit from vectorization. For sake of simplicity, I will present my idea using only one of the two matrices.

I would split the above data set into two separate data sets, one for binary responses, Y_bin :

Site  Sp  Abund
1     V1    0
1     V2    1
1     V3    0
2     V1    1
2     V2    1
2     V3    0
3     V4    0
3     V5    0
3     V6    1

And one for non-zero responses, Y :

Site  Sp  Abund
1     V2   200
2     V1   100
2     V2   200
3     V6    50

Given a NxD factor matrix FS and a SxD loading matrix L, I could then write a likelihood like this

target += bernoulli_logit_lpmf(Y_bin | tau * (FS[site,] * L[sp,]'))
target += lognormal_lpdf(Y | FS[site,] * L[sp,]')

Would it be equivalent to a hurdle-lorgnormal model as defined in the manual?

If yes, can I hope for speed-up?

Thank you very much for reading!
All the best,
Lucas

EDIT : Adding precision to the title.

For reference, here is my actual code, without the generated quantities block :

functions {
  /* hurdle lognormal log-PDF of a single response 
   * GENERATED BY BRMS v.2.14.4
   * logit parameterization of the hurdle part
   * Args: 
   *   y: the response value 
   *   mu: mean parameter of the lognormal distribution 
   *   sigma: sd parameter of the lognormal distribution
   *   hu: linear predictor for the hurdle part 
   * Returns:  
   *   a scalar to be added to the log posterior 
   */ 
  real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) { 
    if (y == 0) { 
      return bernoulli_logit_lpmf(1 | hu); 
    } else { 
      return bernoulli_logit_lpmf(0 | hu) +  
             lognormal_lpdf(y | mu, sigma); 
    } 
  } 
    /* hurdle poisson log-PDF of a single response 
   * GENERATED BY BRMS v.2.14.4
   * log parameterization for the poisson part
   * logit parameterization of the hurdle part
   * Args: 
   *   y: the response value 
   *   eta: linear predictor for poisson part 
   *   hu: linear predictor for hurdle part 
   * Returns:  
   *   a scalar to be added to the log posterior 
   */ 
  real hurdle_poisson_log_logit_lpmf(int y, real eta, real hu) { 
    if (y == 0) { 
      return bernoulli_logit_lpmf(1 | hu); 
    } else { 
      return bernoulli_logit_lpmf(0 | hu) +  
             poisson_log_lpmf(y | eta) - 
             log1m_exp(-exp(eta)); 
    } 
  }
  // Function to fill loading matrices
  matrix fill_loadings(int D, int S, vector L_lower, vector L_diag){
    matrix[S,D] L;
    
    int idx2; //Index for the lower diagonal loadings
    idx2 = 0;
    for(i in 1:(D-1)) { for(j in (i+1):(D)){ L[i,j] = 0; } } //0 on upper diagonal
    for(i in 1:D) L[i,i] = L_diag[i]; //Positive values on diagonal
    for(j in 1:D) {
     for(i in (j+1):S) {
       idx2 = idx2+1;
       L[i,j] = L_lower[idx2]; //Insert lower diagonal values in loadings matrix
       }
      }
      return(L);
    }
}
data {
  // Flags
  int<lower=0,upper=1> prior_only; //number of data points
  
  // Integers
  int<lower=0> N; //number of data points
  int<lower=0> SV; //number of vascular species
  int<lower=0> SM; //number of moss species
  int<lower=0> DV; //number of factors for vascular species
  int<lower=0> DM; //number of factors for moss species
  int<lower=0> P; //number of unique plots
  
  // Abundances
  int VP[N,SV]; //vascular plants
  real Moss[N,SM]; //vascular plants
  
  // Covariates
  int plot_idx[N];
  
  // Prior parameters
  real lam_sigma_plot; //sd of common plot intercepts
  real lam_sigma_FS; //sd of factor scores
  real eta_rho; //correlation between scores
  real mean_a; real sd_a; //common intercepts
  real lam_tau; //conversion factors between abundances and presence-absence
  real lam_sigma_Moss; // Residual log-sd
}
transformed data {
  int<lower=1> MV = DV*(SV-DV)+ DV*(DV-1)/2;  // number of lower-triangular, non-zero loadings
  int<lower=1> MM = DM*(SM-DM)+ DM*(DM-1)/2;  // number of lower-triangular, non-zero loadings
}

// The parameters accepted by the model. Our model
// accepts two parameters 'mu' and 'sigma'.
parameters {
  // Hyperparameters
  real<lower=0> sigma_plot[2]; //sd of common plot intercepts
  positive_ordered[DV] sigma_FS_VP; //sd of vascular plants factor scores
  positive_ordered[DM] sigma_FS_Moss; //sd of vascular plants factor scores
  cholesky_factor_corr[DV+DM] chol_rho; //correlation between scores
  // Parameters
  /// Raw factor scores
  matrix[N,DV+DM] Z; //Un-centered factor scores of vascular plants
  /// Loadings
  //// Vascular plants
  vector[MV] L_lower_VP; //Lower diagonal loadings of vascular plants
  vector[DV] L_diag_VP; //Diagonal loadings of vascular plants
  //// Mosses
  vector[MM] L_lower_Moss; //Lower diagonal loadings of mosses
  vector[DV] L_diag_Moss; //Lower diagonal loadings of mosses
  /// Common intercepts
  real a[2];
  /// Common plot intercepts
  vector<multiplier=sigma_plot[1]>[P] a_plot_VP;
  vector<multiplier=sigma_plot[2]>[P] a_plot_Moss;
  /// Conversion factors between abundances and presence-absence
  real<lower=0> tau[2];
  /// Residual log-sd
  vector<lower=0>[SM] sigma_Moss;
}
transformed parameters {
  // Final factor scores
  matrix[N,DV+DM] FS = Z * diag_post_multiply(chol_rho, append_row(sigma_FS_VP, sigma_FS_Moss));
  matrix[N,DV] FS_VP = block(Z, 1, 1, N, DV);
  matrix[N,DM] FS_Moss = block(Z, 1, DV+1, N, DM);
  // Final loading matrices
  matrix[SV,DV] L_VP = fill_loadings(DV, SV, L_lower_VP, L_diag_VP);
  matrix[SM,DM] L_Moss = fill_loadings(DM, SM, L_lower_Moss, L_diag_Moss);
  // Linear predictors
  /// Vascular plants
  matrix[N,SV] log_Lambda = FS_VP * L_VP';
  matrix[N,SV] logit_pi_VP;
  /// Mosses
  matrix[N,SM] log_Mu = FS_Moss * L_Moss';
  matrix[N,SM] logit_pi_Moss;
  
  // Add other predictors
  for(s in 1:SV) log_Lambda[,s] += a[1] + a_plot_VP[plot_idx];
  for(s in 1:SV) log_Mu[,s] += a[2] + a_plot_Moss[plot_idx];
  
  // Compute linear predictors for presence-absence
  logit_pi_VP = tau[1] * log_Lambda;
  logit_pi_Moss = tau[2] * log_Mu;
}
model {
  // Hyperparameters
  target += exponential_lpdf(sigma_plot | lam_sigma_plot); //sd of common plot intercepts
  target += exponential_lpdf(sigma_FS_VP | lam_sigma_FS);  //sd of factor scores
  target += exponential_lpdf(sigma_FS_Moss | lam_sigma_FS);  //sd of factor scores
  target += lkj_corr_cholesky_lpdf(chol_rho | eta_rho ); //correlation between scores
  // Parameters
  /// Raw factor scores
  target += std_normal_lpdf(to_vector(Z)); //Un-centered factor scores of vascular plants
  /// Loadings
  //// Vascular plants
  target += std_normal_lpdf(L_lower_VP); //Lower diagonal loadings of vascular plants
  target += std_normal_lpdf(L_diag_VP); //Diagonal loadings of vascular plants
  //// Mosses
  target += std_normal_lpdf(L_lower_Moss); //Lower diagonal loadings of mosses
  target += std_normal_lpdf(L_diag_Moss); //Diagonal loadings of mosses
  /// Common plot intercepts
  target += normal_lpdf(a | mean_a, sd_a);
  /// Common plot intercepts
  target += normal_lpdf(a_plot_VP | 0, sigma_plot[1]);
  target += normal_lpdf(a_plot_Moss | 0, sigma_plot[2]);
  /// Conversion factors between abundances and presence-absence
  target += exponential_lpdf(tau | lam_tau);
  /// Residual log-sd
  target += exponential_lpdf(sigma_Moss | lam_sigma_Moss);
    
  // likelihood including all constants
  if (!prior_only) {
    for (n in 1:N) {
      for(s in 1:SV) target += hurdle_poisson_log_logit_lpmf(VP[n,s] | log_Lambda[n,s], logit_pi_VP[n,s]);
      for(s in 1:SM) target += hurdle_lognormal_logit_lpdf(Moss[n,s] | log_Mu[n,s], sigma_Moss[s], logit_pi_Moss[n,s]);
    }
  }
}

Makes sense to me.

The simplest way to check this will be to just write a function for both and make sure they’re returning the same value.

Like in generated quantities do:

generated quantities {
  real lp1 = my_slow_lpdf_i_trust(...);
  real lp2 = my_fast_lpdf_i_want_to_check(...);
  real error = abs(lp1 - lp2);
}

And then just look at the output distribution of error.

Probably. There will be a new release of cmdstan tomorrow that has a profiling feature that should make it easier to figure this out: CmdStan 2.26.0 release candidate (cmdstanr will support the profiling too)

Thank you very much for your answer!

The following code suggests the solutions are not equivalent :

Stan code

functions {
  /* hurdle lognormal log-PDF of a single response 
   * GENERATED BY BRMS v.2.14.4
   * logit parameterization of the hurdle part
   * Args: 
   *   y: the response value 
   *   mu: mean parameter of the lognormal distribution 
   *   sigma: sd parameter of the lognormal distribution
   *   hu: linear predictor for the hurdle part 
   * Returns:  
   *   a scalar to be added to the log posterior 
   */ 
  real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) { 
    if (y == 0) { 
      return bernoulli_logit_lpmf(1 | hu); 
    } else { 
      return bernoulli_logit_lpmf(0 | hu) +  
             lognormal_lpdf(y | mu, sigma); 
    } 
  } 
}
data{
  int N;
  int S;
  int N_nz;
  
  real Pi_wide[N,S];
  real Lambda_wide[N,S];
  
  real Y_wide[N,S];
  int Y_bin_long[N*S];
  real Y_long[N_nz];
  
  int site[N*S];
  int sp[N*S];
}
parameters{
  real a;
}
model{
  a ~ normal(0,1);
}
generated quantities{
  matrix[N,S] lp1;
  matrix[N,S] lp2; 
  matrix[N,S] diff;
  
  // Non-vectorized hurdle
  for(n in 1:N){
    for(s in 1:S){
      lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
    }
  }
  // Vectorized try
  for(n in 1:(N*S)){
    lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
    if(Y_bin_long[n] > 0) 
      lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
  }
  
  diff = lp1 - lp2;
}

Data generation

# Empty the environment
rm(list = ls())

# Load libraries
library(tidyverse)
library(cmdstanr)
library(brms)

# Number of data points
N <- 50

# Create binary values for S species
## Generate proba of presence
Pi_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_bin_wide <- tibble(site = 1:N, S1 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S1))),
                S2 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S2))))
## Generate abundances when present
Lambda_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))

Y_wide <- Y_bin_wide %>% 
  mutate(S1 = S1 * exp(Lambda_wide$S1),
         S2 = S2 * exp(Lambda_wide$S2))

## Generate long format
Y_bin_long <- Y_bin_wide %>% pivot_longer(c(S1, S2), values_to = "bin", 
                                          names_to = "sp") %>% 
  mutate(sp_num = as.numeric(as.factor(sp)))
Y_long <- Y_wide %>% pivot_longer(c(S1, S2), values_to = "abund", 
                                  names_to = "sp") %>% 
  mutate(sp_num = as.numeric(as.factor(sp)))

StanDat <- list(N = nrow(Y_wide), S = 2, N_nz = nrow(Y_long),
                Pi_wide = Pi_wide %>% select(-site),
                Lambda_wide = Lambda_wide %>% select(-site),
                Y_wide = Y_wide %>% select(-site),
                Y_bin_long = Y_bin_long %>% pull(bin),
                Y_long = Y_long %>% pull(abund),
                site = Y_bin_long%>% pull(site),
                sp = Y_bin_long%>% pull(sp_num))

# Compile model
mod <- cmdstan_model("Codes/Results_MossesPV_interactions/Test_Distr_WideToLong.stan")

fit <- mod$sample(data = StanDat, 
                  iter_warmup = 200, iter_sampling = 200,
                  chains = 1)
fit$summary("diff") %>% View

I think this is because the likelihood is “inversed”. In the likelihood generated by brms, a zero in data has the likelihood to observe 1 given hu, and a non-zero has the likelihood to observe 0 given hu + the lognormal likelihood.

Frankly, I do not know how to translate that using logistic regression, as in the vectorized example. I reat @betanalpha evoquing this, maybe you could have a suggestion?

Thank you very much :)
Lucas

I got it!!!

In conclusion : the 0 and 1 of the logistic regression part have to be inverted for the vectorized hurdle to work. It took me a long time to succeed because when I modified Y_bin_long, the if statement of the “vectorized” likelihood was messed up at the same time -_-

Currently, the code is not vectorized, because I focused more about testing the likelihood than producing an efficient code. But because it is already defined for the long format, it is mostly about removing the loop around the likelihood.

Final stan code :

functions {
  /* hurdle lognormal log-PDF of a single response 
   * GENERATED BY BRMS v.2.14.4
   * logit parameterization of the hurdle part
   * Args: 
   *   y: the response value 
   *   mu: mean parameter of the lognormal distribution 
   *   sigma: sd parameter of the lognormal distribution
   *   hu: linear predictor for the hurdle part 
   * Returns:  
   *   a scalar to be added to the log posterior 
   */ 
  real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) { 
    if (y == 0) { 
      return bernoulli_logit_lpmf(1 | hu); 
    } else { 
      return bernoulli_logit_lpmf(0 | hu) +  
             lognormal_lpdf(y | mu, sigma); 
    } 
  } 
}
data{
  int N;
  int S;
  int N_nz;
  
  real Pi_wide[N,S];
  real Lambda_wide[N,S];
  
  real Y_wide[N,S];
  int Y_bin_long[N*S];
  real Y_long[N_nz];
  
  int site[N*S];
  int sp[N*S];
}
parameters{
  real a;
}
model{
  a ~ normal(0,1);
}
generated quantities{
  matrix[N,S] lp1;
  matrix[N,S] lp2; 
  matrix[N,S] diff;
  
  // Non-vectorized hurdle
  for(n in 1:N){
    for(s in 1:S){
      lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
    }
  }
  // Vectorized try
  for(n in 1:(N*S)){
    lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
    if(Y_long[n] > 0) 
      lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
  }
  
  diff = lp1 - lp2;
}

Final data generating code

# Empty the environment
rm(list = ls())

# Load libraries
library(tidyverse)
library(cmdstanr)
library(brms)

# Number of data points
N <- 5

# Create binary values for S species
## Generate proba of presence
Pi_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_bin_wide <- tibble(site = 1:N, S1 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S1))),
                S2 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S2))))
## Generate abundances when present
Lambda_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))

Y_wide <- Y_bin_wide %>% 
  mutate(S1 = S1 * exp(Lambda_wide$S1),
         S2 = S2 * exp(Lambda_wide$S2))

## Generate long format
Y_bin_long <- Y_bin_wide %>% pivot_longer(c(S1, S2), values_to = "bin", 
                                          names_to = "sp") %>% 
  mutate(sp_num = as.numeric(as.factor(sp)))
Y_long <- Y_wide %>% pivot_longer(c(S1, S2), values_to = "abund", 
                                  names_to = "sp") %>% 
  mutate(sp_num = as.numeric(as.factor(sp)))

StanDat <- list(N = nrow(Y_wide), S = 2, N_nz = nrow(Y_long),
                Pi_wide = Pi_wide %>% select(-site),
                Lambda_wide = Lambda_wide %>% select(-site),
                Y_wide = Y_wide %>% select(-site),
                Y_bin_long = Y_bin_long %>% mutate(bin = ifelse(bin == 0, 1, 0)) %>% pull(bin),
                Y_long = Y_long %>% pull(abund),
                site = Y_bin_long %>% pull(site),
                sp = Y_bin_long %>% pull(sp_num))

# Compile model
mod <- cmdstan_model("Codes/Results_MossesPV_interactions/Test_Distr_WideToLong.stan")

fit <- mod$sample(data = StanDat, 
                  iter_warmup = 200, iter_sampling = 200,
                  chains = 1)
fit$summary("lp1")
fit$summary("lp2")
fit$summary("diff")

All the best,
Lucas

1 Like

Good stuff! If you get a chance post the fully vectorized solution too so if someone in the future stumbles on this they have it!