Speeding up hurdle model with latent factors and covariates?

Hello!

I want to fit two quite big latent factor model with a lot of zero, with two response matrix having correlated factor scores (N=199xS=32 and N=199xS=28). Data are hurdle-lognormally distributed, just like this :

``````Site V1   V2  V3
1     0   200 0
2    100  200 0
3     0    0  50
``````

Using the wide-matrix form and the if-else statement for the likelihood, the model takes a prohibitively long-time to run. I thought about using a long format to benefit from vectorization. For sake of simplicity, I will present my idea using only one of the two matrices.

I would split the above data set into two separate data sets, one for binary responses, `Y_bin` :

``````Site  Sp  Abund
1     V1    0
1     V2    1
1     V3    0
2     V1    1
2     V2    1
2     V3    0
3     V4    0
3     V5    0
3     V6    1
``````

And one for non-zero responses, `Y` :

``````Site  Sp  Abund
1     V2   200
2     V1   100
2     V2   200
3     V6    50
``````

Given a NxD factor matrix `FS` and a SxD loading matrix `L`, I could then write a likelihood like this

``````target += bernoulli_logit_lpmf(Y_bin | tau * (FS[site,] * L[sp,]'))
target += lognormal_lpdf(Y | FS[site,] * L[sp,]')
``````

Would it be equivalent to a hurdle-lorgnormal model as defined in the manual?

If yes, can I hope for speed-up?

Thank you very much for reading!
All the best,
Lucas

EDIT : Adding precision to the title.

For reference, here is my actual code, without the generated quantities block :

``````functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
*   y: the response value
*   mu: mean parameter of the lognormal distribution
*   sigma: sd parameter of the lognormal distribution
*   hu: linear predictor for the hurdle part
* Returns:
*   a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
/* hurdle poisson log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* log parameterization for the poisson part
* logit parameterization of the hurdle part
* Args:
*   y: the response value
*   eta: linear predictor for poisson part
*   hu: linear predictor for hurdle part
* Returns:
*   a scalar to be added to the log posterior
*/
real hurdle_poisson_log_logit_lpmf(int y, real eta, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
poisson_log_lpmf(y | eta) -
log1m_exp(-exp(eta));
}
}
matrix[S,D] L;

idx2 = 0;
for(i in 1:(D-1)) { for(j in (i+1):(D)){ L[i,j] = 0; } } //0 on upper diagonal
for(i in 1:D) L[i,i] = L_diag[i]; //Positive values on diagonal
for(j in 1:D) {
for(i in (j+1):S) {
idx2 = idx2+1;
}
}
return(L);
}
}
data {
// Flags
int<lower=0,upper=1> prior_only; //number of data points

// Integers
int<lower=0> N; //number of data points
int<lower=0> SV; //number of vascular species
int<lower=0> SM; //number of moss species
int<lower=0> DV; //number of factors for vascular species
int<lower=0> DM; //number of factors for moss species
int<lower=0> P; //number of unique plots

// Abundances
int VP[N,SV]; //vascular plants
real Moss[N,SM]; //vascular plants

// Covariates
int plot_idx[N];

// Prior parameters
real lam_sigma_plot; //sd of common plot intercepts
real lam_sigma_FS; //sd of factor scores
real eta_rho; //correlation between scores
real mean_a; real sd_a; //common intercepts
real lam_tau; //conversion factors between abundances and presence-absence
real lam_sigma_Moss; // Residual log-sd
}
transformed data {
}

// The parameters accepted by the model. Our model
// accepts two parameters 'mu' and 'sigma'.
parameters {
// Hyperparameters
real<lower=0> sigma_plot[2]; //sd of common plot intercepts
positive_ordered[DV] sigma_FS_VP; //sd of vascular plants factor scores
positive_ordered[DM] sigma_FS_Moss; //sd of vascular plants factor scores
cholesky_factor_corr[DV+DM] chol_rho; //correlation between scores
// Parameters
/// Raw factor scores
matrix[N,DV+DM] Z; //Un-centered factor scores of vascular plants
//// Vascular plants
//// Mosses
/// Common intercepts
real a[2];
/// Common plot intercepts
vector<multiplier=sigma_plot[1]>[P] a_plot_VP;
vector<multiplier=sigma_plot[2]>[P] a_plot_Moss;
/// Conversion factors between abundances and presence-absence
real<lower=0> tau[2];
/// Residual log-sd
vector<lower=0>[SM] sigma_Moss;
}
transformed parameters {
// Final factor scores
matrix[N,DV+DM] FS = Z * diag_post_multiply(chol_rho, append_row(sigma_FS_VP, sigma_FS_Moss));
matrix[N,DV] FS_VP = block(Z, 1, 1, N, DV);
matrix[N,DM] FS_Moss = block(Z, 1, DV+1, N, DM);
// Linear predictors
/// Vascular plants
matrix[N,SV] log_Lambda = FS_VP * L_VP';
matrix[N,SV] logit_pi_VP;
/// Mosses
matrix[N,SM] log_Mu = FS_Moss * L_Moss';
matrix[N,SM] logit_pi_Moss;

for(s in 1:SV) log_Lambda[,s] += a[1] + a_plot_VP[plot_idx];
for(s in 1:SV) log_Mu[,s] += a[2] + a_plot_Moss[plot_idx];

// Compute linear predictors for presence-absence
logit_pi_VP = tau[1] * log_Lambda;
logit_pi_Moss = tau[2] * log_Mu;
}
model {
// Hyperparameters
target += exponential_lpdf(sigma_plot | lam_sigma_plot); //sd of common plot intercepts
target += exponential_lpdf(sigma_FS_VP | lam_sigma_FS);  //sd of factor scores
target += exponential_lpdf(sigma_FS_Moss | lam_sigma_FS);  //sd of factor scores
target += lkj_corr_cholesky_lpdf(chol_rho | eta_rho ); //correlation between scores
// Parameters
/// Raw factor scores
target += std_normal_lpdf(to_vector(Z)); //Un-centered factor scores of vascular plants
//// Vascular plants
//// Mosses
/// Common plot intercepts
target += normal_lpdf(a | mean_a, sd_a);
/// Common plot intercepts
target += normal_lpdf(a_plot_VP | 0, sigma_plot[1]);
target += normal_lpdf(a_plot_Moss | 0, sigma_plot[2]);
/// Conversion factors between abundances and presence-absence
target += exponential_lpdf(tau | lam_tau);
/// Residual log-sd
target += exponential_lpdf(sigma_Moss | lam_sigma_Moss);

// likelihood including all constants
if (!prior_only) {
for (n in 1:N) {
for(s in 1:SV) target += hurdle_poisson_log_logit_lpmf(VP[n,s] | log_Lambda[n,s], logit_pi_VP[n,s]);
for(s in 1:SM) target += hurdle_lognormal_logit_lpdf(Moss[n,s] | log_Mu[n,s], sigma_Moss[s], logit_pi_Moss[n,s]);
}
}
}
``````

Makes sense to me.

The simplest way to check this will be to just write a function for both and make sure they’re returning the same value.

Like in generated quantities do:

``````generated quantities {
real lp1 = my_slow_lpdf_i_trust(...);
real lp2 = my_fast_lpdf_i_want_to_check(...);
real error = abs(lp1 - lp2);
}
``````

And then just look at the output distribution of `error`.

Probably. There will be a new release of cmdstan tomorrow that has a profiling feature that should make it easier to figure this out: CmdStan 2.26.0 release candidate (cmdstanr will support the profiling too)

The following code suggests the solutions are not equivalent :

Stan code

``````functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
*   y: the response value
*   mu: mean parameter of the lognormal distribution
*   sigma: sd parameter of the lognormal distribution
*   hu: linear predictor for the hurdle part
* Returns:
*   a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
}
data{
int N;
int S;
int N_nz;

real Pi_wide[N,S];
real Lambda_wide[N,S];

real Y_wide[N,S];
int Y_bin_long[N*S];
real Y_long[N_nz];

int site[N*S];
int sp[N*S];
}
parameters{
real a;
}
model{
a ~ normal(0,1);
}
generated quantities{
matrix[N,S] lp1;
matrix[N,S] lp2;
matrix[N,S] diff;

// Non-vectorized hurdle
for(n in 1:N){
for(s in 1:S){
lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
}
}
// Vectorized try
for(n in 1:(N*S)){
lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
if(Y_bin_long[n] > 0)
lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
}

diff = lp1 - lp2;
}
``````

Data generation

``````# Empty the environment
rm(list = ls())

library(tidyverse)
library(cmdstanr)
library(brms)

# Number of data points
N <- 50

# Create binary values for S species
## Generate proba of presence
Pi_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_bin_wide <- tibble(site = 1:N, S1 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide\$S1))),
S2 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide\$S2))))
## Generate abundances when present
Lambda_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))

Y_wide <- Y_bin_wide %>%
mutate(S1 = S1 * exp(Lambda_wide\$S1),
S2 = S2 * exp(Lambda_wide\$S2))

## Generate long format
Y_bin_long <- Y_bin_wide %>% pivot_longer(c(S1, S2), values_to = "bin",
names_to = "sp") %>%
mutate(sp_num = as.numeric(as.factor(sp)))
Y_long <- Y_wide %>% pivot_longer(c(S1, S2), values_to = "abund",
names_to = "sp") %>%
mutate(sp_num = as.numeric(as.factor(sp)))

StanDat <- list(N = nrow(Y_wide), S = 2, N_nz = nrow(Y_long),
Pi_wide = Pi_wide %>% select(-site),
Lambda_wide = Lambda_wide %>% select(-site),
Y_wide = Y_wide %>% select(-site),
Y_bin_long = Y_bin_long %>% pull(bin),
Y_long = Y_long %>% pull(abund),
site = Y_bin_long%>% pull(site),
sp = Y_bin_long%>% pull(sp_num))

# Compile model
mod <- cmdstan_model("Codes/Results_MossesPV_interactions/Test_Distr_WideToLong.stan")

fit <- mod\$sample(data = StanDat,
iter_warmup = 200, iter_sampling = 200,
chains = 1)
fit\$summary("diff") %>% View
``````

I think this is because the likelihood is “inversed”. In the likelihood generated by brms, a zero in data has the likelihood to observe 1 given hu, and a non-zero has the likelihood to observe 0 given hu + the lognormal likelihood.

Frankly, I do not know how to translate that using logistic regression, as in the vectorized example. I reat @betanalpha evoquing this, maybe you could have a suggestion?

Thank you very much :)
Lucas

I got it!!!

In conclusion : the 0 and 1 of the logistic regression part have to be inverted for the vectorized hurdle to work. It took me a long time to succeed because when I modified `Y_bin_long`, the `if` statement of the “vectorized” likelihood was messed up at the same time -_-

Currently, the code is not vectorized, because I focused more about testing the likelihood than producing an efficient code. But because it is already defined for the long format, it is mostly about removing the loop around the likelihood.

Final stan code :

``````functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
*   y: the response value
*   mu: mean parameter of the lognormal distribution
*   sigma: sd parameter of the lognormal distribution
*   hu: linear predictor for the hurdle part
* Returns:
*   a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
}
data{
int N;
int S;
int N_nz;

real Pi_wide[N,S];
real Lambda_wide[N,S];

real Y_wide[N,S];
int Y_bin_long[N*S];
real Y_long[N_nz];

int site[N*S];
int sp[N*S];
}
parameters{
real a;
}
model{
a ~ normal(0,1);
}
generated quantities{
matrix[N,S] lp1;
matrix[N,S] lp2;
matrix[N,S] diff;

// Non-vectorized hurdle
for(n in 1:N){
for(s in 1:S){
lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
}
}
// Vectorized try
for(n in 1:(N*S)){
lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
if(Y_long[n] > 0)
lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
}

diff = lp1 - lp2;
}
``````

Final data generating code

``````# Empty the environment
rm(list = ls())

library(tidyverse)
library(cmdstanr)
library(brms)

# Number of data points
N <- 5

# Create binary values for S species
## Generate proba of presence
Pi_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_bin_wide <- tibble(site = 1:N, S1 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide\$S1))),
S2 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide\$S2))))
## Generate abundances when present
Lambda_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))

Y_wide <- Y_bin_wide %>%
mutate(S1 = S1 * exp(Lambda_wide\$S1),
S2 = S2 * exp(Lambda_wide\$S2))

## Generate long format
Y_bin_long <- Y_bin_wide %>% pivot_longer(c(S1, S2), values_to = "bin",
names_to = "sp") %>%
mutate(sp_num = as.numeric(as.factor(sp)))
Y_long <- Y_wide %>% pivot_longer(c(S1, S2), values_to = "abund",
names_to = "sp") %>%
mutate(sp_num = as.numeric(as.factor(sp)))

StanDat <- list(N = nrow(Y_wide), S = 2, N_nz = nrow(Y_long),
Pi_wide = Pi_wide %>% select(-site),
Lambda_wide = Lambda_wide %>% select(-site),
Y_wide = Y_wide %>% select(-site),
Y_bin_long = Y_bin_long %>% mutate(bin = ifelse(bin == 0, 1, 0)) %>% pull(bin),
Y_long = Y_long %>% pull(abund),
site = Y_bin_long %>% pull(site),
sp = Y_bin_long %>% pull(sp_num))

# Compile model
mod <- cmdstan_model("Codes/Results_MossesPV_interactions/Test_Distr_WideToLong.stan")

fit <- mod\$sample(data = StanDat,
iter_warmup = 200, iter_sampling = 200,
chains = 1)
fit\$summary("lp1")
fit\$summary("lp2")
fit\$summary("diff")
``````

All the best,
Lucas

1 Like

Good stuff! If you get a chance post the fully vectorized solution too so if someone in the future stumbles on this they have it!