I want to fit two fairly large latent factor models with a lot of zeros, with the two response matrices having correlated factor scores (N = 199 × S = 32 and N = 199 × S = 28). The data are hurdle-lognormally distributed, like this:
Site V1 V2 V3
1 0 200 0
2 100 200 0
3 0 0 50
Using the wide-matrix form and an if-else statement for the likelihood, the model takes a prohibitively long time to run. I thought about using a long format to benefit from vectorization. For the sake of simplicity, I will present my idea using only one of the two matrices.
I would split the above data set into two separate data sets: one for the binary responses, Y_bin, and one for the non-zero abundances in long format.
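For the toy data above, the split would look something like this (one possible coding, with Y_bin marking presence):
Site V1 V2 V3
1 0 1 0
2 1 1 0
3 0 0 1
with the non-zero values stacked into a long vector, e.g. Y_long = (200, 100, 200, 50), together with index vectors giving the site and species of each entry.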
For reference, here is my actual code, without the generated quantities block:
functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
* y: the response value
* mu: mean parameter of the lognormal distribution
* sigma: sd parameter of the lognormal distribution
* hu: linear predictor for the hurdle part
* Returns:
* a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
/* hurdle poisson log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* log parameterization for the poisson part
* logit parameterization of the hurdle part
* Args:
* y: the response value
* eta: linear predictor for poisson part
* hu: linear predictor for hurdle part
* Returns:
* a scalar to be added to the log posterior
*/
real hurdle_poisson_log_logit_lpmf(int y, real eta, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
poisson_log_lpmf(y | eta) -
log1m_exp(-exp(eta)); // zero-truncation correction for the Poisson part
}
}
// Function to fill loading matrices
matrix fill_loadings(int D, int S, vector L_lower, vector L_diag){
matrix[S,D] L;
int idx2 = 0; //Index for the lower-diagonal loadings
for(i in 1:(D-1)) { for(j in (i+1):D) { L[i,j] = 0; } } //Zeros in the upper triangle
for(i in 1:D) L[i,i] = L_diag[i]; //Diagonal loadings
for(j in 1:D) {
for(i in (j+1):S) {
idx2 = idx2+1;
L[i,j] = L_lower[idx2]; //Insert lower diagonal values in loadings matrix
}
}
return(L);
}
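/* For example, with D = 2 and S = 4 the function returns a matrix of the form
 *   [ d1  0  ]
 *   [ l1  d2 ]
 *   [ l2  l4 ]
 *   [ l3  l5 ]
 * where d = L_diag and l = L_lower (filled column by column), so that
 * M = D*(S-D) + D*(D-1)/2 = 5 lower-triangular loadings are needed.
 */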
}
data {
// Flags
int<lower=0,upper=1> prior_only; //sample from the prior only?
// Integers
int<lower=0> N; //number of data points
int<lower=0> SV; //number of vascular species
int<lower=0> SM; //number of moss species
int<lower=0> DV; //number of factors for vascular species
int<lower=0> DM; //number of factors for moss species
int<lower=0> P; //number of unique plots
// Abundances
int VP[N,SV]; //vascular plants
real Moss[N,SM]; //mosses
// Covariates
int plot_idx[N];
// Prior parameters
real lam_sigma_plot; //sd of common plot intercepts
real lam_sigma_FS; //sd of factor scores
real eta_rho; //correlation between scores
real mean_a; real sd_a; //common intercepts
real lam_tau; //conversion factors between abundances and presence-absence
real lam_sigma_Moss; // Residual log-sd
}
transformed data {
int<lower=1> MV = DV*(SV-DV)+ DV*(DV-1)/2; // number of lower-triangular, non-zero loadings
int<lower=1> MM = DM*(SM-DM)+ DM*(DM-1)/2; // number of lower-triangular, non-zero loadings
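// e.g. with SV = 32 species and, say, DV = 2 factors: MV = 2*(32-2) + 2*(2-1)/2 = 61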
}
parameters {
// Hyperparameters
real<lower=0> sigma_plot[2]; //sd of common plot intercepts
positive_ordered[DV] sigma_FS_VP; //sd of vascular plants factor scores
positive_ordered[DM] sigma_FS_Moss; //sd of moss factor scores
cholesky_factor_corr[DV+DM] chol_rho; //correlation between scores
// Parameters
/// Raw factor scores
matrix[N,DV+DM] Z; //Non-centered (raw) factor scores for vascular plants and mosses
/// Loadings
//// Vascular plants
vector[MV] L_lower_VP; //Lower diagonal loadings of vascular plants
vector[DV] L_diag_VP; //Diagonal loadings of vascular plants
//// Mosses
vector[MM] L_lower_Moss; //Lower diagonal loadings of mosses
vector[DM] L_diag_Moss; //Diagonal loadings of mosses
/// Common intercepts
real a[2];
/// Common plot intercepts
vector<multiplier=sigma_plot[1]>[P] a_plot_VP;
vector<multiplier=sigma_plot[2]>[P] a_plot_Moss;
/// Conversion factors between abundances and presence-absence
real<lower=0> tau[2];
/// Residual log-sd
vector<lower=0>[SM] sigma_Moss;
}
transformed parameters {
// Final factor scores
matrix[N,DV+DM] FS = Z * diag_pre_multiply(append_row(sigma_FS_VP, sigma_FS_Moss), chol_rho)';
matrix[N,DV] FS_VP = block(FS, 1, 1, N, DV);
matrix[N,DM] FS_Moss = block(FS, 1, DV+1, N, DM);
// Final loading matrices
matrix[SV,DV] L_VP = fill_loadings(DV, SV, L_lower_VP, L_diag_VP);
matrix[SM,DM] L_Moss = fill_loadings(DM, SM, L_lower_Moss, L_diag_Moss);
// Linear predictors
/// Vascular plants
matrix[N,SV] log_Lambda = FS_VP * L_VP';
matrix[N,SV] logit_pi_VP;
/// Mosses
matrix[N,SM] log_Mu = FS_Moss * L_Moss';
matrix[N,SM] logit_pi_Moss;
// Add other predictors
for(s in 1:SV) log_Lambda[,s] += a[1] + a_plot_VP[plot_idx];
for(s in 1:SM) log_Mu[,s] += a[2] + a_plot_Moss[plot_idx];
// Compute linear predictors for presence-absence
logit_pi_VP = tau[1] * log_Lambda;
logit_pi_Moss = tau[2] * log_Mu;
}
model {
// Hyperparameters
target += exponential_lpdf(sigma_plot | lam_sigma_plot); //sd of common plot intercepts
target += exponential_lpdf(sigma_FS_VP | lam_sigma_FS); //sd of factor scores
target += exponential_lpdf(sigma_FS_Moss | lam_sigma_FS); //sd of factor scores
target += lkj_corr_cholesky_lpdf(chol_rho | eta_rho ); //correlation between scores
// Parameters
/// Raw factor scores
target += std_normal_lpdf(to_vector(Z)); //Non-centered factor scores (vascular plants and mosses)
/// Loadings
//// Vascular plants
target += std_normal_lpdf(L_lower_VP); //Lower diagonal loadings of vascular plants
target += std_normal_lpdf(L_diag_VP); //Diagonal loadings of vascular plants
//// Mosses
target += std_normal_lpdf(L_lower_Moss); //Lower diagonal loadings of mosses
target += std_normal_lpdf(L_diag_Moss); //Diagonal loadings of mosses
/// Common intercepts
target += normal_lpdf(a | mean_a, sd_a);
/// Common plot intercepts
target += normal_lpdf(a_plot_VP | 0, sigma_plot[1]);
target += normal_lpdf(a_plot_Moss | 0, sigma_plot[2]);
/// Conversion factors between abundances and presence-absence
target += exponential_lpdf(tau | lam_tau);
/// Residual log-sd
target += exponential_lpdf(sigma_Moss | lam_sigma_Moss);
// likelihood including all constants
if (!prior_only) {
for (n in 1:N) {
for(s in 1:SV) target += hurdle_poisson_log_logit_lpmf(VP[n,s] | log_Lambda[n,s], logit_pi_VP[n,s]);
for(s in 1:SM) target += hurdle_lognormal_logit_lpdf(Moss[n,s] | log_Mu[n,s], sigma_Moss[s], logit_pi_Moss[n,s]);
}
}
}
The simplest way to check this is to write a function for both and make sure they return the same value.
For example, in generated quantities do:
generated quantities {
real lp1 = my_slow_lpdf_i_trust(...);
real lp2 = my_fast_lpdf_i_want_to_check(...);
real error = abs(lp1 - lp2);
}
And then just look at the output distribution of error.
Probably. There will be a new release of cmdstan tomorrow that has a profiling feature that should make it easier to figure this out: CmdStan 2.26.0 release candidate (cmdstanr will support the profiling too)
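For example, the likelihood section of the model above could be wrapped in a profile block (a sketch using the new syntax; the label "hurdle_likelihood" is arbitrary):
model {
  // ... priors as before ...
  profile("hurdle_likelihood") {
    if (!prior_only) {
      for (n in 1:N) {
        for (s in 1:SV) target += hurdle_poisson_log_logit_lpmf(VP[n,s] | log_Lambda[n,s], logit_pi_VP[n,s]);
        for (s in 1:SM) target += hurdle_lognormal_logit_lpdf(Moss[n,s] | log_Mu[n,s], sigma_Moss[s], logit_pi_Moss[n,s]);
      }
    }
  }
}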
The following code suggests the two solutions are not equivalent:
Stan code
functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
* y: the response value
* mu: mean parameter of the lognormal distribution
* sigma: sd parameter of the lognormal distribution
* hu: linear predictor for the hurdle part
* Returns:
* a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
}
data{
int N; // number of sites
int S; // number of species
int N_nz; // number of non-zero observations
real Pi_wide[N,S]; // hurdle linear predictors, wide format
real Lambda_wide[N,S]; // lognormal locations, wide format
real Y_wide[N,S]; // responses, wide format
int Y_bin_long[N*S]; // binary responses in long format (here coded 1 = non-zero)
real Y_long[N*S]; // responses in long format, zeros included
int site[N*S]; // site index of each long-format entry
int sp[N*S]; // species index of each long-format entry
}
parameters{
real a;
}
model{
a ~ normal(0,1);
}
generated quantities{
matrix[N,S] lp1;
matrix[N,S] lp2;
matrix[N,S] diff;
// Non-vectorized hurdle
for(n in 1:N){
for(s in 1:S){
lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
}
}
// Vectorized try
for(n in 1:(N*S)){
lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
if(Y_bin_long[n] > 0)
lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
}
diff = lp1 - lp2;
}
I think this is because the likelihood is "inverted". In the likelihood generated by brms, a zero in the data contributes the likelihood of observing 1 given hu, while a non-zero value contributes the likelihood of observing 0 given hu plus the lognormal likelihood.
Frankly, I do not know how to translate that into the logistic regression of the vectorized example. I remember reading @betanalpha mention this; maybe you could have a suggestion?
In conclusion: the 0s and 1s of the logistic regression part have to be inverted for the vectorized hurdle to work, i.e. Y_bin_long must be 1 where the response is zero. It took me a long time to get this right because when I modified Y_bin_long, the if statement of the "vectorized" likelihood got messed up at the same time, so the condition now tests the raw response (Y_long[n] > 0) instead -_-
Currently, the code is not vectorized, because I focused more on testing the likelihood than on producing efficient code. But since it is already defined in long format, it is mostly a matter of removing the loop around the likelihood (see the sketch after the final code below).
Final Stan code:
functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
* y: the response value
* mu: mean parameter of the lognormal distribution
* sigma: sd parameter of the lognormal distribution
* hu: linear predictor for the hurdle part
* Returns:
* a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
}
data{
int N; // number of sites
int S; // number of species
int N_nz; // number of non-zero observations
real Pi_wide[N,S]; // hurdle linear predictors, wide format
real Lambda_wide[N,S]; // lognormal locations, wide format
real Y_wide[N,S]; // responses, wide format
int Y_bin_long[N*S]; // binary responses in long format, now coded 1 = zero (inverted)
real Y_long[N*S]; // responses in long format, zeros included
int site[N*S]; // site index of each long-format entry
int sp[N*S]; // species index of each long-format entry
}
parameters{
real a;
}
model{
a ~ normal(0,1);
}
generated quantities{
matrix[N,S] lp1;
matrix[N,S] lp2;
matrix[N,S] diff;
// Non-vectorized hurdle
for(n in 1:N){
for(s in 1:S){
lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
}
}
// Vectorized version with the corrected (inverted) binary coding
for(n in 1:(N*S)){
lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
if(Y_long[n] > 0)
lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
}
diff = lp1 - lp2;
}
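For reference, here is a minimal sketch of what the fully vectorized likelihood could look like for the moss matrix of the original model. The data variables Y_bin_long (coded 1 where the observation is zero, matching the brms hurdle parameterization), Y_nz, idx_nz and sp_nz are hypothetical and assumed to be built in the same column-major order that to_vector() uses:
data {
  // ... as in the original model, plus:
  int<lower=0,upper=1> Y_bin_long[N*SM]; // 1 where Moss[n,s] == 0, column-major order
  int<lower=1> N_nz; // number of non-zero moss observations
  vector<lower=0>[N_nz] Y_nz; // the non-zero abundances
  int<lower=1> idx_nz[N_nz]; // positions of the non-zero cells in the flattened matrix
  int<lower=1> sp_nz[N_nz]; // species index of each non-zero cell
}
model {
  // ... priors as before ...
  if (!prior_only) {
    vector[N*SM] logit_pi_long = to_vector(logit_pi_Moss); // column-major flattening
    vector[N*SM] log_Mu_long = to_vector(log_Mu);
    target += bernoulli_logit_lpmf(Y_bin_long | logit_pi_long); // hurdle part, all cells
    target += lognormal_lpdf(Y_nz | log_Mu_long[idx_nz], sigma_Moss[sp_nz]); // non-zero cells only
  }
}
The vascular plants could be handled the same way with poisson_log_lpmf on the non-zero counts, subtracting the log1m_exp(-exp(eta)) zero-truncation term for those cells.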