Thank you very much for your answer!
The following code suggests the solutions are not equivalent :
Stan code
functions {
/* hurdle lognormal log-PDF of a single response
* GENERATED BY BRMS v.2.14.4
* logit parameterization of the hurdle part
* Args:
* y: the response value
* mu: mean parameter of the lognormal distribution
* sigma: sd parameter of the lognormal distribution
* hu: linear predictor for the hurdle part
* Returns:
* a scalar to be added to the log posterior
*/
real hurdle_lognormal_logit_lpdf(real y, real mu, real sigma, real hu) {
if (y == 0) {
return bernoulli_logit_lpmf(1 | hu);
} else {
return bernoulli_logit_lpmf(0 | hu) +
lognormal_lpdf(y | mu, sigma);
}
}
}
data{
int N;
int S;
int N_nz;
real Pi_wide[N,S];
real Lambda_wide[N,S];
real Y_wide[N,S];
int Y_bin_long[N*S];
real Y_long[N_nz];
int site[N*S];
int sp[N*S];
}
parameters{
real a;
}
model{
a ~ normal(0,1);
}
generated quantities{
matrix[N,S] lp1;
matrix[N,S] lp2;
matrix[N,S] diff;
// Non-vectorized hurdle
for(n in 1:N){
for(s in 1:S){
lp1[n,s] = hurdle_lognormal_logit_lpdf(Y_wide[n,s] | Lambda_wide[n,s], 1, Pi_wide[n,s]);
}
}
// Vectorized try
for(n in 1:(N*S)){
lp2[site[n],sp[n]] = bernoulli_logit_lpmf(Y_bin_long[n] | Pi_wide[site[n],sp[n]]);
if(Y_bin_long[n] > 0)
lp2[site[n],sp[n]] += lognormal_lpdf(Y_long[n] | Lambda_wide[site[n],sp[n]],1);
}
diff = lp1 - lp2;
}
Data generation
# Empty the environment
rm(list = ls())
# Load libraries
library(tidyverse)
library(cmdstanr)
library(brms)
# Number of data points
N <- 50
# Create binary values for S species
## Generate proba of presence
Pi_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_bin_wide <- tibble(site = 1:N, S1 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S1))),
S2 = as.numeric(rbernoulli(N, inv_logit_scaled(Pi_wide$S2))))
## Generate abundances when present
Lambda_wide <- data.frame(site = 1:N, S1 = rnorm(N), S2 = rnorm(N))
Y_wide <- Y_bin_wide %>%
mutate(S1 = S1 * exp(Lambda_wide$S1),
S2 = S2 * exp(Lambda_wide$S2))
## Generate long format
Y_bin_long <- Y_bin_wide %>% pivot_longer(c(S1, S2), values_to = "bin",
names_to = "sp") %>%
mutate(sp_num = as.numeric(as.factor(sp)))
Y_long <- Y_wide %>% pivot_longer(c(S1, S2), values_to = "abund",
names_to = "sp") %>%
mutate(sp_num = as.numeric(as.factor(sp)))
StanDat <- list(N = nrow(Y_wide), S = 2, N_nz = nrow(Y_long),
Pi_wide = Pi_wide %>% select(-site),
Lambda_wide = Lambda_wide %>% select(-site),
Y_wide = Y_wide %>% select(-site),
Y_bin_long = Y_bin_long %>% pull(bin),
Y_long = Y_long %>% pull(abund),
site = Y_bin_long%>% pull(site),
sp = Y_bin_long%>% pull(sp_num))
# Compile model
mod <- cmdstan_model("Codes/Results_MossesPV_interactions/Test_Distr_WideToLong.stan")
fit <- mod$sample(data = StanDat,
iter_warmup = 200, iter_sampling = 200,
chains = 1)
fit$summary("diff") %>% View
I think this is because the likelihood is “inversed”. In the likelihood generated by brms, a zero in data has the likelihood to observe 1 given hu, and a non-zero has the likelihood to observe 0 given hu + the lognormal likelihood.
Frankly, I do not know how to translate that using logistic regression, as in the vectorized example. I reat @betanalpha evoquing this, maybe you could have a suggestion?
Thank you very much :)
Lucas