I am trying to fit a model of pathogen decay where we have three point sources of pathogens “farms”, using a laplace dispersal kernel model.
I am seeing an issue, where the peak estimate of pathogen density is really high at the second and sometimes third farm in the sequence that isn’t supported by the data. Any ideas why this might be or how I can redefine the model to avoid this behaviour?
Stan code
// Laplace model of pathogen decay
functions {
// piece-wise exponential function
//Farm A
real yA(real bA, real x, real a1, real a2, real fAu, real fAd) {
if (x < fAu) {
return bA * exp(-a1 * (fAu - x));
} else {
return bA * exp(-a2 * (x - fAd));
}
}
//Farm B
real yB(real bB, real x, real a3, real a4, real fBu, real fBd) {
if (x < fBu) {
return bB * exp(-a3 * (fBu - x));
} else {
return bB * exp(-a4 * (x - fBd));
}
}
//Farm C
real yC(real bC, real x, real a5, real a6, real fCu, real fCd) {
if (x < fCu) {
return bC * exp(-a5 * (fCu - x));
} else {
return bC * exp(-a6 * (x - fCd));
}
}
}
data {
int<lower = 1> N; // number of samples
vector[N] x; // sample locations
vector[N] y; // measured pathogen density
real fAu; // location of upstream edge of farm A
real fAd; // location of downstream edge of farm A
real fBu; // location of upstream edge of farm B
real fBd; // location of downstream edge of farm B
real fCu; // location of upstream edge of farm C
real fCd; // location of downstream edge of farm C
}
parameters {
real<lower = 0> bA;// peak density at farm A
real<lower = 0> bB; // peak density at farm B
real<lower = 0> bC; // peak density at farm C
real<lower = 0> a1; // upstream decay coefficient for farm A
real<lower = 0> a2; // downstream decay coefficient for farm A
real<lower = 0> a3; // upstream decay coefficient for farm B
real<lower = 0> a4; // downstream decay coefficient for farm B
real<lower = 0> a5; // upstream decay coefficient for farm C
real<lower = 0> a6; // downstream decay coefficient for farm C
real<lower=0, upper=1> theta; //flat probabilty of non-detections
real<lower=0> coef; //coefficient making sigma proportional to mu
real<lower=0> c; //flat background pathogen level
real<lower=0> tau; //amount of overdispersion in the zeros of the NB
}
transformed parameters {
vector<lower = 0>[N] mu; // expected pathogen density
vector<lower = 0>[N] muA; // expected pathogen density at farm A
vector<lower = 0>[N] muB; // expected pathogen density at farm B
vector<lower = 0>[N] muC; // expected pathogen density at farm C
vector[N] alpha;
vector[N] beta;
vector[N] sigma;
vector[N] phi; //probability of getting a zero at low values of mu
vector[N] omega; //omega is the probability of getting a zero
for (n in 1:N) {
muA[n] = yA(bA, x[n], a1, a2, fAu, fAd);
muB[n] = yB(bB, x[n], a3, a4, fBu, fBd);
muC[n] = yC(bC, x[n], a5, a6, fCu, fCd);
mu[n] = muA[n] + muB[n] + muC[n] + c;
sigma[n] = sqrt(coef*mu[n]);
alpha[n] = mu[n]^2 / sigma[n]^2;
beta[n] = mu[n] / sigma[n]^2;
phi[n] = exp(neg_binomial_lpmf(0 | mu[n] / tau, 1/tau));
omega[n] = theta + phi[n] - theta * phi[n];
}
}
model {
// priors
bA ~ normal(0, 1000);
bB ~ normal(0, 1000);
bC ~ normal(0, 1000);
a1 ~ normal(0.001, 0.1);
a2 ~ normal(0.001, 0.1);
a3 ~ normal(0.001, 0.1);
a4 ~ normal(0.001, 0.1);
a5 ~ normal(0.001, 0.1);
a6 ~ normal(0.001, 0.1);
theta ~ normal(0, 1);
c ~ normal(0, 10000);
coef ~ normal(0, 50);
tau ~ normal(0, 50);
// likelihood
for (i in 1:N) {
if (y[i] == 0) {
target += bernoulli_lpmf(1 | omega[i]);
} else {
target += bernoulli_lpmf(0 | omega[i]);
target += gamma_lpdf(y[i] | alpha[i], beta[i]);
}
}
}
generated quantities {
//Generate replicated data from the model prediction
real y_rep[N]; //add bernoulli draws here too
for (i in 1:N){
y_rep[i] = bernoulli_rng(1 - omega[i]) * gamma_rng(alpha[i], beta[i]);
}
//y_rep = gamma_rng(alpha, beta);
}
Define the laplace function in R
####Laplace function----
laplace <- function(bA, bB, bC, a1, a2, a3, a4, a5, a6,
x, fAu, fAd, fBu, fBd, fCu, fCd){ #where bk is a scaling factor, a1 and a2 are exponential coefficients, x is a sample and
# #xk is the location of the farm
#Farm A
if (x<fAu){
yA <- (bA*exp(-a1*abs(fAu-x))) #this is the upstream function
}
else if (x>fAd){
yA <- (bA*exp(-a2*abs(x-fAd))) #and this is the downstream function
}
#Farm B
if (x<fBu){
yB <- (bB*exp(-a3*abs(fBu-x))) #this is the upstream function
}
else if (x>fBd){
yB <- (bB*exp(-a4*abs(x-fBd))) #and this is the downstream function
}
#Farm C
if (x<fCu){
yC <- (bB*exp(-a5*abs(fCu-x))) #this is the upstream function
}
else if (x>fCd){
yC <- (bC*exp(-a6*abs(x-fCd))) #and this is the downstream function
}
y <- yA + yB + yC
}
Data simulation
x <- c(0.0, 0.0, 1745.9, 1745.9, 2873.3, 2873.3, 3706.9, 3706.9, 4181.4,
4181.4, 4531.7, 4531.7, 4783.5, 4783.5, 4941.6, 4941.6, 5052.6, 5052.6,
5101.7, 5101.7, 5101.7, 5101.7, 5121.4, 5121.4, 5135.9, 5135.9, 5374.6,
5374.6, 5393.7, 5393.7, 5441.3, 5441.3, 5441.3, 5441.3, 5544.0, 5544.0,
5691.0, 5691.0, 5893.4, 5893.4, 6198.6, 6198.6, 6378.8, 6378.8, 6525.5,
6525.5, 6664.8, 6664.8, 6806.1, 6806.1, 6889.2, 6889.2, 6934.9, 6934.9,
6964.0, 6964.0, 7309.8, 7309.8, 7336.2, 7336.2, 7385.1, 7385.1, 7451.2,
7451.2, 7596.8, 7596.8, 7802.6, 7802.6, 8322.9, 8322.9, 8524.1, 8524.1,
8687.2, 8687.2, 8790.4, 8790.4, 8838.7, 8838.7, 8871.3, 8871.3, 9111.2,
9111.2, 9126.8, 9126.8, 9148.3, 9148.3, 9197.1, 9197.1, 9298.1, 9298.1,
9444.2, 9444.2, 9591.2, 9591.2, 9591.2, 9591.2, 9829.3, 9829.3, 10233.7,
10233.7, 12305.1, 12305.1, 14182.6, 14182.6)
#Set farm locations
fAd <- 5180.1 #downstream edge of the farm
fAu <- 5350.1 #upstream edge of farm
fBd <- 7600.1
fBu <- 7780.1
fCd <- 9600.1
fCu <- 9800.1
#Set other parameters
tau <- 1 #overdispersion parameter in the neg binom
c <- 25 #constant background pathogen level
theta <- 0.25 #constant probability of getting zero in the bernoulli
coef <- 1
a1 <- 0.0019
a2 <- 0.0009
a3 <- 0.06
a4 <- 0.06
a5 <- 0.003
a6 <- 0.0007
bA <- 50
bB <- 50
bC <- 50
#Generate other parameters:
set.seed(2)
mu <- numeric(length(x))
phi <- numeric(length(x))
sigma <- numeric(length(x))
for (i in 1:length(x)){
mu[i] <- laplace(bA, bB, bC, a1, a2, a3, a4, a5, a6,
x[i], fAu, fAd, fBu, fBd, fCu, fCd) + c
phi[i] <- dnbinom(0, mu=mu[i], size=mu[i] / tau) #probability of observing zero given size and mu
sigma[i] <- sqrt(coef*mu[i])
}
omega <- theta+phi-theta*phi
ygamma <- rgamma(length(x), shape=mu^2/sigma^2, scale=sigma^2/mu)
#bernoulli is a special case of a binomial distribution with a single trial:
s <- 1 - rbinom(length(x), size=1, p=omega) #randomly drawn number of successes given omega
y <- s*ygamma
Model fitting
#Put data into a list for stan
sim.data <- list(N = length(x),
x = x,
y = y,
fAu = fAu,
fAd = fAd,
fBu = fBu,
fBd = fBd,
fCu = fCu,
fCd = fCd
)
#Plot data
plot(sim.data$x, sim.data$y)
rect(xleft = c(fAd, fBd, fCd), xright = c(fAu, fBu, fCu),
ybottom = min(sim.data$y), ytop = max(sim.data$y),
border = NA, col = adjustcolor("red", alpha = 0.3))
#Now we try to fit this simulated data to the model in stan
sim.fit <- stan(
file='./models/laplace_multifarm_multidecay_multipeak.stan',
data=sim.data,
chains=4,
cores=4,
iter=4000,
warmup=2000
)
Plot mean model and deterministic version of the model with true parameter values
posterior <- rstan::extract(sim.fit, inc_warmup=FALSE, permuted=TRUE)
nsamp <- length(posterior$a1) / 10
x.rep <- seq(from =0, to= 16000, by=10)
mu.rep <- matrix(data=NA, nrow=nsamp, ncol=length(x.rep))
phi.rep <- matrix(data=NA, nrow=nsamp, ncol=length(x.rep))
omega.rep <- matrix(data=NA, nrow=nsamp, ncol=length(x.rep))
true.mu <- c()
for (i in 1:nsamp){
for (j in 1:length(x.rep)){
i.samp <- 10 * i
true.mu[j] <- laplace(bA, bB, bC, a1, a2, a3, a4, a5, a6,
x.rep[j], fAu, fAd, fBu, fBd, fCu, fCd)
mu.rep[i, j] <- laplace(posterior$bA[i], posterior$bB[i],
posterior$bC[i], posterior$a1[i], posterior$a2[i],
posterior$a3[i], posterior$a4[i], posterior$a5[i], posterior$a6[i],
x.rep[j], fAu, fAd, fBu, fBd, fCu, fCd) + posterior$c[i]
phi.rep[i, j] <- dnbinom(0, mu=mu.rep[i, j], size=mu.rep[i, j] / posterior$tau[i.samp]) #calculates probability of success
#Now we calculate omega, the probability of getting a zero, combining a constant probability of zero (theta) and a
#variable probabilty arising from a NB distribution where the probability is higher when mu is low (phi)
omega.rep[i, j] <- posterior$theta[i.samp] + phi.rep[i, j] - posterior$theta[i.samp] * phi.rep[i, j]
}
}
#Calculate mean from each observation of mu at each smooth x:
mean.mu <- apply(mu.rep, 2, mean) #the two here indicates to take mean over columns
adj.mu <- (1 - omega.rep) * mu.rep
mean.mu.zeros <- apply(adj.mu, 2, mean)
#Calculate 2.5% and 97.5% quantiles for each mean mu
quant2.mu.zeros <- apply(adj.mu, 2, quantile, probs=0.025)
quant97.mu.zeros <- apply(adj.mu, 2, quantile, probs=0.975)
mu.df <- as.data.frame(cbind(x=x.rep,
y = mean.mu.zeros,
quant2.zeros = quant2.mu.zeros,
quant97.zeros = quant97.mu.zeros,
true.mu = true.mu))
sim.data.df <- data.frame(x = sim.data$x,
y = sim.data$y)
ggplot() +
geom_point(data=sim.data.df, aes(x=x, y=y), shape=1, size =2) +
geom_ribbon(data = mu.df, aes(x = x.rep, ymin=quant2.zeros, ymax=quant97.zeros), fill="black", alpha=0.3)+
geom_line(data = mu.df, aes(x = x.rep, y=y), color="black")+
geom_line(data = mu.df, aes(x = x.rep, y=true.mu), color = "purple")+ #add the deterministic model with the true parameters
ylab("eDNA concentration (copies/µL)")+
xlab("Sample location")+
theme_bw(base_size = 20)+
#geom_hline(yintercept = 100)# +
geom_rect(aes(xmin=c(fAu, fBu, fCu),
xmax=c(fAd, fBd, fCd),ymin=-Inf,ymax=Inf),
fill='red', alpha= 0.3) +
ylim(c(0, 1000))