Hello,
I uploaded two covariates in file test_rstan_2.txt
(header, covariates in columns, one row per time point):
test_rstan_2_data.txt (1.8 KB)
I combined these covariates in the vector q
(one value per time point) and then I simulated data like this dat = cumul(q) + noise
. I want to estimate three parameters: two used to compute q
from the covariates, the third being the error stdev. Only a subset of all time points are used for inference.
Here is the function generating q
from the covariates:
M77 <- function(x, params){
## extract data
PAR <- x$PAR
LAI <- x$LAI
plantDensity <- x$plantDensity
epsilon_PAR <- x$epsilon_PAR
## extract parameters
k_B <- params$k_B
RUE <- params$RUE
## compute biomass
q <- ((PAR * epsilon_PAR) / plantDensity) *
(1 - exp(-k_B * LAI)) *
RUE
return(cumsum(q))
}
Here is how I simulated the data with R:
set.seed(1234)
tmp <- read.table("test_rstan_2_data.txt", header=TRUE, sep="\t")
covars <- list("T"=nrow(tmp), "PAR"=tmp$PAR, "LAI"=tmp$LAI,
"plantDensity"=160, "epsilon_PAR"=0.95)
true_params <- list("k_B"=0.59, "RUE"=2.8, "sigma_e2"=0.1)
cumulq <- M77(x=covars, params=true_params)
dat <- abs(rnorm(n=length(cumulq),
mean=cumulq,
sd=sqrt(true_params$sigma_e2)))
length(sub_idx <- seq(1, length(cumulq), by=15))
plot(1:covars$T, dat, type="b", las=1, col="darkgrey")
points(sub_idx, dat[sub_idx], col="red", pch=19)
Here is Stan’s code:
functions {
vector M77(int T, vector PAR, vector LAI, int plantDensity, real epsilon_PAR,
real k_B, real RUE){
vector[T] q;
q = ((PAR * epsilon_PAR) / plantDensity) .* (1 - exp(-k_B * LAI)) * RUE;
return cumulative_sum(q);
}
}
data {
int<lower=1> T;
int<lower=1> plantDensity;
real<lower=0,upper=1> epsilon_PAR;
vector[T] PAR;
vector[T] LAI;
real<lower=0> y[T];
}
parameters {
real<lower=0,upper=1> k_B;
real<lower=0> RUE;
real<lower=0> sigma_e;
}
model {
vector[T] q;
k_B ~ normal(0.5, 0.5);
RUE ~ normal(2.5, 0.5);
sigma_e ~ cauchy(0, 0.1);
q = M77(T, PAR, LAI, plantDensity, epsilon_PAR, k_B, RUE);
y ~ normal(q, sigma_e);
}
I saved it into a character variable and then into a file:
modelName <- "M77"
stanF <- paste0(modelName, ".stan")
write(stan_code, stanF)
Then it compiled successfully:
library(rstan)
cm <- stanc(file=stanF, model_name=modelName)
sm <- stan_model(stanc_ret=cm)
Sampling worked well (n_eff
above 1500 for all three parameters):
stan_dat <- list("T"=length(sub_idx), "PAR"=covars$PAR[sub_idx],
"LAI"=covars$LAI[sub_idx],
"plantDensity"=160, "epsilon_PAR"=0.95,
"y"=dat[sub_idx])
fit <- sampling(sm, data=stan_dat,
warmup=500, iter=1000, chains=4, cores=4, thin=1)
fit
Confirmed when looking at the trace plots:
traceplot(fit, pars=c("k_B")) + geom_hline(yintercept=true_params$k_B)
traceplot(fit, pars=c("RUE")) + geom_hline(yintercept=true_params$RUE)
traceplot(fit, pars=c("sigma_e")) + geom_hline(yintercept=sqrt(true_params$sigma_e2))
The parameters k_B
and RUE
are well estimated.
However, sigma_e
is systematically over-estimated: its posterior mean is around 5 whereas the true value used to simulate the data is \sqrt(0.1)
.
Is it expected? Should I change something?
Thanks in advance!