I am trying to fit a hierarchical exponential decay model with n phases to grouped data. That is, if an individual belongs to group k, then their expected value is
V_k(t) = \sum_{i=1}^n \lambda_{ik} \exp(-t / r_i),
with the constraint \sum_{i=1}^n \lambda_{ik} = 1 for each group k.
Effectively, the model has a fixed decay rate for each exponential phase, with the only difference between groups being the proportions \lambda_{ik} of an individual in each phase.
I am assuming that my data are normally distributed around the mean above with fixed standard deviation \sigma, and that each individual's rates are drawn from a population-level distribution. That is, for individual j in group k, considering just exponential phase l,
r_{jkl} = \hat{r}_l + \epsilon_{jk}, \quad \epsilon_{jk} \sim \mathcal{N}(0, \sigma_r),
which matches the error terms in the Stan code below.
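To make the mean structure concrete, here is a quick R sketch of V_k(t) for a single individual at a single time point (the lambda and r values are made up for illustration):

# expected value: a lambda-weighted sum of exponential decays, one per phase
V <- function(t, lambda, r) {
  stopifnot(isTRUE(all.equal(sum(lambda), 1)))  # proportions must sum to 1
  sum(lambda * exp(-t / r))
}
V(10, lambda = c(0.7, 0.3), r = c(20, 5))  # two phases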
While highly structured, this is fairly simple to model. The challenge comes from the structure of my data: each group contains a different number of individuals.
This means that when I set up my rectangular data structures, I have to pad both the data and the parameters (the \epsilon's here) with filler. This runs fine, but I am concerned it is affecting the sampling.
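To illustrate the padding, here is a toy version with invented sizes - group 1 with 2 individuals, group 2 with 3, and 3 observations each - so group 1's slice of the array carries a row of -1 filler:

# pad a per-group matrix up to the maximum number of individuals with -1 rows
pad <- function(m, max_rows) rbind(m, matrix(-1, max_rows - nrow(m), ncol(m)))
g1 <- matrix(1:6, nrow = 2)        # 2 individuals x 3 observations
g2 <- matrix(1:9, nrow = 3)        # 3 individuals x 3 observations
a <- array(NA, dim = c(2, 3, 3))   # group x individual x observation
a[1, , ] <- pad(g1, 3)             # last row is all -1 filler
a[2, , ] <- pad(g2, 3)             # already full: zero rows of padding added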
Here is some R code to generate synthetic data with the same structure I am talking about:
library(dplyr)  # for %>%, filter, select, and the tidy-eval helpers used below

r_hyper <- 20
groups <- 1:2
num_groups <- length(groups)
num_individuals <- c(7, 12)
obs_times <- seq(0, 100, length.out = 10)
num_obs <- length(obs_times)

# individual decay rates scattered around the population-level mean
rates <- rnorm(n = sum(num_individuals), mean = r_hyper, sd = 0.1) +
  rnorm(n = sum(num_individuals), mean = 0, sd = 5)

d <- lapply(seq_along(rates), function(i) {
  value <- exp(-1 / rates[i] * obs_times)
  group <- if (i <= num_individuals[1]) 1 else 2
  data.frame(t = obs_times, group = group, individual = i, value = value)
}) %>% do.call(rbind, .)
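For reference, this produces one long data frame of 19 individuals x 10 time points:

dim(d)  # 190 rows, 4 columns: t, group, individual, value
table(d$group) / num_obs  # 7 individuals in group 1, 12 in group 2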
setupStanData <- function(data, id = individual, time = t, group = group, value = value) {
  group <- enquo(group)
  id <- enquo(id)
  time <- enquo(time)
  value <- enquo(value)

  groups <- data %>% select(!!group) %>% unique() %>% .[, 1]
  num_groups <- length(groups)

  # number of individuals in each group
  num_individuals <- sapply(groups, function(g) {
    df <- filter(data, (!!group) == g)
    length(unique(df[, quo_name(id)]))
  })

  # observations per individual: one row per group, padded with -1
  num_obs <- lapply(groups, function(g) {
    df <- dplyr::filter(data, (!!group) == g)
    counts <- by(df, df[, quo_name(id)], nrow) %>% as.numeric()
    c(counts, rep(-1, times = max(num_individuals) - length(counts)))
  }) %>% do.call(rbind, .)
  max_obs <- max(num_obs)

  # observation times: one matrix per group, one -1-padded row per individual
  times <- lapply(groups, function(g) {
    df <- filter(data, (!!group) == g)
    monkeys <- unique(df[, quo_name(id)])
    lapply(monkeys, function(m) {
      df2 <- filter(df, (!!id) == m)
      obs <- df2[, quo_name(time)]
      c(obs, rep(-1, times = max_obs - length(obs)))
    }) %>% do.call(rbind, .)
  })

  # pad each group's matrix to the same number of rows, then stack into a 3D array
  max_monkeys <- sapply(times, nrow) %>% max()
  for (i in 1:length(times)) {
    padding <- matrix(-1, nrow = max_monkeys - nrow(times[[i]]), ncol = max_obs)
    times[[i]] <- rbind(times[[i]], padding)
  }
  a <- array(NA, dim = c(num_groups, max_monkeys, max_obs))
  for (i in 1:length(times)) {
    a[i, , ] <- times[[i]]
  }
  times <- a

  # observed values: padded and stacked exactly as the times are
  y <- lapply(groups, function(g) {
    df <- filter(data, (!!group) == g)
    monkeys <- unique(df[, quo_name(id)])
    lapply(monkeys, function(m) {
      df2 <- filter(df, (!!id) == m)
      obs <- df2[, quo_name(value)]
      c(obs, rep(-1, times = max_obs - length(obs)))
    }) %>% do.call(rbind, .)
  })
  for (i in 1:length(y)) {
    padding <- matrix(-1, nrow = max_monkeys - nrow(y[[i]]), ncol = max_obs)
    y[[i]] <- rbind(y[[i]], padding)
  }
  a <- array(NA, dim = c(num_groups, max_monkeys, max_obs))
  for (i in 1:length(y)) {
    a[i, , ] <- y[[i]]
  }
  y <- a

  return(list(num_obs = num_obs, times = times, y = y, num_groups = num_groups,
              max_obs = max_obs, num_individuals = as.array(num_individuals)))
}
s <- setupStanData(d)
s$num_phases <- 2  # change to however many decay phases you would like
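I then fit it with rstan, roughly like the following (a minimal sketch; the file name model.stan is just what I am assuming here):

library(rstan)
# sanity-check that the shapes line up with the Stan data block below
dim(s$times)  # 2 x 12 x 10 (group x individual x observation)
dim(s$y)      # 2 x 12 x 10
fit <- stan(file = "model.stan", data = s, chains = 4, iter = 2000)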
And the corresponding Stan file:
data {
  int num_groups;
  int num_phases;
  int num_individuals[num_groups];
  int max_obs;
  int num_obs[num_groups, max(num_individuals)];
  real times[num_groups, max(num_individuals), max_obs];
  real y[num_groups, max(num_individuals), max_obs];
}
parameters {
  real<lower=0> r_hyper[num_phases];
  real error[num_groups, max(num_individuals)];
  real<lower=0> sigma;
  simplex[num_phases] lambda[num_groups];
}
transformed parameters {
  real<lower=0> r[num_groups, num_phases, max(num_individuals)];
  for (g in 1:num_groups) {
    for (p in 1:num_phases) {
      for (i in 1:max(num_individuals)) {
        if (i > num_individuals[g]) {
          // padded slot: this (group, individual) combination has no data
          r[g,p,i] = 0;
        } else {
          r[g,p,i] = r_hyper[p] + error[g,i];
        }
      }
    }
  }
}
model {
  r_hyper ~ normal(5, 20);
  // the prior only reaches the error terms that correspond to real
  // individuals; the padded ones are left with an implicit flat prior
  for (g in 1:num_groups) {
    for (n in 1:num_individuals[g]) {
      error[g,n] ~ normal(0, 0.1);
    }
  }
  for (g in 1:num_groups) {
    for (j in 1:num_individuals[g]) {
      for (t in 1:num_obs[g][j]) {
        real rate = 0;
        for (k in 1:num_phases) {
          rate += lambda[g][k] * exp(-1 / r[g,k,j] * times[g,j,t]);
        }
        y[g,j,t] ~ normal(rate, sigma);
      }
    }
  }
}
I have made heavy use of 3D arrays to structure the data in some way that made sense to me.
Here, I get estimates for error[1,8] through error[1,12], which are ridiculous and unnecessary - group 1 only has 7 individuals. Is there any way I can get Stan to not estimate these parameters, but still estimate error[2,8] through error[2,12], while still only requiring a single Stan file?