Hi all,
First-time poster here… I’m quite new to stan and I have an application where I need to combine observations of the same population, but where some of the most recent observations have a different (known) variance. I’m struggling with how to specify this in a way that works.
Here is the code to simulate and the dataset:
library(tidyverse)
library(rstan)
set.seed(100)
n <- 20
data <- as.data.frame(matrix(ncol = 16, nrow = 0))
params <- as.data.frame(matrix(ncol = 2, nrow = 0))
true_17 <- as.data.frame(matrix(ncol = 2, nrow = 0))
obs_17 <- as.data.frame(matrix(ncol = 2, nrow = 0))
for(i in c(1:n)) {
beta_1 <- rnorm(1, 0, .25) # E[beta_1] = 1 across all time series
beta_0 <- rnorm(1, 7, .25) # E[beta_0] = 7 across all time series (rough log mean of populations)
ts <- c(i)
for(j in c(1:16)) {
y <- beta_0 + beta_1*j + rnorm(1, 0, 2) # data follows some linear trajectory with noise
ts <- c(ts, y)
}
# extend the true sequence out by one
v17 <- beta_0 + beta_1*17 + rnorm(1, 0, 2)
true_17 <- rbind(true_17, c(i, v17))
# then observe that with some additional noise
obs_17 <- rbind(obs_17, c(i, v17 + rnorm(1, 0, 0.5)))
data <- rbind(data, ts)
params <- rbind(params, c(beta_1, beta_0))
}
names(data) <- c("i", paste0("t", c(1:16)))
names(params) <- c("beta_1", "beta_0")
names(obs_17) <- c("i", "obs_17")
names(true_17) <- c("i", "true_17")
obs_17$t <- 17
obs_17 <- rename(obs_17, "p_i" = 'obs_17')
data_long <- data %>%
pivot_longer(-one_of("i"), names_to = "t", values_to = "p_i") %>%
mutate(t = as.numeric(str_sub(t, 2)))
# plot raw data with true and observed last point
data_long %>%
filter(i %in% sample(1:n, min(20, n))) %>%
ggplot(aes(x = t, y = p_i)) +
geom_path() +
geom_point(data = true_17, aes(x = 17, y = true_17), color = 'darkgreen') +
geom_point(data = obs_17, aes(x = 17, y = p_i), color = 'red') +
facet_wrap(.~i)
And here is the model as I have it right now. Basically, I’m just fitting lines through 16 points for n separate groups. I’ve tried implementing the additional point with known variance in the likelihood.
stan_model <- "
data {
int N_obs; // number of obs
int N_acs; // number of acs obs
int N_ig; // number of ig obs
int N_i; // number of areas
int K; // number of betas in linear eqtn
int id[N_obs]; // id vector
matrix[N_obs, K] x; // matrix of intercept + predictor
real y[N_obs]; // outcome vector
real sigma_ig; // infogroup standard error
}
parameters {
vector[K] beta[N_i]; // N_i beta_0s
real<lower=0> sigma[N_i]; // sigma on regression, estimated here for now
}
model {
// priors for each subgroup
for (i in 1:N_i) {
beta[i] ~ student_t(4, 0, 2.5);
sigma[i] ~ exponential(1);
}
// likelihood for each observation
for (j in 1:N_acs) {
y[j] ~ normal(x[j] * (beta[id[j]]), sigma[id[j]]);
}
for (k in N_acs:N_ig) {
y[k] ~ normal(x[k] * (beta[id[k]]), sigma[id[k]]);
}
}
At the bottom of this post I’ve entered code to fit this model, get predictions from it and plot them. The plot it produces looks like this:
The red line is a linear fit to the data including the 17th point, while the blue line includes only the first 16 points. I expected that the second for loop in the stan model would update the predictions from the model (dashed line) to look more like the red line than the blue one, but that’s clearly not the case… it seems more like these 17th data points don’t do anything at all. Is this not how to specify observations with different errors? How should I do it? Thanks in advance!
N_acs <- nrow(data_long)
N_ig <- nrow(obs_17)
all_data <- rbind(data_long, obs_17)
model_data <- list(
N_obs = N_acs + N_ig,
N_acs = N_acs,
N_ig = N_ig,
N_i = length(unique(all_data$i)),
K = 2,
id = all_data$i,
x = matrix(
c(
rep(1, N_acs + N_ig),
as.numeric(scale(all_data$t))
),
ncol = 2
),
y = as.numeric(scale(all_data$p_i)),
sigma_ig = 1
)
# Fit model
fit3 <- stan(
model_code = stan_model,
data = model_data,
chains = 4,
warmup = 1000,
iter = 2000,
cores = parallel::detectCores(),
refresh = 0
)
samples <- extract(fit3)
# -------------
# prediction function
# -------------
predict <- function(x_new, ids, samples) {
x_new <- as.numeric(scale(x_new))
n_draws <- dim(samples$beta)[1]
n_data <- length(x_new)
pb <- txtProgressBar(min = 0, max = n_draws*n_data, style = 3)
p <- 0
y <- matrix(nrow = n_draws, ncol = n_data)
for(k in seq_len(n_draws)) {
for(j in seq_len(n_data)) {
i <- ids[j]
beta_0 <- samples$beta[k,i,1]
beta_1 <- samples$beta[k,i,2]
y[k,j] <- beta_0 + beta_1*x_new[j]
p <- p + 1
setTxtProgressBar(pb, p)
}
}
close(pb)
y_pred <- colMeans(y)
return(y_pred)
}
y_pred <- predict(all_data$t, all_data$i, samples)
all_data$yhat <- y_pred * sd(all_data$p_i) + mean(all_data$p_i)
# plot fit lines
all_data %>%
filter(i %in% sample(1:n, min(20, n))) %>%
ggplot(aes(x = t, y = p_i)) +
geom_path() +
geom_path(aes(y = yhat), color = 'red', linetype = 'dashed') +
geom_smooth(data = all_data %>% filter(t < 17), method = 'lm', color = 'blue', fill = "NA") +
geom_smooth(data = all_data, method = 'lm', color = 'red', fill = "NA") +
geom_point(data = true_17, aes(x = 17, y = true_17), color = 'darkgreen') +
geom_point(data = obs_17, aes(x = 17, y = p_i), color = 'red') +
facet_wrap(.~i)