Hello all,
I am trying to fit a GP model where I cannot observe time directly but I know the event happened at some point between a start date and an end date. At this stage there is no way to know the duration of the event or when it most likely happened. For each time range, I have an observed variable I want to measure, and the goal is to get a measurement error for that value that is accounting for the uncertainty in these observations. Also, I want to stratify by four different categories (animal species in this example) so that I get a curve for each.
I do not come from a mathematical/CS background (but from Social Sciences) so I apologise in advance for the trivial question and for obviously resorting to other automated forms of help before asking here - as I never got formal training in Statistics.
With a subset the sampling works fine, and the results on a simulated dataset are not far off. However, with my real dataset (5,000+ observations) the warmup has not even started after 48 hours. Googling a bit, I found the Hilbert Space GP approximation, which avoids decomposing the full covariance matrix (which might explain why my model takes so long), but — as far as I can tell — it would not be straightforward to keep the measurement error in the predictor (time) propagated into the observed variable.
I have also uploaded for convenience the scripts for the simulation (which work, because they have a small number of observations). If you prefer, I can also upload my real dataset (it comes from an open-access publication). I have also uploaded a screenshot of what the data look like in terms of time ranges and the values they represent.
Very sorry for the long post, but I really don’t know what steps to take to fit this model (or another model with the same goal). I have also thought of something similar to spatial modelling, where points that are close in space with similar values are condensed into one, but I really wouldn’t know how to do that in Stan, nor how I would keep the measurement errors that way.
I would appreciate any help or lead into how to manage a large dataset for GP, or alternatives, or anything I can try to study to solve this problem.
Thank you so much in advance!
Model Script
data {
int<lower=1> N; // Total number of observations
int<lower=1> N_taxa; // Number of taxa (one independent GP is fit per taxon)
array[N] int<lower=1, upper=N_taxa> taxon_id; // Taxon indicator for each obs
array[N] real y; // Observed LSI values (response)
array[N] real start_date; // Lower bound of each observation's date range
array[N] real end_date; // Upper bound of each observation's date range; should be >= start_date (not enforced here)
// Prediction grid (shared across taxa)
int<lower=1> N_pred; // Number of prediction points
array[N_pred] real x_pred; // Prediction time points, on the same scale as start_date/end_date
}
transformed data {
// Rescale all dates to [0, 1]: GP length-scales and the diagonal jitter
// behave much better numerically when inputs are on a unit-ish scale.
real time_min = min(start_date);
real time_max = max(end_date);
real time_range = time_max - time_min;
// Normalized start/end dates
array[N] real start_norm;
array[N] real end_norm;
// Normalized prediction points
array[N_pred] real x_pred_norm;
// Guard against degenerate input: identical min/max dates would divide by zero.
if (time_range <= 0) {
  reject("time_range must be positive; all start/end dates appear identical (time_range = ", time_range, ").");
}
for (n in 1:N) {
  // Validate each interval: an inverted range would silently flip the sign
  // of the latent-date interpolation in transformed parameters.
  if (end_date[n] < start_date[n]) {
    reject("end_date must be >= start_date; violated at observation ", n);
  }
  start_norm[n] = (start_date[n] - time_min) / time_range;
  end_norm[n] = (end_date[n] - time_min) / time_range;
}
for (p in 1:N_pred) {
  x_pred_norm[p] = (x_pred[p] - time_min) / time_range;
}
}
parameters {
// Latent true event dates on the unit interval; mapped onto each
// observation's [start, end] range in transformed parameters. The bounds
// give an implicit uniform(0, 1) prior.
array[N] real<lower=0, upper=1> true_date_raw;
// GP hyperparameters (one set per taxon, since each taxon gets its own GP)
array[N_taxa] real<lower=0> alpha; // Marginal standard deviation of the GP
array[N_taxa] real<lower=0> rho; // Length-scale on the normalized [0, 1] time axis
// Observation noise SD, shared across taxa
real<lower=0> sigma;
// Constant GP mean (mean LSI) for each taxon
array[N_taxa] real mu;
}
transformed parameters {
// Latent event dates on the normalized [0, 1] time scale. Each raw date
// in (0, 1) is mapped affinely onto its observation's normalized
// [start, end] interval, so the implied prior on the true date is
// uniform over that interval.
array[N] real true_date_norm;
for (i in 1:N) {
  real lo = start_norm[i];
  real hi = end_norm[i];
  true_date_norm[i] = lo + (hi - lo) * true_date_raw[i];
}
}
model {
// --- Priors on GP hyperparameters ---------------------------------------
alpha ~ normal(0, 0.5); // Marginal SD (half-normal via lower=0 constraint)
rho ~ inv_gamma(5, 1); // Length-scale; inv-gamma keeps it away from 0
sigma ~ exponential(10); // Observation noise SD
mu ~ normal(0, 0.5); // Per-taxon mean LSI
// Uniform prior on true dates (implicit via bounds on true_date_raw)

// --- Likelihood: one independent GP per taxon ---------------------------
for (t in 1:N_taxa) {
  // Count observations belonging to taxon t.
  int n_t = 0;
  for (n in 1:N) {
    if (taxon_id[n] == t) n_t += 1;
  }
  // Skip if no observations for this taxon
  if (n_t > 0) {
    // Gather this taxon's (latent) inputs and observed outputs.
    array[n_t] real x_t;
    vector[n_t] y_t;
    {
      int idx = 1;
      for (n in 1:N) {
        if (taxon_id[n] == t) {
          x_t[idx] = true_date_norm[n];
          y_t[idx] = y[n];
          idx += 1;
        }
      }
    }
    // Marginal covariance: exp-quad kernel plus observation noise and a
    // small jitter on the diagonal (single pass instead of two loops).
    matrix[n_t, n_t] K = gp_exp_quad_cov(x_t, alpha[t], rho[t]);
    real diag_add = square(sigma) + 1e-9;
    for (i in 1:n_t) {
      K[i, i] += diag_add;
    }
    // multi_normal_cholesky with an explicit factorization avoids the
    // redundant internal decomposition multi_normal performs on every
    // leapfrog step; this is the form recommended in the Stan User's
    // Guide chapter on Gaussian processes.
    y_t ~ multi_normal_cholesky(rep_vector(mu[t], n_t), cholesky_decompose(K));
  }
}
}
generated quantities {
// Posterior draws of the latent function at the prediction grid, one row
// per taxon, conditioned on this iteration's latent dates and kernel draws.
array[N_taxa, N_pred] real f_pred;
for (t in 1:N_taxa) {
  // Count observations for this taxon.
  int n_t = 0;
  for (n in 1:N) {
    if (taxon_id[n] == t) n_t += 1;
  }
  if (n_t > 0) {
    // Gather this taxon's inputs and outputs.
    array[n_t] real x_t;
    vector[n_t] y_t;
    {
      int idx = 1;
      for (n in 1:N) {
        if (taxon_id[n] == t) {
          x_t[idx] = true_date_norm[n];
          y_t[idx] = y[n];
          idx += 1;
        }
      }
    }
    // Training covariance: kernel + noise + jitter (one diagonal pass).
    matrix[n_t, n_t] K = gp_exp_quad_cov(x_t, alpha[t], rho[t]);
    real diag_add = square(sigma) + 1e-9;
    for (i in 1:n_t) {
      K[i, i] += diag_add;
    }
    // Cross-covariance (train x grid) via the built-in two-argument kernel,
    // replacing the hand-rolled double loop over squared distances.
    matrix[n_t, N_pred] K_x_xpred = gp_exp_quad_cov(x_t, x_pred_norm, alpha[t], rho[t]);
    // Prior covariance on the grid, jittered for the final rng draw.
    matrix[N_pred, N_pred] K_pred = gp_exp_quad_cov(x_pred_norm, alpha[t], rho[t]);
    for (i in 1:N_pred) {
      K_pred[i, i] += 1e-9;
    }
    // Standard GP conditional (Stan User's Guide, "Predictive inference"):
    //   f* | y ~ N(mu + K*' K^-1 (y - mu), K** - K*' K^-1 K*)
    matrix[n_t, n_t] L_K = cholesky_decompose(K);
    vector[n_t] L_K_div_y = mdivide_left_tri_low(L_K, y_t - mu[t]);
    vector[n_t] K_inv_y = mdivide_right_tri_low(L_K_div_y', L_K)';
    vector[N_pred] f_mean = mu[t] + K_x_xpred' * K_inv_y;
    matrix[n_t, N_pred] v = mdivide_left_tri_low(L_K, K_x_xpred);
    matrix[N_pred, N_pred] f_cov = K_pred - v' * v;
    // Sample from the predictive distribution (jitter keeps f_cov PD).
    f_pred[t] = to_array_1d(multi_normal_rng(f_mean, f_cov + diag_matrix(rep_vector(1e-9, N_pred))));
  } else {
    // No data for this taxon: draw from the GP *prior* the model actually
    // defines (mean mu[t], exp-quad covariance). The previous iid
    // normal_rng(0, alpha[t]) draw ignored both the taxon mean and the
    // between-point correlation.
    matrix[N_pred, N_pred] K_prior = gp_exp_quad_cov(x_pred_norm, alpha[t], rho[t]);
    for (i in 1:N_pred) {
      K_prior[i, i] += 1e-9;
    }
    f_pred[t] = to_array_1d(multi_normal_rng(rep_vector(mu[t], N_pred), K_prior));
  }
}
// Latent dates mapped back to the original calendar scale.
array[N] real true_date_actual;
for (n in 1:N) {
  true_date_actual[n] = time_min + true_date_norm[n] * time_range;
}
}
01_Script_GP_Simulated.R (5.3 KB)
02_Visualise_Simulated_Data.R (5.1 KB)
03_Fit_and_Visualise.R (7.6 KB)




