Hi all,
I am fitting a Gaussian process model on about 200 observations with 4 fixed covariates, and I would like to use this model to predict at about 5000 locations. To reduce the dimension of the problem, I am passing the parameters of the exponential covariance as constants, having estimated them by fitting a variogram to the residuals of a GLM.
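For reference, the variogram step looked roughly like this (a minimal sketch: `resid` stands for the GLM residuals, the name `empirical_variogram` and the binning are just illustrative, and the actual fit of the exponential model to the binned values is omitted):
import numpy as np
from scipy.spatial.distance import pdist

def empirical_variogram(coords, resid, n_bins=15):
    # classical (Matheron) estimator: half the mean squared residual
    # difference within each distance bin
    d = pdist(coords)                     # pairwise distances
    g = 0.5 * pdist(resid[:, None]) ** 2  # pairwise semivariances
    edges = np.linspace(0.0, d.max(), n_bins + 1)
    idx = np.digitize(d, edges) - 1
    centers = 0.5 * (edges[:-1] + edges[1:])
    gamma = np.array([g[idx == b].mean() for b in range(n_bins)])
    return centers, gamma

# the exponential model gamma(h) = tau^2 + sigma^2 * (1 - exp(-pho * h))
# is then fit to (centers, gamma) to get the constants passed to Stan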
The program has been running for more than 7 hours; is this to be expected? And do you know of any way I can speed it up?
I have attached the data, Python code, and Stan model.
Thanks!!
Data:
data.txt (761.0 KB)
Python code:
import numpy as np
import json
import pystan
with open('data.json') as f:
    data_json = json.load(f)
N1 = data_json['N1']                      # number of non-missing obs.
N = data_json['N']                        # number of prediction points
K = data_json['K']                        # number of covariates (including intercept)
X1 = np.array(data_json['X1'])            # covariates (including intercept) for non-missing obs.
X = np.array(data_json['X'])              # covariates (including intercept) for prediction points
y1 = np.array(data_json['y1'])            # response variable for non-missing obs.
coords1 = np.array(data_json['coords1'])  # coords of non-missing obs.
coords = np.array(data_json['coords'])    # coords of prediction points
pho = data_json['pho']                    # range
sigma = data_json['sigma']                # sill
tau = data_json['tau']                    # nugget
data = {'N1': X1.shape[0], 'N': X.shape[0], 'K': X1.shape[1],
        'X1': X1, 'X': X, 'y1': y1, 'coords1': coords1, 'coords': coords,
        'pho': pho, 'sigma': sigma, 'tau': tau}
# Compile the model
stanm = pystan.StanModel(file='./stan/GP_gamma_model_exact.stan')
# Train the model and generate samples
stan_fit = stanm.sampling(data=data, iter=1000, chains=3,
                          warmup=500, thin=1, seed=101)
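Afterwards I pull out the posterior predictive draws at the prediction points (a sketch using pystan's extract; the shape comment assumes the default permuted=True):
# posterior predictive draws at all N points, chains merged
y_rep_draws = stan_fit.extract(pars=['y_rep'])['y_rep']  # shape (n_draws, N)
y_pred = y_rep_draws.mean(axis=0)  # e.g. posterior predictive mean per location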
Stan code:
functions {
  // Cholesky factor of the exponential covariance, built from packed pairwise distances
  matrix cov_exp_L(real sigmasq, real tausq, real pho, vector vdist, int N) {
    int h = 0;
    matrix[N, N] K;
    for (j in 1:(N - 1)) {
      K[j, j] = sigmasq + tausq;
      for (k in (j + 1):N) {
        h = h + 1;
        K[j, k] = sigmasq * exp(-pho * vdist[h]);
        K[k, j] = K[j, k];
      }
    }
    K[N, N] = sigmasq + tausq;
    return cholesky_decompose(K);
  }
  // packed vector of pairwise distances between the rows of coords
  vector get_vdist(matrix coords) {
    int h = 0;
    int N = dims(coords)[1];
    vector[N * (N - 1) / 2] vdist;
    for (j in 1:(N - 1)) {
      for (k in (j + 1):N) {
        h = h + 1;
        vdist[h] = distance(coords[j, ], coords[k, ]);
      }
    }
    return vdist;
  }
}
data {
  int N;                 // the number of observations
  int N1;                // the number of observations (non-missing)
  int K;                 // the number of columns in the model matrix
  matrix[N, K] X;        // the model matrix
  matrix[N, 2] coords;   // the coordinates matrix
  real y1[N1];           // the response (non-missing)
  matrix[N1, K] X1;      // the model matrix (non-missing)
  matrix[N1, 2] coords1; // the coordinates matrix (non-missing)
  real sigma;            // sill
  real tau;              // nugget
  real pho;              // range
}
transformed data {
  vector[N * (N - 1) / 2] vdist;
  matrix[N, N] L;
  vdist = get_vdist(coords);
  // Cholesky factor computed once, since the covariance parameters are constants
  L = cov_exp_L(square(sigma), square(tau), pho, vdist, N);
}
parameters {
  vector[K] betas;           // the regression parameters
  real<lower=0> inverse_phi; // the inverse dispersion (shape) parameter
  vector[N] eta;             // standard-normal variates for the non-centered GP
}
transformed parameters {
  vector[N] mu;     // the expected values
  vector[N] beta;   // rate parameter for the gamma distribution
  vector[N1] mu1;   // the expected values (non-missing)
  vector[N1] beta1; // rate parameter for the gamma distribution (non-missing)
  vector[N] gp;     // latent GP values
  gp = L * eta;
  mu1 = exp(X1 * betas + gp[1:N1]); // using the log link
  beta1 = rep_vector(inverse_phi, N1) ./ mu1;
  mu = exp(X * betas + gp);         // using the log link
  beta = rep_vector(inverse_phi, N) ./ mu;
}
model {
  betas ~ cauchy(0, 2.5);       // prior on the fixed effects
  inverse_phi ~ exponential(1); // prior on the inverse dispersion parameter
  eta ~ std_normal();
  y1 ~ gamma(inverse_phi, beta1);
}
generated quantities {
  vector[N] y_rep;
  for (n in 1:N) {
    y_rep[n] = gamma_rng(inverse_phi, beta[n]); // posterior draws for posterior predictive checks
  }
}