I am currently running MCMC to sample a posterior distribution. The model used is Hierarchical Bayesian model, and my Stan program is below:
data {
int<lower=1> ncol;
int<lower=1> nrow;
vector[ncol] yH;
vector[nrow] x;
matrix[nrow, ncol] A;
vector[nrow] sigma_x;
vector[ncol] sigma_y;
#matrix[nrow,ncol] sigma_x; #covariance matrix for sampling yT from multiNormal(u_1, sigma_x)
#matrix[nrow,ncol] sigma_y; #covariance matrix for sampling x from multiNormal(u_2, sigma_y)
vector[nrow] epsilon;
}
parameters {
vector<lower = 0>[ncol] yT;
}
model {
yT ~ normal(yH, sigma_y); //prior
//yT ~ multi_normal(yH, sigma_y);
x ~ normal(A*yT + epsilon, sigma_x); //likelihood
//x ~ multi_normal(A*yT + epsilon, sigma_x);
}
Below is my Rcode to run the Stan model above:
library(rstan)
library(MASS)
#library(truncnorm)
library(bayesplot)
library(bridgesampling)
library(MCMCpack)
library(bayesplot)
library(ggplot2)
library(coda)
library(clusterGeneration)
require(MASS)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
#data preparation for inputting into Stan
for(i in 1:15){
nrow = nrow(rte_m[[i]]);
ncol = ncol(rte_m[[i]]);
A <- as.matrix(rte_m[[i]]);
sigma_x <- as.vector(sample.int(10, nrow(kf_vect[[i]]), replace=TRUE))
sigma_y <- as.vector(eps_vect[[i]])
yH <- as.vector(dh_vect[[i]]$X);
yT <- yH + as.vector(eps_vect[[i]]);
epsilon <- sample.int(15, nrow(kf_vect[[i]]), replace=TRUE)
x <- as.vector(as.matrix(rte_m[[i]])%*%yT) + epsilon
iterations = 100;
#input data into a list called stan_data
# stan_data = list(nrow = nrow, ncol = ncol,
# yH = yH,
# x = x, epsilon = epsilon,
# A = A, sigma_x = sigma_x, sigma_y = sigma_y);
#input it into our Stan model file "stamodeling.stan"
stanmodel1 <- stan_model(file = "stamodeling.stan",
model_name = "stanmodel1");
#NUTS (No U-Turn) sampler to generate posterior distribution
stanfit <- sampling(stanmodel1, data = list(ncol = ncol,nrow = nrow,
yH = yH,
x=x, epsilon = epsilon,
A = A, sigma_x = sigma_x, sigma_y = sigma_y)
,iter=iterations, chains = 3, cores = 2, control = list(max_treedepth = 15));
print(i)
#extract the result and plot graph
posterior_yTtheta <- as.array(stanfit)
file_name10 <- paste("Comparison_KFnnls_Day", i, ".jpeg")
jpeg(file_name10)
plot(yH, col=1, type='l', ylab="d", xlab = "Zone index", ylim=c(0,max(yT)))
lines(colMeans(posterior_yTtheta[,2,]),col=2, type = 'l', ylab="d", xlab = "Zone index")
lines(yT,col=3)
lines(mean_kf[[i]], col=4, type = 'l', ylab = "d")
legend(x = "topleft",legend = c("prior", "recovered", "true", "Kalman-Filter"), col=1:4, lty=c(1,1,1,1))
dev.off()
fit_summary <- summary(stanfit)
file_name6 = paste("resultsummary_KFnnls_ Day", i)
jpeg(file_name6)
print(fit_summary$summary)
dev.off()
color_scheme_set("mix-brightblue-gray")
file_name = paste("Rhatnnls_histogram_Day", i)
jpeg(file_name)
rhats <- rhat(stanfit)
rhat_plt <- mcmc_rhat_hist(rhats) + yaxis_text(hjust = 1) + ggtitle(paste("Rhatnnls_histogram_Day", i))
print(rhat_plt)
dev.off()
}
My question Can anyone please help aid me in some ways to speed up this sampling process?? As you can see from the code, I am doing MCMC sampling with high-dimensional, large dataset (matrix A might be 875\times 875). I let Rstudio run for 18 hours straight on a Mac Air with 8GB RAM and 65.7 GB available in memory, but it is unable to complete 3 chains with only 100 iterations. I even turn on options(mc.cores = parallel::detectCores())
and rstan_options(auto_write = TRUE)
, but nothing helps speed up the sampling process.
Another thing I found weird is that I frequently encountered hash mismatching so recompiling....
message, even though I always click on rstan_options(auto_write = TRUE)
and check my dev.off()
to make sure it is 1
.
Link to the datafile loaded onto R
Here is the blog of activity monitor on my laptop: