Add GPU in Dynamic Liner Stan Model

Hi there,

We have a dynamic linear model and we would like to add GPU option to increase the speed of the model. However, the speed didn’t increase much after adding the GPU. I wonder if anyone can help take a look at the model and see how to revise the code to increase GPU speed. In addition, the current model is in the Mac system, does anyone know how to revise that to run it in a Linux system?

Thanks a lot and looking forward to your reply!

library(cmdstanr)
library(rstan)
library(mvtnorm)
library(MCMCpack)
library(writexl)
library(openxlsx)

rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

set.seed(123)  
set_cmdstan_path("/Downloads/cmdstan-2.33.1")

#############################
# 1. Setup the Stan Model ##
#############################

modelst ="
data {
  int<lower=0> T;        // Time points
  int<lower=0> N;        // Number of panels
  int<lower=0> K1;        // State dimension (number of time varying parameters)
  int<lower=0> K2;        // State dimension (number of time varying parameters)
  int<lower=0> K3;        // State dimension (number of time varying parameters)
  matrix[3, T * N] Y;    // Observations (Dependent Variables)
  matrix[K1, T * N] F1;   // F matrix for observation equation 1
  matrix[K2, T * N] F2;   // F matrix for observation equation 2
  matrix[K3, T * N] F3;   // F matrix for observation equation 3
  matrix[4, T * N] G1;   // Independent variables for observation equation 1
  matrix[3, T * N] G2;   // Independent variables for observation equation 2
  matrix[4, T * N] G3;   // Independent variables for observation equation 3
}

parameters {
  vector[N] alpha1;                       // Panel-specific intercepts for observation equation 1
  vector[N] alpha2;                       // Panel-specific intercepts for observation equation 2
  vector[N] alpha3;                       // Panel-specific intercepts for observation equation 3
  matrix[K1, T] s1;                       // States (Time Varying Coefficients theta) for observation equation 1
  matrix[K2, T] s2;                       // States (Time Varying Coefficients theta) for observation equation 2
  matrix[K3, T] s3;                       // States (Time Varying Coefficients theta) for observation equation 3
  vector[4] beta1;                       // Non Time Varying Coefficients for G1
  vector[3] beta2;                       // Non Time Varying Coefficients for G2
  vector[4] beta3;                       // Non Time Varying Coefficients for G3
  vector<lower=0>[3] V_diag;
  real<lower=1> nu;                      // Ensure nu is always positive and greater than 1
  real<lower=0> sigma_s1;                // System noise for state 1
  real<lower=0> sigma_s2;                // System noise for state 2
  real<lower=0> sigma_s3;                // System noise for state 3
}
  

model {
// Add print statements for debugging
  matrix[3, 3] V = diag_matrix(V_diag); // Construct V from diagonal elements
  // Observation Equations - Likelihood
  for(n in 1:N) {
    for(t in 1:T) {
      int index = t + (n - 1) * T;
      vector[3] mu;
      mu[1] = alpha1[n] + dot_product(F1[:, index], s1[:, t]) + dot_product(beta1, G1[:, index]);
      mu[2] = alpha2[n] + dot_product(F2[:, index], s2[:, t]) + dot_product(beta2, G2[:, index]);
      mu[3] = alpha3[n] + dot_product(F3[:, index], s3[:, t]) + dot_product(beta3, G3[:, index]);
      Y[:, index] ~ multi_student_t(nu, mu, V);
    }
  }
  
  // State Equations - State transitions
  s1[:, 1] ~ normal(0, 1);  // Initial state priors
  s2[:, 1] ~ normal(0, 1);
  s3[:, 1] ~ normal(0, 1);
  for(t in 2:T) {
    s1[:, t] ~ normal(s1[:, t-1], sigma_s1);
    s2[:, t] ~ normal(s2[:, t-1], sigma_s2);
    s3[:, t] ~ normal(s3[:, t-1], sigma_s3);
  }

  // Priors for Model Parameters
  alpha1 ~ normal(0, 10);
  alpha2 ~ normal(0, 10);
  alpha3 ~ normal(0, 10);
  beta1 ~ normal(0, 2);
  beta2 ~ normal(0, 2);
  beta3 ~ normal(0, 2);
  nu ~ gamma(2, 0.5); // Example: Gamma distribution with shape=2, rate=0.5
  V_diag ~ normal(1, 0.1); // Mean of 1 and a small standard deviation
  sigma_s1 ~ normal(0, 1);
  sigma_s2 ~ normal(0, 1);
  sigma_s3 ~ normal(0, 1);
}
"

stan_file <- write_stan_file(modelst)
stanmodel <- cmdstan_model(stan_file, cpp_options = list(stan_opencl = TRUE))


##################################
## 2. Generating Simulated data ##
##################################

panel_data <- read.csv("/a.csv")

# Settings
N <- 93        # Number of panels
T <- 29        # Time points
K1 <- 10      # State1 dimension
K2 <- 4        # State2 dimension
K3 <- 4        # State3 dimension

# Simulate state dynamics
state_transition <- function(prev_state, sigma) {
  rnorm(length(prev_state), prev_state, sigma)
}

# Initial states (Time Varying Coefficients)
s1_initial <- rnorm(K1, 0, 0.1)
s2_initial <- rnorm(K2, 0, 0.1)
s3_initial <- rnorm(K3, 0, 0.1)

s1 <- matrix(0, K1, T)
s2 <- matrix(0, K2, T)
s3 <- matrix(0, K3, T)

s1[, 1] <- s1_initial
s2[, 1] <- s2_initial
s3[, 1] <- s3_initial

for(t in 2:T) {
  s1[, t] <- state_transition(s1[, t-1], 1)
  s2[, t] <- state_transition(s2[, t-1], 1)
  s3[, t] <- state_transition(s3[, t-1], 1)
}

# Panel-specific intercepts
alpha1 <- rnorm(N, 0, 0.1)
alpha2 <- rnorm(N, 0, 0.1)
alpha3 <- rnorm(N, 0, 0.1)

# Non Time Varying Coefficients
beta1 <- rnorm(4, 0, 0.1)
beta2 <- rnorm(3, 0, 0.1)
beta3 <- rnorm(4, 0, 0.1)

Y1_matrix <- matrix(panel_data$logbox_t, ncol = T*N, byrow = TRUE)
Y2_matrix <- matrix(panel_data$logvalenceaud_t, ncol = T*N, byrow = TRUE)
Y3_matrix <- matrix(panel_data$logvolumeaud_t, ncol = T*N, byrow = TRUE)

# Stack the matrices to create Y
Y <- rbind(Y1_matrix, Y2_matrix, Y3_matrix)
#create F1
X11_matrix <- matrix(panel_data$x11_t_1, ncol = T*N, byrow = TRUE)
X12_matrix <- matrix(panel_data$x12_t_1, ncol = T*N, byrow = TRUE)
X13_matrix <- matrix(panel_data$x13_t_1, ncol = T*N, byrow = TRUE)
X14_matrix <- matrix(panel_data$x14_t_1, ncol = T*N, byrow = TRUE)
X15_matrix <- matrix(panel_data$x15_t_1, ncol = T*N, byrow = TRUE)


# Stack the matrices to create F3
F1 <- rbind(X11_matrix, X12_matrix, X13_matrix,X14_matrix,X15_matrix)


#create F2
X21_matrix <- matrix(panel_data$x21_t_1, ncol = T*N, byrow = TRUE)
X22_matrix <- matrix(panel_data$x22_t_1, ncol = T*N, byrow = TRUE)
X23_matrix <- matrix(panel_data$x23_t_1, ncol = T*N, byrow = TRUE)
X24_matrix <- matrix(panel_data$x24_t_1, ncol = T*N, byrow = TRUE)


# Stack the matrices to create F3
F2 <- rbind(X21_matrix, X22_matrix, X23_matrix,X24_matrix)

#create F3
X31_matrix <- matrix(panel_data$x31_t_1, ncol = T*N, byrow = TRUE)
X32_matrix <- matrix(panel_data$x32_t_1, ncol = T*N, byrow = TRUE)
X33_matrix <- matrix(panel_data$x33_t_1, ncol = T*N, byrow = TRUE)
X34_matrix <- matrix(panel_data$x34_t_1, ncol = T*N, byrow = TRUE)

F3 <- rbind(X31_matrix, X32_matrix, X33_matrix,X34_matrix)

#seven time-invariant variables
W11_matrix <- matrix(panel_data$w11_t, ncol = T*N, byrow = TRUE)
W21_matrix <- matrix(panel_data$w21_t, ncol = T*N, byrow = TRUE)
W31_matrix <- matrix(panel_data$w22_t, ncol = T*N, byrow = TRUE)
W41_matrix <- matrix(panel_data$w23_t, ncol = T*N, byrow = TRUE)

G1 <- rbind(W11_matrix, W21_matrix,W31_matrix,W41_matrix)

W12_matrix <- matrix(panel_data$w12_t, ncol = T*N, byrow = TRUE)
W22_matrix <- matrix(panel_data$w22_t, ncol = T*N, byrow = TRUE)
W32_matrix <- matrix(panel_data$w22_t, ncol = T*N, byrow = TRUE)

G2 <- rbind(W12_matrix, W22_matrix,W32_matrix)

W13_matrix <- matrix(panel_data$w13_t, ncol = T*N, byrow = TRUE)
W23_matrix <- matrix(panel_data$w23_t, ncol = T*N, byrow = TRUE)
W33_matrix <- matrix(panel_data$w33_t, ncol = T*N, byrow = TRUE)
W43_matrix <- matrix(panel_data$w43_t, ncol = T*N, byrow = TRUE)

G3 <- rbind(W13_matrix, W23_matrix, W33_matrix,W43_matrix)

V_diag <- matrix(c(1, 0.5, 0.3, 0.5, 1, 0.2, 0.3, 0.2, 1), 3)  # Example covariance matrix
nu <- 5  # Degrees of freedom for t-distribution
#V_diag <- rep(1, 3)

for(n in 1:N) {
  for(t in 1:T) {
    index <- t + (n - 1) * T
    mu <- c(alpha1[n] + crossprod(F1[, index], s1[, t]) + crossprod(beta1, G1[, index]),
            alpha2[n] + crossprod(F2[, index], s2[, t]) + crossprod(beta2, G2[, index]),
            alpha3[n] + crossprod(F3[, index], s3[, t]) + crossprod(beta3, G3[, index]))
    
    Y[, index] <- rmvt(n = 1, delta = mu, sigma = V_diag, df = nu)
  }
}

stan_data <- list(T = T, N = N, K1 = K1, K2=K2, K3=K3, Y = Y, F1 = F1, F2 = F2, F3 = F3, G1 = G1, G2 = G2, G3 = G3, V_diag=V_diag)

# specify inital valeus funtion
#init_func <- function() {
 # list(alpha1 = rnorm(N,0,0.1), alpha2 = rnorm(N,0,0.1),alpha3 = rnorm(N,0,0.1),beta1 = rnorm(4,0,0.1), beta2 = rnorm(3,0,0.1),beta3=rnorm(4, 0, 0.1),s1_initial <- rnorm(K1, 0, 0.1), s2_initial <- rnorm(K2, 0, 0.1), s3_initial <- rnorm(K3, 0, 0.1),s1 <- matrix(0, K1, T), s2 <- matrix(0, K2, T), s3 <- matrix(0, K3, T),  nu=5, V ~ inv_wishart(4, diag_matrix(rep_vector(1, 3))),sigma_s1 ~ normal(0, 1), sigma_s2 ~ normal(0, 1), sigma_s3 ~ normal(0, 1))
#}


# Fit the model using CmdStanR with GPU
fit_stan <- stanmodel$sample(
  data = stan_data,
  chains = 4,
  iter_warmup = 4500,
  iter_sampling = 4500,
  refresh = 2,
  #control = list(max_treedepth = 15, adapt_delta = 0.995),
  opencl_ids = c(0, 2),  # Specify your GPU device IDs if needed
  #init = init_func
)


If V is diagonal matrix then the student_t distribution would be sufficient.

[quote=“mandy_j, post:1, topic:33826”]

I’m confident that the loops can be optimized through vectorization. Achieving notable speed-ups on a GPU is unlikely unless your code has been vectorized.