Confused about why the logitudinal tractory model codes moves extremely slow

Could some one help me about these Rstan code? It seems to work extremely slow, and it reports when I run it in R:
SAMPLING FOR MODEL ‘anon_model’ NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 0.090726 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 907.26 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 10000 [ 0%] (Warmup)

The stan code are:

data {
  int<lower=1> I; // Total number of observations
  int<lower=1> Q; // Total numbers of covariates
  int<lower=1> J; // Numbers of repeated measurements in time dimension
  int<lower=1> S; // Dimension of alpha
  int<lower=1> L; // Length of each beta
  int<lower=1> K; // Total number of classes
  int<lower=1, upper=K> g[I]; // Class of each observation
  real y[I, Q, J];  // observed data
  real Z[I, Q, J, S];  // covariates for alpha
  real B[I, Q, J, L];  // covariates for beta
  int<lower=0, upper=1> data_index[I, Q, J]; // Indicator for non-missing data
}

parameters {
  vector[S] alpha[Q];
  vector[L] beta[K, Q];
  cholesky_factor_corr[Q] L_Sigma_Omega;
  vector<lower=0>[Q] L_sigma;
  vector[Q] omega[I];
  vector<lower=0>[Q] sigma2;
}

model {
  for (q in 1:Q) {
    alpha[q] ~ multi_normal(rep_vector(0, S), diag_matrix(rep_vector(100, S))); // Prior for alpha
    sigma2[q] ~ inv_gamma(1, 1); // Prior for sigma2
  }
  
  for (k in 1:K) {
    for (q in 1:Q) {
      beta[k, q] ~ multi_normal(rep_vector(0, L), diag_matrix(rep_vector(100, L))); // Prior for beta
    }
  }
  
  L_Sigma_Omega ~ lkj_corr_cholesky(1);
  L_sigma ~ cauchy(0, 2.5);
  
  for (i in 1:I) {
    matrix[Q, Q] L_Sigma;
    L_Sigma = diag_pre_multiply(L_sigma, L_Sigma_Omega);
    omega[i] ~  multi_normal_cholesky(rep_vector(0, Q), L_Sigma); // Prior for omega
  }
  
  // vectorized code
  for (i in 1:I) {
    for (q in 1:Q) {
      vector[J] mu;
      vector[S] alpha_q = alpha[q];
      int g_i = g[i];
      vector[L] beta_giq = beta[g_i, q];
      vector[L] beta_Kq = beta[K, q];
      
      for (j in 1:J) {
        if (data_index[i, q, j]) {
          vector[S] Z_iq = to_vector(Z[i, q, j, :]);
          vector[L] B_iq = to_vector(B[i, q, j, :]);
          
          if (g_i == K) {
            mu[j] = dot_product(Z_iq, alpha_q) + dot_product(B_iq, beta_Kq) + omega[i, q];
          } else {
            mu[j] = dot_product(Z_iq, alpha_q) + dot_product(B_iq, beta_giq) + dot_product(B_iq, beta_Kq) + omega[i, q];
          }
        }
      }
      y[i, q] ~ normal(mu, sqrt(sigma2[q]));
    }
  }
}

As for the simulation, I write it in R with:
library(MASS)
library(Matrix)
library(“rstan”)
library(splines)
library(mvtnorm)
rstan_options(auto_write = TRUE)

Set dimensions

I ← 1000
Q ← 10
J ← 10
S ← 3
L ← 3
K ← 5
t_min ← 1
t_max ← 5

Create storage for data

g ← integer(I)
y ← array(NA, c(I, Q, J))
Z ← array(NA, c(I, Q, J, S))
B ← array(NA, c(I, Q, J, L))
data_index ← array(1, c(I, Q, J))

Generate data

set.seed(1234) # Set seed for reproducibility

Generate the matrix B(I,Q,J,L)

t ← matrix(nrow = I, ncol = Q) # first generate random t
for (i in 1:I) {
means ← runif(Q, min = t_min, max = t_max) # uniformly distributed means between t_min and t_max
sigmas ← runif(Q, min = 1, max = 2) # uniformly distributed standard deviations between 1 and 2
covariance ← diag(sigmas) # assume independence so covariance matrix is diagonal
t[i,] ← mvtnorm::rmvnorm(1, mean = means, sigma = covariance) # draw from multivariate normal
}

spline_basis ← ns(seq(t_min, t_max, by=0.1), df=L)

apply B-spline basis function

for (i in 1:I) {
for (q in 1:Q){
data_index_iq ← which(data_index[i,q,] == 1) # index of data for i-th subject at q-th response # index of data for i-th subject at q-th response
B[i,q,data_index_iq,] ← predict(spline_basis, t[i,data_index_iq])
}
}

normalize B-spline basis matrix

for (l in 1:L) {
B_hat ← B[ , , , l][!is.na(B[ , , , l])]
B[ , , , l] ← (B[ , , , l] - mean(B_hat)) / sd(B_hat)
}

for (i in 1:I) {
g[i] ← ((i-1) %% K) + 1
for (q in 1:Q) {
# Generate Z and B
Z[i, q, , ] ← MASS::mvrnorm(J, rep(0, S), diag(S))
}
}

Generate alpha, beta_0, beta_k, and Omega

alpha ← mvrnorm(Q, rep(1, S), diag(S))
beta_0 ← mvrnorm(1, rep(0, L), diag(L))
beta_k ← mvrnorm(K-1, rep(0, L), diag(L))
Omega ← matrix(0, I, Q)
epsilon ← matrix(0, I, Q)

Generate Omega and epsilon

for (i in 1:I) {

Generate Omega[i,]

p ← qr.Q(qr(matrix(rnorm(Q^2), Q)))
Sigma ← crossprod(p, p*(Q:1)*0.2)
Omega[i, ] ← mvrnorm(1, rep(0, Q), Sigma)
}

for (q in 1:Q) {

Generate sigma^2 from an Inverse Gamma distribution

sigma_sq ← 1 / rgamma(1, shape=1, scale=1)

Generate epsilon[ ,q] from a normal distribution with mean 0 and variance sigma^2

epsilon[ ,q] ← rnorm(I, mean=0, sd=sqrt(sigma_sq))
}

Generate Y

Y ← array(0, c(I, Q, J))
for (i in 1:I) {
for (q in 1:Q) {
# calculate B_iq and Z_iq
B_iq ← B[i, q, , ]
Z_iq ← Z[i, q, , ]

if (g[i] == K) {
  Y_iq <- Z_iq %*% alpha[q,] + B_iq %*% beta_0 + 
    rep(Omega[i,q], J) + 
    rep(epsilon[i,q], J)
} else {
  Y_iq <- Z_iq %*% alpha[q,] + B_iq %*% beta_0 + 
    B_iq %*% beta_k[g[i]] + 
    rep(Omega[i,q], J) + 
    rep(epsilon[i,q], J)
  }

# Store Y_iq in Y
Y[i, q, ] <- Y_iq

}
}

Generate data_index_nonzero

data_index_nonzero ← array(0, c(I, Q, J))
for(i in 1:I) {
for(q in 1:Q) {
data_index_nonzero[i, q, ] ← which(data_index[i, q, ] == 1)
}
}

nz ← matrix(NA, I, Q)
for (i in 1:I) {
for (q in 1:Q) {
nz[i, q] ← sum(data_index[i, q, ])
}
}

List the data to be passed to Stan

stan_data ← list(I = I, Q = Q, J = J, S = S, L = L, K = K, g = g, y = y, Z = Z, B = B,
data_index = data_index, data_index_nonzero = data_index_nonzero,
nz = nz)
fit ← stan(file = ‘D:\Research\May\trajectory\stancode\try1\V3.stan’,
data = stan_data, iter = 10000, warmup = 1000, chains = 4)

And my model is defined as:

\boldsymbol{Y}_{\boldsymbol{iq}}=\boldsymbol{Z}_{\boldsymbol{iq}} \boldsymbol{\alpha_q}+\boldsymbol{B}_{iq} \beta_{\mathbf{0}}+\sum_{k=1}^{K-1} I\left(g_i=k\right) \boldsymbol{B}_{iq} \boldsymbol{\beta}_{\boldsymbol{k}}+\boldsymbol{\omega_{iq}}+\boldsymbol{\epsilon_{iq}}

and

\begin{gathered} \boldsymbol{\omega}_i=\left(\omega_{i 1}, \ldots, \omega_{i Q}\right) \sim \mathcal{N}\left(\mathbf{0}, \boldsymbol{\Sigma}_{\boldsymbol{\omega}}\right), \\ \epsilon_{i q j} \sim \mathcal{N}\left(0, \sigma_q^2\right) . \end{gathered}

The prior is defined as:

\begin{gathered} \boldsymbol{\alpha}_q \sim \mathcal{N}\left(\mathbf{0}, 100 \boldsymbol{I}_S\right), \\ \boldsymbol{\beta}_{k q} \sim \mathcal{N}\left(\mathbf{0}, 100 \boldsymbol{I}_L\right), \\ p\left(\boldsymbol{\Sigma}_{\boldsymbol{\omega}}\right) \propto 1, \\ \sigma_q^2 \sim \text { Inverse-Gamma }(1,1) \end{gathered}

I would really appreciate that if someone could know why it goes so slow, it almost takes 30minutes for each iteration! I will reply immediately!