How to speed up this code

I ran this code in rstan with warmup=5 and iter=6 inside a bigger for loop t=1:4000.
Within each iteration, the stan code is called once by “sampling”, and then the discrete variable Z is updated using Gibbs by my own R code. However, even warmup=5 and iter=6 takes 2-9 seconds, which is time consuming for the whole loop.
From the attached code, can it be speeded up?
Thank you.

GMMBLCA_forum.stan (2.0 KB)

For your convenience, I also copy the code here.
Let me know if you need additional information.
Thanks.

functions{
matrix tan2U(real tan_half, int TT){
matrix[TT,TT] U;
real cos1;
real sin1;
real divisor; //=sqrt(tan_half^2+1)
row_vector[TT] linear1;
row_vector[TT] linear2;
int index;
index=1;
U=diag_matrix(rep_vector(1,TT));
for(i in 1:(TT-1)){
for(j in (i+1):TT){
divisor=tan_half[index]^2+1;
cos1=2/divisor-1;
sin1=2tan_half[index]/divisor;
linear1=cos1
U[i,:]-sin1U[j,:];
linear2=sin1
U[i,:]+cos1U[j,:];
U[i,:]=linear1;
U[j,:]=linear2;
index+=1;
}
}
return U;
}
}
data {
int<lower=0> V; // number of data points
int<lower=0> TT; // dimensionality
int<lower=0> C; // number of GMM components
matrix[TT,V] X; // prewhitened data matrix: Notice that each column is a sample
int<lower=0,upper=C> z[V,TT];
vector<lower=0>[C] alpha_w;
real<lower=0> sd_prior_mu_prime; //mu’~N(0,sd_prior_mu_prime^2)
real<lower=0> shape_gamma_prime;
real<lower=0> scale_gamma_prime;
}
parameters {
simplex[C] w[TT];
vector[C] mu_prime[TT];
vector<lower=0>[C] sigma_prime_sq[TT];
real tan_half[TT
(TT-1)/2]; //tan(theta[i,j]): i=1,…,TT-1, j=i+1,…,TT
}
transformed parameters {
vector[C] mu[TT];
vector[C] mu_prime_prime[TT];
vector<lower=0>[C] sigma[TT];
real sd1;
matrix[TT,TT] U;

for(q in 1:TT){
mu_prime_prime[q]=mu_prime[q]-sum(w[q] .* mu_prime[q]);
sd1=sqrt(sum(w[q] .* (mu_prime_prime[q] .* mu_prime_prime[q]+sigma_prime_sq[q])));
mu[q]=mu_prime_prime[q]/sd1;
sigma[q]=sqrt(sigma_prime_sq[q])/sd1;
}
U=tan2U(tan_half,TT);
}
model {
matrix[TT,V] mu_z;
matrix[TT,V] sigma_z;
matrix[TT,V] sqrt1;

for(q in 1:TT){
w[q] ~ dirichlet(alpha_w);
mu_prime[q] ~ normal(0,sd_prior_mu_prime);
sigma_prime_sq[q] ~ inv_gamma(shape_gamma_prime,scale_gamma_prime);
z[,q] ~ categorical(w[q]);

mu_z[q,]=to_row_vector(mu[q][z[,q]]);
sigma_z[q,]=to_row_vector(sigma[q][z[,q]]);

}
sqrt1=(UX-mu_z) ./ sigma_z;
target += -sum(log(sigma_z))-sum(sqrt1 .
sqrt1)/2;
}