I wrote a two-component mixture model for estimating the true effect sizes underlying GWAS summary statistics for phenotypes with genetic correlation (Stan code pasted at the end). The model works fine on a small dataset ($N \approx 200$, $M = 2$) and takes around 20 minutes to run 100 warmup + 1000 sampling iterations for 4 chains in parallel. However, on a larger dataset ($N \approx 3 \times 10^5$, $M = 2$) it has taken around 5 days to produce 500 draws and still hasn't converged (most R_hat values are around 1.1 and N_Eff around 10; the sampler diagnostics from stansummary are also pasted below).
name,Mean,MCSE,StdDev,5%,50%,95%,N_Eff,N_Eff/s,R_hat
"lp__",-4.01751e+07,1.96137e+07,9.6404e+07,-2.89641e+08,-1.43121e+06,756479,24.1586,inf,1.01487
"accept_stat__",0.999343,0.000218408,0.00346515,0.998102,0.999955,1,251.712,inf,0.99805
"stepsize__",3.48344e-05,2.06904e-06,4.04532e-06,3.21151e-05,3.2273e-05,4.23705e-05,3.82268,inf,1.57626
"treedepth__",9.52953,0.231384,1.07193,7,10,10,21.4616,inf,1.01066
"n_leapfrog__",874.465,71.2052,320.677,127,1023,1023,20.2821,inf,1.01249
"divergent__",0,0,0,0,0,0,508,inf,-nan
"energy__",4.32656e+07,2.07136e+07,1.0122e+08,-417360,1.91271e+06,3.02495e+08,23.8792,inf,1.01533
Another thing that doesn't work on the larger dataset is the generated quantity gamma, the posterior probability of each observation belonging to each mixture component: most of its values come out as -nan (a log-scale variant I have been considering is sketched after the model code below). I would appreciate any suggestions on how to debug the model and speed it up!
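To spell out what the code below implements: for each locus $n$,

$$
B_n \mid \beta_n \sim \mathcal{N}\!\big(\beta_n,\ \operatorname{diag}(SE_n)\,\Omega_e\,\operatorname{diag}(SE_n)\big),
\qquad
\beta_n \sim \sum_{k=1}^{K} \pi_k\, \mathcal{N}\!\big(0,\ \operatorname{diag}(\tau_k)\,\Omega_k\,\operatorname{diag}(\tau_k)\big),
$$

where $\mathcal{N}$ is the $M$-dimensional multivariate normal, $\Omega_e$ is the error correlation matrix (Cholesky factor L_e), the null component uses the tight scale tau_0 with correlation L_b, and the non-null component uses the diffuse scale tau_1 with correlation L_g; all correlation matrices get LKJ(2) priors and $\pi$ a uniform Dirichlet.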
data {
    int<lower=0> N;                 // number of loci
    int<lower=1> M;                // number of phenotypes
    matrix[N, M] B;                  // observed effect sizes
    matrix[N, M] SE;                // standard errors
    int<lower=1> K;                 // number of mixture components (K = 2 for this model)
}
transformed data{
    vector[M] zeros;
    vector[K] ones; 
    vector[M] SE_vec[N];
    
    zeros = rep_vector(0, M);
    ones = rep_vector(1, K);
    for (n in 1:N) {
        SE_vec[n] = to_vector(SE[n]);
    }
}
parameters {
    simplex[K] pi;                              // mixing proportions
    cholesky_factor_corr[M] L_e;     // cholesky factor of correlation matrix of errors
    cholesky_factor_corr[M] L_b;     // cholesky factor of correlation matrix of bias
    cholesky_factor_corr[M] L_g;     // cholesky factor of correlation matrix of effect
    vector[M] beta_vec[N];               // true effect size
    vector<lower=0>[M] tau_0;        // standard deviation of each phenotype (null component)
    vector<lower=0>[M] tau_1;        // standard deviation of each phenotype (nonnull component)
}
transformed parameters{
    matrix[M, M] Sigmas[K];      // Cholesky factors of the component covariance matrices
    
    Sigmas[1] = diag_pre_multiply(tau_0, L_b);
    Sigmas[2] = diag_pre_multiply(tau_1, L_g);
}
model {
    real betaloglik[N, K];
    
    tau_0 ~ double_exponential(0, 1e-3); 
    tau_1 ~ cauchy(0, 2.5);                      
    L_e ~ lkj_corr_cholesky(2.0);
    L_b ~ lkj_corr_cholesky(2.0);
    L_g ~ lkj_corr_cholesky(2.0);
    pi ~ dirichlet(ones);
    
    for (n in 1:N) {
        for (k in 1:K) {
            betaloglik[n, k] = log(pi[k])
                + multi_normal_cholesky_lpdf(B[n] | beta_vec[n], diag_pre_multiply(SE_vec[n], L_e))
                + multi_normal_cholesky_lpdf(beta_vec[n] | zeros, Sigmas[k]);
        }
        target += log_sum_exp(betaloglik[n]);
    }
}
generated quantities {
    matrix[M,M] cor_e;
    matrix[M,M] cor_b;
    matrix[M,M] cor_g;
    matrix[N,K] gamma; // probability of being in each mixture component
    
    cor_e = multiply_lower_tri_self_transpose(L_e);
    cor_b = multiply_lower_tri_self_transpose(L_b);
    cor_g = multiply_lower_tri_self_transpose(L_g);
    
    for (n in 1:N) {
        row_vector[K] gamma_unnormalized;
        for (k in 1:K) {
            gamma_unnormalized[k] = pi[k]
                * exp(multi_normal_cholesky_lpdf(B[n] | beta_vec[n], diag_pre_multiply(SE_vec[n], L_e))
                      + multi_normal_cholesky_lpdf(beta_vec[n] | zeros, Sigmas[k]));
        }
        gamma[n] = gamma_unnormalized/sum(gamma_unnormalized);
    }
}
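For what it's worth, my best guess about the -nan values is that exp() of the per-locus log densities under- or overflows (with tau_0 on the order of 1e-3, the null-component density is often astronomically small or large), so the normalization ends up as 0/0 or inf/inf. A log-scale version of the gamma computation that I have been considering, but have not run at scale, would normalize with softmax instead of exponentiating the raw log densities:

generated quantities {
    matrix[M, M] cor_e = multiply_lower_tri_self_transpose(L_e);
    matrix[M, M] cor_b = multiply_lower_tri_self_transpose(L_b);
    matrix[M, M] cor_g = multiply_lower_tri_self_transpose(L_g);
    matrix[N, K] gamma;     // posterior membership probabilities

    for (n in 1:N) {
        vector[K] lg;       // log(pi[k]) + log density of component k
        for (k in 1:K) {
            lg[k] = log(pi[k])
                + multi_normal_cholesky_lpdf(B[n] | beta_vec[n], diag_pre_multiply(SE_vec[n], L_e))
                + multi_normal_cholesky_lpdf(beta_vec[n] | zeros, Sigmas[k]);
        }
        gamma[n] = to_row_vector(softmax(lg));   // = exp(lg - log_sum_exp(lg)), stays finite
    }
}

Since the first lpdf term doesn't depend on k, it cancels in the softmax and could be dropped here (and hoisted out of the inner loop in the model block), saving one multi_normal_cholesky_lpdf evaluation per locus per component.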