Help with joint hierarchical logistic regression model

I am working on data where I have multiple environmental samples taken at each of several timepoints at each of several sites. The samples are a binary outcome (pos/neg). In my actual data (unlike the sim below) the number of samples taken at each site-timepoint, vary greatly. In addition, the proportion of positive samples varies greatly by both timepoint within site and among sites.
For each site-timepoint, I have a single binary outcome that is a disease case at the site (pos/neg). Thus my case dataset is the same length as the number of unique site-timepoint combinations in the environmental dataset.
I would like to know how the proportion of positive environmental samples by site-timepoint, predicts a disease case.
I was advised elsewhere on the forum that a joint model is what I should try.
I have coded a simulation of the data that I described above and a model in rstan. All of this is below in a fully reproducible example. I used brms to makes some Stan code for the hierarchical logistic regression and then modified it heavily.

library(rstan)

#data
N_1 <- 50 #number of sites
N_t <- 10 #number of timepoints per site
N_2 <- N_t*N_1 #total number of timepoint by site combinations
N_3 <- 30 #number of observations per timepoint within sites
N <- N_2*N_3 #number of observations in model 1

J_1 <- rep(1:N_1, each=(N/N_1)) #site id model 1
J_2 <- rep(1:N_t, each=((N/N_1)/N_t), times=N_1) #timepoint id (to use with brms if needed)
J_2s <- rep(1:N_2, each=N_3) #timepoint id for sim (this is an id for all site by timepoint combinations)

a <- rep(-2, times=N) #model 1 intercept
a_1 <- rnorm(N_1, 0, 1.5) #varying intercepts for site
a_2 <- rnorm(N_2, 0, 0.75) #varying intercepts for timepoint within site

p <- plogis(a + a_1[J_1] + a_2[J_2s]) #model 1
Y <- rbinom(N, 1, p=p) #model 1

K_1 <- rep(1:N_1, each=N_t) #site id model 2
K_2 <- 1:N_2

a_b <- rep(-3, times=N_2) #model 2 intercept
b <- rep(3, times=N_2) #model 2 effect of site-timepoint proportion on outcome
p_c <- plogis(a_b + (plogis(a[1:N_2] + a_1[K_1] + a_2[K_2]))*b) #model 2 
y_c <- rbinom(N_2, 1, p=p_c) #model 2


#stan data
d1 <- list(N = N, #total number of observations in the model 1 data
           Y = Y, #binary response for model 1
           N_1 = N_1, #number of sites
           J_1 = J_1, #site id for model 1
           N_2 = N_2, #number of site by timepoint combinations for model 1 data
           J_2 = J_2, #site by timepoint combination id for model 1 data
           y_c = y_c, #binary response for model 2
           K_1 = K_1, #site id for model 2
           K_2 = K_2, #site by timepoint combination id for model 2 data
           M_1 = 1,
           M_2 = 1)

#stan code
stan_code1 <- "
data {
  int<lower=1> N;  // total number of observations
  int Y[N];  // response variable for model 1
  // data for group-level effects of ID 1
  int<lower=1> N_1;  // number of grouping levels
  int<lower=1> J_1[N];  // grouping indicator per observation
  // data for group-level effects of ID 2
  int<lower=1> N_2;  // number of grouping levels (note, same length as model 2 dataset)
  int<lower=1> J_2[N];  // grouping indicator per observation
  // model 2 data
  int y_c[N_2]; // model 2 outcome
  int<lower=1> K_1[N_2];  // grouping indicator per observation, site level id for model 2
  int<lower=1> K_2[N_2];  // grouping indicator per observation, site-timepoint id for model 2 (note, K_2 same length as N_2)
  int<lower=1> M_1;
  int<lower=1> M_2;
}
parameters {
  real a;  // intercept for model 1
  vector<lower=0>[M_1] sd_1;  // group-level standard deviations
  vector[N_1] z_1[M_1];  // standardized group-level effects
  vector<lower=0>[M_2] sd_2;  // group-level standard deviations
  vector[N_2] z_2[M_2];  // standardized group-level effects
  real a_b; // intercept for case model
  real b; // effect of proportion of site-timepoint on model 2 outcome
}
transformed parameters {
  vector[N_1] r_1_1;  // actual group-level effects
  vector[N_2] r_2_1;  // actual group-level effects
  r_1_1 = (sd_1[1] * (z_1[1]));
  r_2_1 = (sd_2[1] * (z_2[1]));
}
model {
    for (n in 1:N) {
      Y[n] ~ bernoulli_logit(a + r_1_1[J_1[n]] + r_2_1[J_2[n]]);
    }
    for (n_2 in 1:N_2) {
      y_c[n_2] ~ bernoulli_logit(a_b + (inv_logit(a + r_1_1[K_1[n_2]] + r_2_1[K_2[n_2]]))*b); 
    }
  // priors including constants
  target += normal_lpdf(a | 0, 5);
  target += normal_lpdf(sd_1 | 0, 2.5)
    - 1 * normal_lccdf(0 | 0, 2.5);
  target += std_normal_lpdf(z_1[1]);
  target += normal_lpdf(sd_2 | 0, 2.5)
    - 1 * normal_lccdf(0 | 0, 2.5);
  target += std_normal_lpdf(z_2[1]);
  target += normal_lpdf(a_b | -5, 2.5);
  target += normal_lpdf(b | 0, 2.5);
}
generated quantities {
  real b_or;
  b_or = exp(b);
}
"

#fit model
fit1 <- stan(model_code = stan_code1, data = d1,
             chains = 4, iter = 2000, warmup = 1000, 
             thin = 1, cores = 4)

print(fit1, pars=c("sd_1", "sd_2", "a", "a_b", "b", "b_or")

It seems to do sorta ok most of the time…(I have tried sims with many more observations that take too long to post the code for here)…but sometimes not. Model 1 seems to always work fine, but Model 2 (the model where the site-timepoint proportions enter in as predictors) doesn’t seem as good.

So, I am a bit skeptical that I have programmed this correctly. Could someone please help?

Thanks!

Edit: I made a mistake in the data setup, which is now corrected. The model seems to recover parameters decently well. Any thoughts on the Stan model code, especially the joint part are appreciated.