Reparamaterize conditional autoregressive model of occupancy to avoid low E-BFMI warning?

Hello,
I am trying to fit an occupancy model (based on @mbjoseph’s gist) that includes a Conditional Autoregressive component (based on @mbjoseph’s sparse CAR implementation case study). Sampling results in the “low E-BFMI” warning and n_eff values less than 10 for tau (the precision parameter in the CAR component) and lp_ when fitting to simulated data. If I use a strongly informative prior (e.g., a normal distribution with mu= the true value and sd = 0.1), I can get the model to sample and it returns the correct regression parameter estimates. If I use a new value for tau the model still samples, but returns an incorrect value for tau and the parameter estimates for the regression parameters are badly biased. I would like to fit these models to actual data wherein I will not know the true value of tau, and so likely(?) need to reparamaterize the CAR component of the model. Any assistance would be most appreciated.


Generate simulated data

# Load packages -----------------------------------------------------------

library(tidyverse);
library(spdep);
library(rgdal)
library(rstan);
library(sf);
library(tigris);

options(mc.cores = 3);
rstan_options(auto_write = TRUE)

options(tigris_use_cache = TRUE)
prj <- "+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"

##Function to simulate spatial autocorrelation in the data
gen_CAR <- function(lyr, a, tau){
  lyr.ord <- lyr[order(lyr$GEOID),] %>% as(., "Spatial"); #make sure that layer is ordered by GEOID
  lyr.nb <- poly2nb(lyr.ord, row.names = lyr.ord$GEOID); #estimate neighbors could use st_touches on sf, but slower
  lyr.W <- nb2mat(lyr.nb, style="B"); #convert neighborhood list into matrix
  lyr.D <- diag(rowSums(lyr.W)); # get the diagonal
  prec.mat <- tau*(lyr.D - a*lyr.W); #estimate precision matrix
  cov.mat <- solve(prec.mat); #invert precision matrix necesarry because tau is the reciprocal of variance
  set.seed(1234)
  a.lyr.CAR <-  MASS::mvrnorm(1, rep(0.5, nrow(cov.mat)), cov.mat) #generate from normal distr with spatial cov matrix
  lyr.CAR <- lyr.ord %>% as(.,"sf") %>% cbind(., a.lyr.CAR) #bind to original spatial data frame
  return(list(lyr.W, lyr.D, cov.mat, lyr.CAR)) 
}


# Get a dummy geography ---------------------------------------------------

st <- "IA"

geog.tct <- tracts(st) %>% as(., "sf") %>% st_transform(., crs= prj)
geog.bg <- block_groups(st) %>% as(., "sf") %>% st_transform(., crs= prj)

tct.car <- gen_CAR(geog.tct, 0.8, 2)

tct.df <- tct.car[[4]]


# Simulate true occupancy states ------------------------------------------
# define a design matrix for site-level occupancy
n_site <- nrow(tct.df)
m_psi <- 3    # m_psi is the number of columns in the site level design matrix
X_psi <- matrix(c(rep(1, n_site), rnorm((m_psi - 1) * n_site)), 
                ncol = m_psi
stopifnot(ncol(X_psi) >= 1)

# get occupancy probabilities
beta_psi <- rnorm(m_psi)
psi <- plogis(X_psi %*% beta_psi + as.vector(tct.df$a.lyr.CAR))
z <- rbinom(n_site, size = 1, prob = psi)

tct.df <- cbind(tct.df, z)

# Simulate imperfect detection/nondetection data --------------------------

# determine number of surveys per site
survey_summary <- geog.bg %>% 
  group_by(., STATEFP, COUNTYFP, TRACTCE) %>% 
  summarise(count = n()) #NOTE: This orders the data; fine if CAR function also orders, but could screw things up

n_survey <- as.vector(survey_summary$count)
total_surveys <- nrow(geog.bg)

# define a survey-level design matrix for detection probabilities
m_p <- 2    # m_p is the number of survey-level covariates
beta_p <- rnorm(m_p)
X_p <- matrix(c(rep(1, total_surveys), rnorm((m_p - 1) * total_surveys)), 
              ncol = m_p)
stopifnot(ncol(X_p) >= 1)
p <- plogis(X_p %*% beta_p)

# simulate observations
survey_df <- tibble(site = rep(1:n_site, n_survey),
                    siteID = rep(tct.df$GEOID, n_survey)) %>%
  mutate(y = rbinom(n = total_surveys, size = 1, prob = z[site] * p))

# get start and end indices to extract slices of y for each site
start_idx <- rep(0, n_site)
end_idx <- rep(0, n_site)
for (i in 1:n_site) {
  if (n_survey[i] > 0) {
    site_indices <- which(survey_df$site == i)
    start_idx[i] <- site_indices[1]
    end_idx[i] <- site_indices[n_survey[i]]
  }
}

# create vector of whether any positive observations were seen at each site
any_seen <- rep(0, n_site)
for (i in 1:n_site) {
  if (n_survey[i] > 0) {
    any_seen[i] <- max(survey_df$y[start_idx[i]:end_idx[i]])
  }
}

# Bundle data for Stan ----------------------------------------------------
stan_d <- list(n_site = n_site, 
               m_psi = m_psi,
               X_psi = X_psi,
               total_surveys = total_surveys, 
               m_p = m_p, 
               X_p = X_p, 
               site = survey_df$site, 
               y = survey_df$y, 
               start_idx = start_idx, 
               end_idx = end_idx, 
               any_seen = any_seen, 
               n_survey = n_survey,
               W_tct = tct.car[[1]],
               W_n = sum(tct.car[[1]])/2)

Stan Model

functions {
  /**
  * Return the log probability of a proper conditional autoregressive (CAR) prior 
  * with a sparse representation for the adjacency matrix
  *
  * @param phi Vector containing the parameters with a CAR prior
  * @param tau Precision parameter for the CAR prior (real)
  * @param alpha Dependence (usually spatial) parameter for the CAR prior (real)
  * @param W_sparse Sparse representation of adjacency matrix (int array)
  * @param n Length of phi (int)
  * @param W_n Number of adjacent pairs (int)
  * @param D_sparse Number of neighbors for each location (vector)
  * @param lambda Eigenvalues of D^{-1/2}*W*D^{-1/2} (vector)
  *
  * @return Log probability density of CAR prior up to additive constant
  */
  real sparse_car_lpdf(vector phi, real tau, real alpha, 
    int[,] W_sparse, vector D_sparse, vector lambda, int n_site, int W_n) {
      row_vector[n_site] phit_D; // phi' * D
      row_vector[n_site] phit_W; // phi' * W
      vector[n_site] ldet_terms;
    
      phit_D = (phi .* D_sparse)';
      phit_W = rep_row_vector(0, n_site);
      for (i in 1:W_n) {
        phit_W[W_sparse[i, 1]] = phit_W[W_sparse[i, 1]] + phi[W_sparse[i, 2]];
        phit_W[W_sparse[i, 2]] = phit_W[W_sparse[i, 2]] + phi[W_sparse[i, 1]];
      }
    
      for (i in 1:n_site) ldet_terms[i] = log1m(alpha * lambda[i]);
      return 0.5 * (n_site * log(tau)
                    + sum(ldet_terms)
                    - tau * (phit_D * phi - alpha * (phit_W * phi)));
  }
}
data {
  // site-level occupancy covariates
  int<lower = 1> n_site;
  int<lower = 1> m_psi;
  matrix[n_site, m_psi] X_psi;
  
  // survey-level detection covariates
  int<lower = 1> total_surveys;
  int<lower = 1> m_p;
  matrix[total_surveys, m_p] X_p;

  // survey level information  
  int<lower = 1, upper = n_site> site[total_surveys];
  int<lower = 0, upper = 1> y[total_surveys];
  int<lower = 0, upper = total_surveys> start_idx[n_site];
  int<lower = 0, upper = total_surveys> end_idx[n_site];
  
  // summary of whether species is known to be present at each site
  int<lower = 0, upper = 1> any_seen[n_site];
  
  // number of surveys at each site
  int<lower = 0> n_survey[n_site];
  
  //adjacency data
  matrix<lower = 0, upper = 1>[n_site, n_site] W_tct; //adjacency matrix tract
  int W_n; //number of adjacent pairs
  //real<lower = 0> tau;
}
transformed data {
  int W_sparse[W_n, 2];   // adjacency pairs
  vector[n_site] D_sparse;     // diagonal of D (number of neigbors for each site)
  vector[n_site] lambda;       // eigenvalues of invsqrtD * W * invsqrtD
  
  { // generate sparse representation for W
  int counter;
  counter = 1;
  // loop over upper triangular part of W to identify neighbor pairs
    for (i in 1:(n_site - 1)) {
      for (j in (i + 1):n_site) {
        if (W_tct[i, j] == 1) {
          W_sparse[counter, 1] = i;
          W_sparse[counter, 2] = j;
          counter = counter + 1;
        }
      }
    }
  }
  for (i in 1:n_site) D_sparse[i] = sum(W_tct[i]);
  {
    vector[n_site] invsqrtD;  
    for (i in 1:n_site) {
      invsqrtD[i] = 1 / sqrt(D_sparse[i]);
    }
    lambda = eigenvalues_sym(quad_form(W_tct, diag_matrix(invsqrtD)));
  }
}

parameters {
  vector[n_site] phi;
 
  //real<lower = 0, upper = 1> alpha;
  vector[m_psi] beta_psi;
  vector[m_p] beta_p;
  real<lower = 0, upper =1> alpha_tilde;
  real<lower = 0> sigma;
}
transformed parameters {
  vector[total_surveys] logit_p = X_p * beta_p;
  vector[n_site] logit_psi = X_psi * beta_psi + phi;
  real<lower = 0, upper =1> alpha = (1+alpha_tilde)/2;
   real<lower = 0> tau = 1/sigma;
}
model {
  vector[n_site] log_psi = log_inv_logit(logit_psi);
  vector[n_site] log1m_psi = log1m_inv_logit(logit_psi);

  phi ~ sparse_car(tau, alpha, W_sparse, D_sparse, lambda, n_site, W_n);
  alpha_tilde ~ beta(2, 2);
  sigma ~ normal(0.1,1);
  beta_psi ~ normal(0, 1);
  beta_p ~ normal(0, 1);
  
  for (i in 1:n_site) {
    if (n_survey[i] > 0) {
      if (any_seen[i]) {
        // site is occupied
        target += log_psi[i] 
                  + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] | 
                                         logit_p[start_idx[i]:end_idx[i]]);
      } else {
        // site may or may not be occupied
        target += log_sum_exp(
          log_psi[i] + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] |
                                            logit_p[start_idx[i]:end_idx[i]]), 
          log1m_psi[i]
        );
      }
    }
  }
}

I have tried sampling tau directly (rather than as the reciprocal of sigma), but that doesn’t seem to help with the problem.

Thanks Ben - Did you point me to this because of @anon75146577 's comment?

There’s a subtle error in the ICAR part of the example: the dimension of the support of the prior is (n-1) rather than n and so the term corresponding to \tau in the log density should be (n-1)\log(\tau).

Which I think corresponds to this part of my code:

for (i in 1:n_site) ldet_terms[i] = log1m(alpha * lambda[i]);
      return 0.5 * (n_site * log(tau)
                    + sum(ldet_terms)
                    - tau * (phit_D * phi - alpha * (phit_W * phi)));

Which should be edited to:

for (i in 1:n_site) ldet_terms[i] = log1m(alpha * lambda[i]);
      return 0.5 * ((n_site - 1) * log(tau)
                    + sum(ldet_terms)
                    - tau * (phit_D * phi - alpha * (phit_W * phi)));

Or is there something else in the post that I’m not seeing?

I am always referring to comments by @anon75146577 but in this case, I was also referring to my own.

Sorry, just saw this part:

For the Scottish Lip Cancer example, it is possible to get an effective sample size that is much higher by post-multiplying a row-vector of parameters (that have a standard normal prior) by the inverse of the Cholesky factor of the precision matrix to form phi (transposed) in the transformed parameters block.

which, I think is where you are pointing me?

Any advice on how to implement that in my code? Does it need to be done within the function for the CAR prior or still within the transformed parameters block?

If you can create the precision matrix, you can create the covariance matrix, obtain its Cholesky factor, and use that in the non-centered parameterization of the coefficients. Or if you work out the math, you can get the Cholesky factor of the precision matrix and then do a triangular solve to get the coefficients.

Thanks - might take me a bit to figure that out (trying to re-learn matrix operations). I’ll report back once I’ve given it a try

May also be worth exploring Andrew’s suggestion here to implement a unit CAR prior, and then multiply \phi by a scale parameter:

functions {
  /**
  * Return log probability of a unit-scale proper conditional autoregressive 
  * (CAR) prior with a sparse representation for the adjacency matrix
  *
  * @param phi Vector containing the parameters with a CAR prior
  * @param alpha Dependence (usually spatial) parameter for the CAR prior (real)
  * @param W_sparse Sparse representation of adjacency matrix (int array)
  * @param n Length of phi (int)
  * @param W_n Number of adjacent pairs (int)
  * @param D_sparse Number of neighbors for each location (vector)
  * @param lambda Eigenvalues of D^{-1/2}*W*D^{-1/2} (vector)
  *
  * @return Log probability density of CAR prior up to additive constant
  */
  real sparse_car_lpdf(vector phi, real alpha, 
    int[,] W_sparse, vector D_sparse, vector lambda, int n_site, int W_n) {
      row_vector[n_site] phit_D; // phi' * D
      row_vector[n_site] phit_W; // phi' * W
      vector[n_site] ldet_terms;
    
      phit_D = (phi .* D_sparse)';
      phit_W = rep_row_vector(0, n_site);
      for (i in 1:W_n) {
        phit_W[W_sparse[i, 1]] += phi[W_sparse[i, 2]];
        phit_W[W_sparse[i, 2]] += phi[W_sparse[i, 1]];
      }
    
      for (i in 1:n_site) ldet_terms[i] = log1m(alpha * lambda[i]);
      return 0.5 * (sum(ldet_terms)
                    - (phit_D * phi - alpha * (phit_W * phi)));
  }
}
data {
  // site-level occupancy covariates
  int<lower = 1> n_site;
  int<lower = 1> m_psi;
  matrix[n_site, m_psi] X_psi;
  
  // survey-level detection covariates
  int<lower = 1> total_surveys;
  int<lower = 1> m_p;
  matrix[total_surveys, m_p] X_p;

  // survey level information  
  int<lower = 1, upper = n_site> site[total_surveys];
  int<lower = 0, upper = 1> y[total_surveys];
  int<lower = 0, upper = total_surveys> start_idx[n_site];
  int<lower = 0, upper = total_surveys> end_idx[n_site];
  
  // summary of whether species is known to be present at each site
  int<lower = 0, upper = 1> any_seen[n_site];
  
  // number of surveys at each site
  int<lower = 0> n_survey[n_site];
  
  //adjacency data
  matrix<lower = 0, upper = 1>[n_site, n_site] W_tct; //adjacency matrix tract
  int W_n; //number of adjacent pairs
}
transformed data {
  int W_sparse[W_n, 2];   // adjacency pairs
  vector[n_site] D_sparse;     // diagonal of D (number of neigbors for each site)
  vector[n_site] lambda;       // eigenvalues of invsqrtD * W * invsqrtD
  
  { // generate sparse representation for W
  int counter;
  counter = 1;
  // loop over upper triangular part of W to identify neighbor pairs
    for (i in 1:(n_site - 1)) {
      for (j in (i + 1):n_site) {
        if (W_tct[i, j] == 1) {
          W_sparse[counter, 1] = i;
          W_sparse[counter, 2] = j;
          counter += 1;
        }
      }
    }
  }
  for (i in 1:n_site) D_sparse[i] = sum(W_tct[i]);
  {
    vector[n_site] invsqrtD;  
    for (i in 1:n_site) {
      invsqrtD[i] = 1 / sqrt(D_sparse[i]);
    }
    lambda = eigenvalues_sym(quad_form(W_tct, diag_matrix(invsqrtD)));
  }
}

parameters {
  vector[n_site] phi;
  vector[m_psi] beta_psi;
  vector[m_p] beta_p;
  real<lower = 0, upper =1> alpha;
  real<lower = 0> sigma;
}
transformed parameters {
  vector[total_surveys] logit_p = X_p * beta_p;
  vector[n_site] logit_psi = X_psi * beta_psi + phi * sigma;
}
model {
  vector[n_site] log_psi = log_inv_logit(logit_psi);
  vector[n_site] log1m_psi = log1m_inv_logit(logit_psi);

  phi ~ sparse_car(alpha, W_sparse, D_sparse, lambda, n_site, W_n);
  sigma ~ normal(0, 1);
  beta_psi ~ normal(0, 1);
  beta_p ~ normal(0, 1);
  
  for (i in 1:n_site) {
    if (n_survey[i] > 0) {
      if (any_seen[i]) {
        // site is occupied
        target += log_psi[i] 
                  + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] | 
                                         logit_p[start_idx[i]:end_idx[i]]);
      } else {
        // site may or may not be occupied
        target += log_sum_exp(
          log_psi[i] + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] |
                                            logit_p[start_idx[i]:end_idx[i]]), 
          log1m_psi[i]
        );
      }
    }
  }
}
1 Like

Thanks @mbjoseph - this does seem to help overcome the problem, but requires a bit of tuning of the priors to return reasonable estimates of the regression parameters. I’ll post where I land on that for completeness. I’m still working on figuring out how to actually implement @bgoodri 's suggestion and will post that as well (if I can get it to work)