I have this simulation study for incorporating the benchmarking and inequality constraints but the results of my use for the model based estimates shows a higher MSE than the direct estimates which is problematic. It looks like the model is doing quite a bit worse than the direct estimate.I need help in figuring out why that is the case and fix it.
# ==========================================================
# LIBRARIES
# ==========================================================
library(rstan)
library(ggplot2)
# ==========================================================
# SIMULATING THE DATA (Adjusted to Match Nandram's Paper)
# ==========================================================
set.seed(123)
# Define the Agricultural Statistics District (ASD) sizes
asdSizes <- c(12, 11, 9, 11, 7, 13, 15, 12, 12)
numASDs <- length(asdSizes)
L <- sum(asdSizes) # Total number of counties (L = 102)
asdIndex <- rep(seq_len(numASDs), times = asdSizes)
# "True" parameters for theta: global mean plus ASD and county effects
globalMean <- 50
tauASD <- 30
tauCounty <- 10
asdEffect <- rnorm(numASDs, mean = 0, sd = tauASD)
theta_true <- numeric(L)
for(i in seq_len(L)) {
theta_true[i] <- globalMean +
asdEffect[ asdIndex[i] ] +
rnorm(1, mean = 0, sd = tauCounty)
}
# Simulate sample sizes (n_i) and county-level coefficients of variation (CV)
n_i <- sample(2:74, size = L, replace = TRUE)
CV <- runif(L, min = 0.08, max = 0.93)
# Simulate survey samples for each county.
# For each county i:
# - Use a standard deviation based on the true value: CV[i] * theta_true[i]
# - Compute the direct estimate as the sample mean (theta_hat[i])
# - Compute the sample variance S2[i]
# - Compute the survey standard error as: sigma_hat[i] = CV[i] * theta_hat[i]
theta_hat <- numeric(L)
S2 <- numeric(L)
sigma_hat <- numeric(L) # Computed survey standard error
for(i in seq_len(L)) {
sample_i <- rnorm(n_i[i], mean = theta_true[i], sd = CV[i] * abs(theta_true[i]))
theta_hat[i] <- mean(sample_i)
S2[i] <- var(sample_i)
sigma_hat[i] <- CV[i] * theta_hat[i] # Standard error per Nandram's approach
}
# Generate Administrative (FSA) values: lower bounds c_i.
# The formulation is: c_i = theta_hat * (1 + U), where U ~ Uniform(-s, s)
sVal <- 0.10
U <- runif(L, min = -sVal, max = sVal)
c_i <- theta_hat * (1 + U)
# Set the benchmarking target.
# Following Nandram's approach, choose 0 < d < 1 so that a = sum(c_i)/d > sum(c_i).
# (Here, dVal = 0.95 is used; you might try 0.99 to more closely match some scenarios.)
dVal <- 0.95
a <- sum(c_i) / dVal
# Package the simulated data in a data frame (for diagnostic/plotting purposes)
simData <- data.frame(
County = seq_len(L),
ASD = asdIndex,
n_i = n_i,
theta_hat = theta_hat,
S2 = S2,
sigma_hat = sigma_hat,
c_i = c_i,
theta_true = theta_true
)
cat("Check sum(c_i):", sum(c_i), "\n")
cat("Benchmarking target a:", a, "\n")
cat("Check sum(theta_true) < a?:", sum(theta_true) < a, "\n")
# ==========================================================
# PREPARE THE DATA LIST FOR STAN
# ==========================================================
pBeta <- 1
X_beta <- matrix(1, nrow = L, ncol = pBeta)
pGamma <- 1
X_gamma <- matrix(1, nrow = L, ncol = pGamma)
stanData <- list(
L = L,
c = c_i,
a = a,
n = n_i,
theta_hat = theta_hat,
S2 = S2,
pBeta = pBeta,
X_beta = X_beta,
pGamma = pGamma,
X_gamma = X_gamma
)
# ==========================================================
# STAN MODEL CODE (Double Shrinkage Model with Inequality Constraints)
# ==========================================================
stanModelCode <- "
data {
int<lower=1> L; // number of areas
vector[L] c; // lower bounds: each theta[i] >= c[i]
real<lower=sum(c)> a; // benchmarking target
// Survey data
int<lower=2> n[L];
vector[L] theta_hat;
vector[L] S2;
// Mean-model design
int<lower=1> pBeta;
matrix[L, pBeta] X_beta;
// Variance-model design
int<lower=1> pGamma;
matrix[L, pGamma] X_gamma;
}
parameters {
simplex[L+1] alpha_plus; // Only first L used for theta
//----------------------------------------------------------------
// Hierarchical parameters for the double-shrinkage model
//----------------------------------------------------------------
vector[pBeta] beta; // mean-model coefficients
real<lower=0> delta; // scale for prior on theta
vector[pGamma] gamma; // log-precision model coefficients
real<lower=0> alpha_tau; // shape for tau
vector<lower=0>[L] tau; // area-level precision
}
transformed parameters {
vector[L] theta;
// Compute theta in one vectorized expression
theta = c + (a - sum(c)) * alpha_plus[1:L];
}
model {
//----------------------------------------------------------------
// Priors on the hierarchical parameters
//----------------------------------------------------------------
beta ~ normal(0, 5);
delta ~ cauchy(0, 5);
gamma ~ normal(0, 5);
alpha_tau ~ gamma(10, 1);
// Area-level precision, vectorized version:
// Note: Ensure the sign in the exponent is as intended.
tau ~ gamma(alpha_tau / 2, 0.5 * alpha_tau * exp(-X_gamma * gamma));
//----------------------------------------------------------------
// Prior that shrinks theta around X_beta * beta with scale delta,
// using vectorized likelihood:
//----------------------------------------------------------------
theta ~ normal(X_beta * beta, delta);
//----------------------------------------------------------------
// Double-shrinkage likelihood for theta_hat and S2
//----------------------------------------------------------------
theta_hat ~ normal(theta, inv_sqrt(tau));
// Likelihood for S2 is left in a loop since the shape parameter is
// area-specific. (You might explore vectorizing this if possible.)
for (i in 1:L) {
real shape_i = 0.5 * (n[i] - 1);
real rate_i = 0.5;
target += gamma_lpdf((n[i]-1) * S2[i] * tau[i] | shape_i, rate_i);
}
}
generated quantities {
vector[L] theta_out;
real sum_theta;
// Re-use the vectorized computation for theta
theta_out = c + (a - sum(c)) * alpha_plus[1:L];
sum_theta = sum(theta_out);
}
"
# ==========================================================
# COMPILE AND FIT THE MODEL USING STAN
# ==========================================================
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
stanModel <- stan_model(model_code = stanModelCode)
fit <- sampling(
object = stanModel,
data = stanData,
chains = 2,
iter = 2000,
warmup = 1000,
control = list(adapt_delta = 0.95, max_treedepth = 12),
seed = 123
)
# ==========================================================
# RESULTS AND DIAGNOSTICS
# ==========================================================
print(fit, pars = c('beta','delta','gamma','alpha_tau','sum_theta'), probs = c(0.025, 0.975))
# To see a subset of tau[i] estimates:
print(fit, pars = paste0('tau[', 1:30, ']'), probs = c(0.025, 0.975))
# Extract posterior samples for theta_out (model-based estimates)
post <- extract(fit)
theta_draws <- as.matrix(post$theta_out)
M <- nrow(theta_draws)
L <- ncol(theta_draws)
# Verify that the constraints are satisfied in the posterior draws:
any_violations_before <- FALSE
for (m in seq_len(M)) {
if (any(theta_draws[m, ] < simData$c_i)) {
any_violations_before <- TRUE
break
}
if (sum(theta_draws[m, ]) >= a) {
any_violations_before <- TRUE
break
}
}
cat("Any violations of constraints before raking? ", any_violations_before, "\n")
# Raking step: Adjust draws so that sum(theta) equals the target a
theta_raked <- matrix(NA_real_, nrow = M, ncol = L)
for (m in 1:M) {
scale_m <- a / sum(theta_draws[m, ])
theta_raked[m, ] <- scale_m * theta_draws[m, ]
}
# Check if the raked draws satisfy the constraints
any_violations_after <- FALSE
for (m in seq_len(M)) {
if (any(theta_raked[m, ] < simData$c_i)) {
any_violations_after <- TRUE
break
}
if (abs(sum(theta_raked[m, ]) - a) > 1e-8) {
any_violations_after <- TRUE
break
}
}
cat("Any violations after raking? ", any_violations_after, "\n")
# Compute the posterior mean (raked) for each county
posterior_means_raked <- colMeans(theta_raked)
# ==========================================================
# COMPUTE MSE FOR DIFFERENT ESTIMATORS
# ==========================================================
# MSE for Direct Estimates:
mse_direct <- mean((simData$theta_hat - simData$theta_true)^2)
# MSE for Model-Based (Raked) Estimates:
mse_model_based <- mean((posterior_means_raked - simData$theta_true)^2)
# MSE for c_i Estimates (lower bounds):
mse_ci <- mean((simData$c_i - simData$theta_true)^2)
cat("MSE for Direct Estimates:", mse_direct, "\n")
cat("MSE for Model-Based (Raked) Estimates:", mse_model_based, "\n")
cat("MSE for c_i Estimates:", mse_ci, "\n")
# ==========================================================
# PLOTTING THE RESULTS
# ==========================================================
plot_df <- data.frame(
c_i = simData$c_i,
DirectEstimate = simData$theta_hat,
PosteriorMeanRaked = posterior_means_raked,
theta_true = simData$theta_true
)
p1 <- ggplot(plot_df, aes(x = c_i, y = PosteriorMeanRaked)) +
geom_point(color = "blue", size = 2, alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Posterior Mean (Raked) vs c_i",
x = "c_i", y = "Raked Posterior Mean of Theta") +
theme_minimal()
p2 <- ggplot(plot_df, aes(x = c_i, y = DirectEstimate)) +
geom_point(color = "darkgreen", size = 2, alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Direct Estimate (theta_hat) vs c_i",
x = "c_i", y = "Direct Estimate") +
theme_minimal()
print(p1)
print(p2)
# ===============================================================================
# Finding the MSE OF THE DIRECT ESTIMATES, MODELBASED ESTIMATES,C_I ESTIMATES
# ==============================================================================
# ------------------------------
# Compute MSE for different estimates
# ------------------------------
# MSE for Direct Estimates:
mse_direct <- mean((simData$theta_hat - simData$theta_true)^2)
# MSE for Model-Based (Raked) Estimates:
mse_model_based <- mean((posterior_means_raked - simData$theta_true)^2)
# MSE for c_i Estimates:
mse_ci <- mean((simData$c_i - simData$theta_true)^2)
# Print the MSEs:
cat("MSE for Direct Estimates:", mse_direct, "\n")
cat("MSE for Model-Based (Raked) Estimates:", mse_model_based, "\n")
cat("MSE for c_i Estimates:", mse_ci, "\n")