TL;DR is it correct that when modeling data with a beta-binomial, data rows with lower counts will have a smaller effect/influence on the parameter distributions than rows with larger counts?
I’m trying to model voter preference in US house elections using 2 sources of data. One is the election results in each congressional district. I have a total number of votes, the number of those votes cast for a particular candidate and some demographic info about the district, e.g., %of people with college degrees and %of people over 45. I model this with a beta binomial and get reasonable answers. RHats all look good and loo seems pretty happy. Each district has about 300,000 voters and roughly half vote for each candidate.
I also have data from the CCES survey, which gives me data about a few individual voters in each district. I’m trying to add this to the model by counting up subgroups (e.g., people under 45 and with college degrees) and then setting the predictors to 0% or 100% as appropriate. The CCES survey has data for approx 100 people per district.
I fit each data set individually and then both together, and compare the parameter values and 90% credible intervals across all 3 fits.
My intuition is that adding the CCES data might shift the model parameters slightly or narrow uncertainties but shouldn’t shift things much because it involves so many fewer people. But that’s not quite what I’m seeing. In some cases the parameters seem dominated by the CCES data. So I’m trying to figure out if my intuition is wrong or if that indicates a bug in the model code (included below) somehow.
The actual model uses a few more predictors but works as described above. It’s a bit complicated because of the 2 data-sets. And because it’s also modeling the total number of people voting out of all the people eligible to vote. But the main question remains.
Any thoughts or pointers for where to look for a deeper understanding would be much appreciated!
data {
int<lower = 1> N; // number of districts (election-result rows)
int<lower = 1> M; // number of cces rows (survey subgroup rows)
int<lower = 1> K; // number of predictors
matrix[N, K] Xe; // district-level predictors (election data, e.g. % college, % over 45)
matrix[M, K] Xc; // subgroup predictors (CCES rows; set to 0% or 100% per subgroup)
int<lower=-1, upper=1> IncE[N]; // incumbency indicator per district (-1/0/1)
int<lower=-1, upper=1> IncC[M]; // incumbency indicator per CCES row (-1/0/1)
int<lower = 0> VAPe[N]; // eligible (voting-age) population per district
int<lower = 0> VAPc[M]; // eligible respondents per CCES row -- presumably survey counts; confirm
int<lower = 0> TVotesE[N]; // total votes cast per district
int<lower = 0> DVotesE[N]; // votes for the focal candidate per district
int<lower = 0> TVotesC[M]; // CCES respondents in the row who voted
int<lower = 0> DVotesC[M]; // CCES respondents in the row voting for the focal candidate
}
transformed data {
  // Pool election-level (N) and CCES (M) rows into one dataset of G rows.
  int<lower=0> G = M + N;
  matrix[G, K] X = append_row(Xe, Xc);
  int<lower=-1, upper=1> Inc[G] = append_array(IncE, IncC);
  int<lower=0> VAP[G] = append_array(VAPe, VAPc);
  int<lower=0> TVotes[G] = append_array(TVotesE, TVotesC);
  int<lower=0> DVotes[G] = append_array(DVotesE, DVotesC);
  // All declarations up front: the original interleaved declarations with
  // statements (and fused two statements on one line), which older Stan
  // versions -- consistent with this file's pre-2.26 array syntax -- reject.
  vector<lower=0>[K] sigma;     // per-predictor scale, used in generated quantities
  matrix[G, K] X_centered;
  matrix[G, K] Q_ast;
  matrix[K, K] R_ast;
  matrix[K, K] R_ast_inverse;
  for (k in 1:K) {
    real col_mean = mean(X[, k]);
    X_centered[, k] = X[, k] - col_mean;
    // NOTE(review): sigma is the sd of the ELECTION rows only (Xe), while the
    // centering above uses the pooled X (election + CCES 0%/100% rows).
    // Confirm this mismatch is intentional -- the pooled columns have very
    // different spread once the 0/1 CCES rows are appended.
    sigma[k] = sd(Xe[, k]);
  }
  // Thin, scaled QR decomposition (Stan manual's QR reparameterization).
  // qr_thin_Q / qr_thin_R give the G x K / K x K factors directly, avoiding
  // the full G x G Q matrix that qr_Q(...)[, 1:K] would materialize.
  Q_ast = qr_thin_Q(X_centered) * sqrt(G - 1);
  R_ast = qr_thin_R(X_centered) / sqrt(G - 1);
  R_ast_inverse = inverse(R_ast);
}
parameters {
real alphaD; // intercept (logit scale) for candidate-preference probability
vector[K] thetaV; // turnout coefficients in the QR-rotated predictor space
real alphaV; // intercept (logit scale) for turnout probability
vector[K] thetaD; // preference coefficients in the QR-rotated predictor space
real incBetaD; // incumbency effect on preference (Inc takes values in {-1,0,1})
// Dispersions sampled on (0,1) and mapped to beta-binomial concentrations
// phi = disp / (1 - disp) in transformed parameters; the bounds keep phi
// strictly positive and finite.
real <lower=1e-5, upper=(1-1e-5)> dispD;
real <lower=1e-5, upper=(1-1e-5)> dispV;
}
transformed parameters {
  // Map the unit-interval dispersions onto the positive concentration scale
  // used by the beta-binomial: phi = disp / (1 - disp).
  real<lower=0> phiV = dispV / (1 - dispV);
  real<lower=0> phiD = dispD / (1 - dispD);
  // Row-level probabilities on the QR-rotated design: candidate preference
  // (with the incumbency term) and turnout.
  vector<lower=0, upper=1>[G] pDVoteP
      = inv_logit(alphaD + Q_ast * thetaD + to_vector(Inc) * incBetaD);
  vector<lower=0, upper=1>[G] pVotedP
      = inv_logit(alphaV + Q_ast * thetaV);
  // Rotate the QR-space coefficients back to the original predictor scale.
  vector[K] betaV = R_ast_inverse * thetaV;
  vector[K] betaD = R_ast_inverse * thetaD;
}
model {
  // Weakly-informative priors on the intercepts.
  alphaD ~ cauchy(0, 10);
  alphaV ~ cauchy(0, 10);
  // betaV / betaD are LINEAR transforms of the sampled thetaV / thetaD
  // (constant Jacobian), so priors on them need no Jacobian adjustment.
  betaV ~ cauchy(0, 2.5);
  betaD ~ cauchy(0, 2.5);
  incBetaD ~ cauchy(0, 2.5);
  // phiD / phiV are NONLINEAR transforms of the sampled dispD / dispV
  // (phi = disp / (1 - disp)), so placing priors on them requires a Jacobian
  // adjustment: log |d phi / d disp| = -2 * log(1 - disp). Without it the
  // implied prior on disp is not the one intended (Stan warns about this).
  phiD ~ cauchy(0, 2);
  phiV ~ cauchy(0, 2);
  target += -2 * log1m(dispD);
  target += -2 * log1m(dispV);
  // Turnout: of VAP eligible people per row, TVotes turned out.
  TVotes ~ beta_binomial(VAP, pVotedP * phiV, (1 - pVotedP) * phiV);
  // Preference: of TVotes cast per row, DVotes went to the focal candidate.
  // NOTE(review): each ROW is one beta-binomial observation with a shared
  // dispersion phiD. With overdispersion, a 300k-voter district row carries
  // far less than 3000x the information of a 100-person CCES row, so the M
  // CCES rows can legitimately dominate when M is large relative to N --
  // this bears directly on the influence question in the post above.
  DVotes ~ beta_binomial(TVotes, pDVoteP * phiD, (1 - pDVoteP) * phiD);
}
generated quantities {
  // All declarations first: the original interleaved declarations with
  // statements, which older Stan versions (consistent with this file's
  // pre-2.26 array syntax) reject.
  vector[G] log_lik;            // pointwise log-likelihood for loo/waic
  vector<lower = 0>[G] eTVotes; // expected turnout count per row
  vector<lower = 0>[G] eDVotes; // expected focal-candidate count per row
  real avgPVoted = inv_logit(alphaV); // turnout prob at centered predictors, no incumbency
  real avgPDVote = inv_logit(alphaD); // preference prob at centered predictors, no incumbency
  vector[K] deltaV; // turnout-prob shift for a 1-sd (sd of Xe) predictor move
  vector[K] deltaD; // preference-prob shift for a 1-sd predictor move
  real deltaIncD = inv_logit(alphaD + incBetaD) - avgPDVote; // incumbency shift
  // NOTE(review): log_lik contains only the DVotes (preference) term; the
  // TVotes (turnout) term from the model block is omitted, so loo evaluates
  // the preference component only -- confirm that is intentional.
  for (g in 1:G) {
    log_lik[g] = beta_binomial_lpmf(DVotes[g] | TVotes[g],
                                    pDVoteP[g] * phiD,
                                    (1 - pDVoteP[g]) * phiD);
    eTVotes[g] = pVotedP[g] * VAP[g];
    eDVotes[g] = pDVoteP[g] * TVotes[g];
  }
  for (k in 1:K) {
    deltaV[k] = inv_logit(alphaV + sigma[k] * betaV[k]) - avgPVoted;
    deltaD[k] = inv_logit(alphaD + sigma[k] * betaD[k]) - avgPDVote;
  }
}