Meta-analysis, different scales and loo

I’ve been working on a hierarchical model for election data which attempts to incorporate surveys (approx. 60,000 respondents in the entire set, O(1000) per context) and actual election results (approx. 150 million voters, O(3 million) per context). I am modeling both kinds of data as binomial with probabilities parameterized by demographic information.

In practice, I’m rescaling the election result data or else the Stan run takes too long on my laptop. I’m currently able to run in reasonable time after dividing all the election results (eligible voters, votes, votes for candidate) by 100, which still leaves more than an order of magnitude difference in scale between the surveys and the election results.

There have been various challenges! One that remains is trying to understand the validity/goodness of fit. I can do posterior-predictive checks on the various data-sets. I would also like to do some cross-validation, but (unsurprisingly, I guess?) loo reports bad/Inf Pareto-k values for all the election results, I think because they are far more influential on the fit than the survey data, given the relatively smaller variance of the binomial distribution with such large numbers of trials.

Is there some standard way to deal with this? Look at the loo for each data-set separately or something?
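
(By “separately” I mean something like keeping a distinct log_lik vector per data-set in generated quantities and then running loo on each one on its own. A rough sketch with placeholder names, not my actual model:)

generated quantities {
  // one log-lik vector per data-set, so each can be fed to loo separately
  vector[N_elex] log_lik_elex;
  vector[N_survey] log_lik_survey;
  for (n in 1:N_elex)
    log_lik_elex[n] = binomial_lpmf(y_elex[n] | trials_elex[n], p_elex[n]);
  for (n in 1:N_survey)
    log_lik_survey[n] = binomial_lpmf(y_survey[n] | trials_survey[n], p_survey[n]);
}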

Can you post your actual model, or a sketch of it? I wonder if what’s happening is that you have context-specific parameters but are trying to do loo while leaving out entire contexts?

Thanks @jsocolar !
Happy to post the model. I don’t think I’m leaving anything out, but I admit things are a bit…complicated. So I could be…

data {
  int<lower=1> J_CD;
  int<lower=1> N_Elections_House;
  array[N_Elections_House] int<lower=1> Elections_House_CD;
  int<lower=1> J_State;
  array[N_Elections_House] int<lower=1> Elections_House_State;
  int<lower=1> N_CCESP_House;
  array[N_CCESP_House] int<lower=1> CCESP_House_State;
  int<lower=1> N_CPS;
  array[N_CPS] int<lower=1> CPS_State;
  int<lower=1> N_CCESP_President;
  array[N_CCESP_President] int<lower=1> CCESP_President_State;
  int<lower=1> N_Elections_Senate;
  array[N_Elections_Senate] int<lower=1> Elections_Senate_State;
  int<lower=1> N_CCEST;
  array[N_CCEST] int<lower=1> CCEST_State;
  int<lower=1> N_ACS;
  array[N_ACS] int<lower=1> ACS_CD;
  array[N_ACS] int<lower=1> ACS_State;
  int<lower=1> N_Elections_President;
  array[N_Elections_President] int<lower=1> Elections_President_State;
  array[N_ACS] int<lower=0> ACS_CVAP_WGT;
  int K_DMTurnout;
  matrix[N_ACS,K_DMTurnout] DMTurnout_ACS;
  int K_DMPref;
  matrix[N_ACS,K_DMPref] DMPref_ACS;
  array[N_Elections_House] int<lower=0> CVAP_Elex_Ho;
  array[N_Elections_House] int<lower=0> Voted_ElexHo;
  array[N_Elections_House] int<lower=0> VotesInRace_Elex_Ho;
  array[N_Elections_House] int<lower=0> DVotesInRace_Elex_Ho;
  array[N_Elections_Senate] int<lower=0> CVAP_Elex_Se;
  array[N_Elections_Senate] int<lower=0> Voted_ElexSe;
  array[N_Elections_Senate] int<lower=0> VotesInRace_Elex_Se;
  array[N_Elections_Senate] int<lower=0> DVotesInRace_Elex_Se;
  array[N_Elections_President] int<lower=0> CVAP_Elex_Pr;
  array[N_Elections_President] int<lower=0> Voted_ElexPr;
  array[N_Elections_President] int<lower=0> VotesInRace_Elex_Pr;
  array[N_Elections_President] int<lower=0> DVotesInRace_Elex_Pr;
  matrix[N_CCEST,K_DMTurnout] DMTurnout_CCEST;
  array[N_CCEST] int<lower=0> CVAP_CCES;
  array[N_CCEST] int<lower=0> Voted_CCES;
  array[N_CPS] int<lower=0> CVAP_CPS;
  array[N_CPS] int<lower=0> Voted_CPS;
  matrix[N_CPS,K_DMTurnout] DMTurnout_CPS;
  matrix[N_CCESP_House,K_DMPref] DMPref_CCESP_House;
  array[N_CCESP_House] int<lower=0> VotesInRace_CCES_House;
  array[N_CCESP_House] int<lower=0> DVotesInRace_CCES_House;
  matrix[N_CCESP_President,K_DMPref] DMPref_CCESP_President;
  array[N_CCESP_President] int<lower=0> VotesInRace_CCES_President;
  array[N_CCESP_President] int<lower=0> DVotesInRace_CCES_President;
}
parameters {
  real muAlphaT;
  real<lower=0> sigmaAlphaT;
  vector[J_State] alphaT_raw;
  vector[K_DMTurnout] muThetaT;
  vector<lower=0>[K_DMTurnout] tauThetaT;
  cholesky_factor_corr[K_DMTurnout] corrT;
  matrix[K_DMTurnout,J_State] thetaT_raw;
  real muAlphaP;
  real<lower=0> sigmaAlphaP;
  vector[J_State] alphaP_raw;
  vector[K_DMPref] muThetaP;
  vector<lower=0>[K_DMPref] tauThetaP;
  cholesky_factor_corr[K_DMPref] corrP;
  matrix[K_DMPref,J_State] thetaP_raw;
  real ixS_Elex_Se;
  real ixS_Elex_Pr;
  real ix_CCEST;
  real ix_CPST;
  real ix_CCESPHouse;
  real ix_CCESPPresident;
}
transformed parameters {
  vector[J_State] alphaT = muAlphaT + (sigmaAlphaT * alphaT_raw);
  matrix[K_DMTurnout,J_State] thetaT = rep_matrix(muThetaT,J_State) + diag_pre_multiply(tauThetaT,corrT) * thetaT_raw;
  vector[J_State] alphaP = muAlphaP + (sigmaAlphaP * alphaP_raw);
  matrix[K_DMPref,J_State] thetaP = rep_matrix(muThetaP,J_State) + diag_pre_multiply(tauThetaP,corrP) * thetaP_raw;
  vector[N_Elections_House] ElexT_Ho_ps;
 {
    vector[J_CD] ACS_By_CD = rep_vector(0,J_CD);
    vector[J_CD] ACS_By_CD_wgts = rep_vector(0,J_CD);
    for (k in 1:N_ACS) {
      ACS_By_CD[ACS_CD[k]] += (ACS_CVAP_WGT[k]) * (inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]])));
      ACS_By_CD_wgts[ACS_CD[k]] += (ACS_CVAP_WGT[k]);
    }
    ACS_By_CD ./= ACS_By_CD_wgts;
    for (k in 1:N_Elections_House) {
      ElexT_Ho_ps[k] = ACS_By_CD[Elections_House_CD[k]];
    }
  }
  vector[N_Elections_House] ElexS_Ho_ps;
 {
    vector[J_CD] ACS_By_CD = rep_vector(0,J_CD);
    vector[J_CD] ACS_By_CD_wgts = rep_vector(0,J_CD);
    for (k in 1:N_ACS) {
      ACS_By_CD[ACS_CD[k]] += (ACS_CVAP_WGT[k] * inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]]))) * (inv_logit(alphaP[ACS_State[k]] + dot_product(DMPref_ACS[k],thetaP[,ACS_State[k]])));
      ACS_By_CD_wgts[ACS_CD[k]] += (ACS_CVAP_WGT[k] * inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]])));
    }
    ACS_By_CD ./= ACS_By_CD_wgts;
    for (k in 1:N_Elections_House) {
      ElexS_Ho_ps[k] = ACS_By_CD[Elections_House_CD[k]];
    }
  }
  vector[N_Elections_Senate] ElexT_Se_ps;
 {
    vector[J_State] ACS_By_State = rep_vector(0,J_State);
    vector[J_State] ACS_By_State_wgts = rep_vector(0,J_State);
    for (k in 1:N_ACS) {
      ACS_By_State[ACS_State[k]] += (ACS_CVAP_WGT[k]) * (inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]])));
      ACS_By_State_wgts[ACS_State[k]] += (ACS_CVAP_WGT[k]);
    }
    ACS_By_State ./= ACS_By_State_wgts;
    for (k in 1:N_Elections_Senate) {
      ElexT_Se_ps[k] = ACS_By_State[Elections_Senate_State[k]];
    }
  }
  vector[N_Elections_Senate] ElexS_Se_ps;
 {
    vector[J_State] ACS_By_State = rep_vector(0,J_State);
    vector[J_State] ACS_By_State_wgts = rep_vector(0,J_State);
    for (k in 1:N_ACS) {
      ACS_By_State[ACS_State[k]] += (ACS_CVAP_WGT[k] * inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]]))) * (inv_logit(ixS_Elex_Se + alphaP[ACS_State[k]] + dot_product(DMPref_ACS[k],thetaP[,ACS_State[k]])));
      ACS_By_State_wgts[ACS_State[k]] += (ACS_CVAP_WGT[k] * inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]])));
    }
    ACS_By_State ./= ACS_By_State_wgts;
    for (k in 1:N_Elections_Senate) {
      ElexS_Se_ps[k] = ACS_By_State[Elections_Senate_State[k]];
    }
  }
  vector[N_Elections_President] ElexT_Pr_ps;
 {
    vector[J_State] ACS_By_State = rep_vector(0,J_State);
    vector[J_State] ACS_By_State_wgts = rep_vector(0,J_State);
    for (k in 1:N_ACS) {
      ACS_By_State[ACS_State[k]] += (ACS_CVAP_WGT[k]) * (inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]])));
      ACS_By_State_wgts[ACS_State[k]] += (ACS_CVAP_WGT[k]);
    }
    ACS_By_State ./= ACS_By_State_wgts;
    for (k in 1:N_Elections_President) {
      ElexT_Pr_ps[k] = ACS_By_State[Elections_President_State[k]];
    }
  }
  vector[N_Elections_President] ElexS_Pr_ps;
 {
    vector[J_State] ACS_By_State = rep_vector(0,J_State);
    vector[J_State] ACS_By_State_wgts = rep_vector(0,J_State);
    for (k in 1:N_ACS) {
      ACS_By_State[ACS_State[k]] += (ACS_CVAP_WGT[k] * inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]]))) * (inv_logit(ixS_Elex_Pr + alphaP[ACS_State[k]] + dot_product(DMPref_ACS[k],thetaP[,ACS_State[k]])));
      ACS_By_State_wgts[ACS_State[k]] += (ACS_CVAP_WGT[k] * inv_logit(alphaT[ACS_State[k]] + dot_product(DMTurnout_ACS[k],thetaT[,ACS_State[k]])));
    }
    ACS_By_State ./= ACS_By_State_wgts;
    for (k in 1:N_Elections_President) {
      ElexS_Pr_ps[k] = ACS_By_State[Elections_President_State[k]];
    }
  }
}
model {
  muAlphaT ~ normal(0.4054651081081642,1);
  sigmaAlphaT ~ normal(0,0.4);
  alphaT_raw ~ normal(0,1);
  muThetaT ~ normal(0,1);
  tauThetaT ~ normal(0,0.4);
  corrT ~ lkj_corr_cholesky(1.0);
  to_vector(thetaT_raw) ~ normal(0,0.4);
  muAlphaP ~ normal(0,1);
  sigmaAlphaP ~ normal(0,0.4);
  alphaP_raw ~ normal(0,1);
  muThetaP ~ normal(0,1);
  tauThetaP ~ normal(0,0.4);
  corrP ~ lkj_corr_cholesky(1.0);
  to_vector(thetaP_raw) ~ normal(0,0.4);
  target += binomial_lpmf(Voted_ElexHo | CVAP_Elex_Ho,ElexT_Ho_ps);
  target += binomial_lpmf(DVotesInRace_Elex_Ho | VotesInRace_Elex_Ho,ElexS_Ho_ps);
  ixS_Elex_Se ~ normal(0,0.5);
  target += binomial_lpmf(Voted_ElexSe | CVAP_Elex_Se,ElexT_Se_ps);
  target += binomial_lpmf(DVotesInRace_Elex_Se | VotesInRace_Elex_Se,ElexS_Se_ps);
  ixS_Elex_Pr ~ normal(0,0.5);
  target += binomial_lpmf(Voted_ElexPr | CVAP_Elex_Pr,ElexT_Pr_ps);
  target += binomial_lpmf(DVotesInRace_Elex_Pr | VotesInRace_Elex_Pr,ElexS_Pr_ps);
  ix_CCEST ~ normal(0,0.5);
  vector[N_CCEST] mu_CCEST_v;
  for (n in 1:N_CCEST) {
    mu_CCEST_v[n] = ix_CCEST + alphaT[CCEST_State[n]] + dot_product(DMTurnout_CCEST[n],thetaT[,CCEST_State[n]]);
  }
  Voted_CCES ~ binomial_logit(CVAP_CCES,mu_CCEST_v);
  ix_CPST ~ normal(0,0.5);
  vector[N_CPS] mu_CPST_v;
  for (n in 1:N_CPS) {
    mu_CPST_v[n] = ix_CPST + alphaT[CPS_State[n]] + dot_product(DMTurnout_CPS[n],thetaT[,CPS_State[n]]);
  }
  Voted_CPS ~ binomial_logit(CVAP_CPS,mu_CPST_v);
  ix_CCESPHouse ~ normal(0,0.5);
  vector[N_CCESP_House] mu_CCESPHouse_v;
  for (n in 1:N_CCESP_House) {
    mu_CCESPHouse_v[n] = ix_CCESPHouse + alphaP[CCESP_House_State[n]] + dot_product(DMPref_CCESP_House[n],thetaP[,CCESP_House_State[n]]);
  }
  DVotesInRace_CCES_House ~ binomial_logit(VotesInRace_CCES_House,mu_CCESPHouse_v);
  ix_CCESPPresident ~ normal(0,0.5);
  vector[N_CCESP_President] mu_CCESPPresident_v;
  for (n in 1:N_CCESP_President) {
    mu_CCESPPresident_v[n] = ix_CCESPPresident + alphaP[CCESP_President_State[n]] + dot_product(DMPref_CCESP_President[n],thetaP[,CCESP_President_State[n]]);
  }
  DVotesInRace_CCES_President ~ binomial_logit(VotesInRace_CCES_President,mu_CCESPPresident_v);
}
generated quantities {
  vector[N_Elections_House + N_Elections_House + N_CCESP_House + N_CPS + N_CCESP_President + N_Elections_Senate + N_Elections_Senate + N_CCEST + N_Elections_President + N_Elections_President] log_lik;
 {
    for (n in 1:N_Elections_House) {
      log_lik[n] = binomial_lpmf(Voted_ElexHo[n] | CVAP_Elex_Ho[n],ElexT_Ho_ps[n]);
    }
  }
 {
    for (n in 1:N_Elections_House) {
      log_lik[n + N_Elections_House] = binomial_lpmf(DVotesInRace_Elex_Ho[n] | VotesInRace_Elex_Ho[n],ElexS_Ho_ps[n]);
    }
  }
 {
    for (n in 1:N_CCESP_House) {
      log_lik[n + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(DVotesInRace_CCES_House[n] | VotesInRace_CCES_House[n],ix_CCESPHouse + alphaP[CCESP_House_State[n]] + dot_product(DMPref_CCESP_House[n],thetaP[,CCESP_House_State[n]]));
    }
  }
 {
    for (n in 1:N_CPS) {
      log_lik[n + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(Voted_CPS[n] | CVAP_CPS[n],ix_CPST + alphaT[CPS_State[n]] + dot_product(DMTurnout_CPS[n],thetaT[,CPS_State[n]]));
    }
  }
 {
    for (n in 1:N_CCESP_President) {
      log_lik[n + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(DVotesInRace_CCES_President[n] | VotesInRace_CCES_President[n],ix_CCESPPresident + alphaP[CCESP_President_State[n]] + dot_product(DMPref_CCESP_President[n],thetaP[,CCESP_President_State[n]]));
    }
  }
 {
    for (n in 1:N_Elections_Senate) {
      log_lik[n + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_lpmf(Voted_ElexSe[n] | CVAP_Elex_Se[n],ElexT_Se_ps[n]);
    }
  }
 {
    for (n in 1:N_Elections_Senate) {
      log_lik[n + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_lpmf(DVotesInRace_Elex_Se[n] | VotesInRace_Elex_Se[n],ElexS_Se_ps[n]);
    }
  }
 {
    for (n in 1:N_CCEST) {
      log_lik[n + N_Elections_Senate + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(Voted_CCES[n] | CVAP_CCES[n],ix_CCEST + alphaT[CCEST_State[n]] + dot_product(DMTurnout_CCEST[n],thetaT[,CCEST_State[n]]));
    }
  }
 {
    for (n in 1:N_Elections_President) {
      log_lik[n + N_CCEST + N_Elections_Senate + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_lpmf(Voted_ElexPr[n] | CVAP_Elex_Pr[n],ElexT_Pr_ps[n]);
    }
  }
 {
    for (n in 1:N_Elections_President) {
      log_lik[n + N_Elections_President + N_CCEST + N_Elections_Senate + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_lpmf(DVotesInRace_Elex_Pr[n] | VotesInRace_Elex_Pr[n],ElexS_Pr_ps[n]);
    }
  }
}

Yeah, complicated indeed! I don’t have time to wade in, but my guess is that the election results are highly informative for context-specific parameters, such that each parameter is getting informed by just one or a handful of data points. Is this consistent with how this model works?

I don’t think it is important per se that there’s an imbalance between the informativeness of election data versus survey data, except inasmuch as you can’t count on the survey data to be the extra datapoints that add more information to each parameter. If each parameter were informed by numerous election-based datapoints, then the problem would likely disappear.

That’s a good thing to check. I’ll do that soon and reply. But I’m not convinced, particularly because the number of problematic Pareto-k’s goes down as I shrink the election results. When I shrink them to an order of magnitude larger than the surveys, I get problematic Pareto-k’s for all of them, but when I shrink them by another order of magnitude, only about 5% of them are problematic. So there is definitely something going on with the spread in scale between the surveys and the election results.

But, for sure, I should demonstrate that more convincingly by modeling the election results without the surveys and making sure that the Pareto-k’s are all fine and that it’s not just an issue with the election results themselves, as you suggest.

Do you mean that you’re subsampling it? I don’t understand why just dividing numbers would change efficiency. If you can subsample, then your results should be valid (assuming the model can be subsampled).

One thing you can do is replace binomial_lpmf with binomial_lupmf to remove the normalizing constants, or switch to sampling notation, which does that automatically. Otherwise, you’re computing a ton of gamma functions here, which will dominate the computation. Any loops that can be replaced with matrix operations are also a win, but I can’t quite see how to do that with the dot products the way you’ve defined them (I’m not sure it’s possible).
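
For example, for your House turnout term either of these drops the constants (names taken from the model you posted):

target += binomial_lupmf(Voted_ElexHo | CVAP_Elex_Ho, ElexT_Ho_ps);  // unnormalized lpmf
// or, equivalently, the sampling statement, which drops constants automatically:
Voted_ElexHo ~ binomial(CVAP_Elex_Ho, ElexT_Ho_ps);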

Lines like these

for (k in 1:N_Elections_House) {
      ElexT_Ho_ps[k] = ACS_By_CD[Elections_House_CD[k]];
    }

can be simplified to one-liners like:

ElexT_Ho_ps = ACS_By_CD[Elections_House_CD];

assuming that Elections_House_CD is N_Elections_House elements long. Overall, I’d suggest pulling all the common computations into functions and dramatically shortening your variable names so the structure of the code is clearer. It’ll also help make sure that your code is doing the right thing, which is hard to verify with this much cut-and-paste.

Vectorizing the log likelihoods would be a huge win for efficiency if you don’t need them for x-validation.

@Bob_Carpenter Thanks for all the thoughtful ideas!

I’m only dividing the election results by something. The surveys stay as they are. So what’s changing is the difference in scale between the election results and the CCES and CPS data.

Yep. I was doing that before, but the elections are large, so there were numerical issues (underflow?) when I used binomial_lupmf. But now that I’ve rescaled them, maybe I can switch back to lupmf.

Yeah. The loops in “transformed parameters” don’t lend themselves to a simple rewrite as matrix ops because of the inv_logit functions.

Lines like these

That’s a good point. Thanks!

Yeah. That’s tricky. This is being generated by Haskell code (mine), so I can fix some things by improving the code generator, but others are harder than they might seem. It’s not cut-and-paste right now so much as the code generator doing the same thing for each data-set. But I can think about how to have it generate functions and then re-use them.

I’m not sure what you mean here. Do you mean the stuff in the “generated quantities” block? That’s not my efficiency issue: when I run the model itself, that block isn’t even present. I compute those later via CmdStan in its “generate quantities” mode, and that doesn’t take very long. Or do you mean the sampling statements?

The code is using a non-centered parameterization, which can be inefficient when the likelihood is informative; that would explain the improved efficiency when you make the likelihood less informative.
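
For the state intercepts, for example, the centered version would look roughly like this (a sketch showing only the alphaT piece; the thetaT block would change analogously):

parameters {
  vector[J_State] alphaT;  // declared directly; alphaT_raw goes away
}
model {
  // replaces alphaT = muAlphaT + sigmaAlphaT * alphaT_raw together with alphaT_raw ~ normal(0, 1)
  alphaT ~ normal(muAlphaT, sigmaAlphaT);
}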

Sorry for the long delay in answering. I’ve been experimenting…

That’s an interesting point! I find centering/non-centering very confusing. In this model, how informative each context (state) is (which I understand to mean how much the observations in that context depend on being in that context rather than on other parameters) varies a lot depending on how many predictors I include. So if I fit just the alphas, using no predictors, the centered parameterization works best, and then it starts to struggle as I add predictors. I think.

But I hadn’t really thought about it in the context of rescaling the election size.

There are also the contexts of each different data-set, the usual sense of context in a meta-analysis. But I am not treating those hierarchically, at least for now. But maybe I should?

I’ve been thinking more about this and I realize it’s a general point of difficulty. Is there some way in Stan to build a vector which acts as a “view” into another vector? Or some way of managing these sorts of loops where something is being re-indexed, which somehow stops the whole thing from being vectorized?

So I’ve done some of this, but it’s hard to go as far as might be best for straightforward code because Stan seems unhappy with an expression that evaluates to a vector being given as an argument to a function that expects a vector. Which makes sense, since it’s C++ underneath. I can go further in the direction of functionalization by writing model-specific functions, which is a little painful from the code-generation standpoint but might help clarify the code. I’m still not sure I can remove any loops, though I might be able to combine a few.

Which brings me to another question: is it worth outsized work to combine loops? What is the relative cost of

vector[N] v;
vector[N] w;
for (k in 1:N) {
  v[k] = f(...);
}
for (k in 1:N) {
  w[k] = g(...);
}

vs

vector[N] v;
vector[N] w;
for (k in 1:N) {
  v[k] = f(...);
  w[k] = g(...);
}

?

Combining loops is hard for me from the code-generation side, because I don’t usually have more than one line or loop in hand before I write out the relevant Stan code. But in the long term I plan to have the code generator work a bit more globally, and this would be one good reason. Though to do it optimally, I’d need to keep track of expression dependencies so as to emit whatever loops I need before I use things computed in them…

If a is a vector then a[L:U] is the slice of a from L to U. For now, these are being allocated and copied, but that’s not usually the bottleneck because there’s no autodiff overhead there.

If you have a function declared as int foo(vector y), then you should be able to call foo(e) with any expression e that denotes a vector. It’s hard to say more without seeing the error message you’re getting. If e is a vector and foo(e) isn’t accepted, that’s a bug that we would be very grateful if you could report.
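
For example, given a vector v, a function declared as int foo(vector y), and an index array declared as array[] int idx, all of these should be accepted, since each argument is an expression that evaluates to a vector:

int a = foo(v[3:7]);      // contiguous slice
int b = foo(v[idx]);      // multiple indexing with an array of ints (the “view” you mentioned)
int c = foo(exp(v) + 1);  // any other vector-valued expression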

There’s almost no difference in speed between

for (j in 1:J) foo[j] = ...;
for (j in 1:J) bar[j] = ...;

and

for (j in 1:J) {
  foo[j] = ...;
  bar[j] = ...;
}

In fact, the first one can be faster because it often has better memory-locality properties and loops are cheap. It’s even better to not have loops at all, because evaluating all the foo[j] and bar[j] independently (instead of in a shared expression that can share computation and expression graphs in the autodiff library) is suboptimal.
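
For instance, a loop like the one building mu_CCEST_v might collapse to a single expression using rows_dot_product and multi-indexing (untested sketch, using the variable names from your first model):

vector[N_CCEST] mu_CCEST_v = ix_CCEST
                             + alphaT[CCEST_State]
                             + rows_dot_product(DMTurnout_CCEST, (thetaT[, CCEST_State])');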

Ah. I think this was my misunderstanding, coming from the code generation and from whatever makes those dot products hard to vectorize. I will try to make a more coherent statement of what doesn’t work the way I might hope. It’s also connected to what I meant by a “view” into a vector: not a slice, but a reindexing via an array[] int. Now that I’ve rewritten the parameter transformations as a function, it’s clear that I can pass any vector expression where a vector is expected, as you indicate.

I’ve included the code again, now with the transformation done mostly as a function (there’s no way to return two things from a function, right? Are there tuples of vectors in Stan?), and you’re right: it’s clearer and, I think because of some manual common-subexpression elimination, quite a bit faster. I’m including it here in case there are more easy opportunities for optimization that I have missed!

I still have my two original questions:

  • These fits have high (“bad” or worse) Pareto-k for the election data as compared to the survey data. I don’t really understand what that means, but my rough understanding is that it indicates those points are in some sense disproportionately influential. That might make sense here, since the elections have many more “trials” (in the binomial sense) than the surveys. So should I just expect the high Pareto-k? Or am I misunderstanding?
  • What does it mean if I have low N_eff, but only in “latent” variables? That is, every variable I use for post-stratification and analysis has reasonable N_eff, but some of the variables used only in the fit do not, e.g., some of alphaT_raw but not alphaT. I am curious about two things: can I rely on the values of alphaT even though alphaT_raw has low N_eff, and, less coherently, might that information point to specific places to look for a useful re-parameterization?

Anyway, here’s the re-factored code. Apologies for not shortening variable names! For me, that makes it harder to follow since there are various data sources with similar data.

Thanks for all the great suggestions and thoughts so far!

functions {
  matrix elexPSFunction(array[] int gIdx, vector psWgts, vector aT, matrix bT, matrix dT, vector aP, matrix bP, matrix dP, int N_elex, array[] int elexIdx) {
    int N_ps = size(gIdx);
    int N_grp = size(aT);
    matrix[N_grp, 2] p = rep_matrix(0, N_grp, 2);
    matrix[N_grp, 2] wgts = rep_matrix(0, N_grp, 2);
    for (k in 1:N_ps) {
      real pT = inv_logit(aT[gIdx[k]] + dot_product(dT[k], bT[,gIdx[k]]));
      real pP = inv_logit(aP[gIdx[k]] + dot_product(dP[k], bP[,gIdx[k]]));
      real wPT = psWgts[k] * pT;
      p[gIdx[k], 1] += wPT;
      p[gIdx[k], 2] += wPT * pP;
      wgts[gIdx[k], 1] += psWgts[k];
      wgts[gIdx[k], 2] += wPT;
    }
    p ./= wgts;
    matrix[N_elex, 2] q;
    q[,1] = p[elexIdx,1];
    q[,2] = p[elexIdx,2];
    return q;
  }
}
data {
  int<lower=1> J_CD;
  int<lower=1> N_Elections_House;
  array[N_Elections_House] int<lower=1> Elections_House_CD;
  int<lower=1> J_State;
  array[N_Elections_House] int<lower=1> Elections_House_State;
  int<lower=1> N_CCESP_House;
  array[N_CCESP_House] int<lower=1> CCESP_House_State;
  int<lower=1> N_CPS;
  array[N_CPS] int<lower=1> CPS_State;
  int<lower=1> N_CCESP_President;
  array[N_CCESP_President] int<lower=1> CCESP_President_State;
  int<lower=1> N_Elections_Senate;
  array[N_Elections_Senate] int<lower=1> Elections_Senate_State;
  int<lower=1> N_CCEST;
  array[N_CCEST] int<lower=1> CCEST_State;
  int<lower=1> N_ACS;
  array[N_ACS] int<lower=1> ACS_CD;
  array[N_ACS] int<lower=1> ACS_State;
  int<lower=1> N_Elections_President;
  array[N_Elections_President] int<lower=1> Elections_President_State;
  array[N_ACS] int<lower=0> ACS_CVAP_WGT;
  int K_DMTurnout;
  matrix[N_ACS,K_DMTurnout] DMTurnout_ACS;
  int K_DMPref;
  matrix[N_ACS,K_DMPref] DMPref_ACS;
  array[N_Elections_House] int<lower=0> CVAP_Elex_Ho;
  array[N_Elections_House] int<lower=0> Voted_ElexHo;
  array[N_Elections_House] int<lower=0> VotesInRace_Elex_Ho;
  array[N_Elections_House] int<lower=0> DVotesInRace_Elex_Ho;
  array[N_Elections_Senate] int<lower=0> CVAP_Elex_Se;
  array[N_Elections_Senate] int<lower=0> Voted_ElexSe;
  array[N_Elections_Senate] int<lower=0> VotesInRace_Elex_Se;
  array[N_Elections_Senate] int<lower=0> DVotesInRace_Elex_Se;
  array[N_Elections_President] int<lower=0> CVAP_Elex_Pr;
  array[N_Elections_President] int<lower=0> Voted_ElexPr;
  array[N_Elections_President] int<lower=0> VotesInRace_Elex_Pr;
  array[N_Elections_President] int<lower=0> DVotesInRace_Elex_Pr;
  matrix[N_CCEST,K_DMTurnout] DMTurnout_CCEST;
  array[N_CCEST] int<lower=0> CVAP_CCES;
  array[N_CCEST] int<lower=0> Voted_CCES;
  array[N_CPS] int<lower=0> CVAP_CPS;
  array[N_CPS] int<lower=0> Voted_CPS;
  matrix[N_CPS,K_DMTurnout] DMTurnout_CPS;
  matrix[N_CCESP_House,K_DMPref] DMPref_CCESP_House;
  array[N_CCESP_House] int<lower=0> VotesInRace_CCES_House;
  array[N_CCESP_House] int<lower=0> DVotesInRace_CCES_House;
  matrix[N_CCESP_President,K_DMPref] DMPref_CCESP_President;
  array[N_CCESP_President] int<lower=0> VotesInRace_CCES_President;
  array[N_CCESP_President] int<lower=0> DVotesInRace_CCES_President;
}
parameters {
  real muAlphaT;
  real<lower=0> sigmaAlphaT;
  vector[J_State] alphaT_raw;
  vector[K_DMTurnout] muThetaT;
  vector<lower=0>[K_DMTurnout] tauThetaT;
  cholesky_factor_corr[K_DMTurnout] corrT;
  matrix[K_DMTurnout,J_State] thetaT_raw;
  real muAlphaP;
  real<lower=0> sigmaAlphaP;
  vector[J_State] alphaP_raw;
  vector[K_DMPref] muThetaP;
  vector<lower=0>[K_DMPref] tauThetaP;
  cholesky_factor_corr[K_DMPref] corrP;
  matrix[K_DMPref,J_State] thetaP_raw;
  real alphaT_Elex_Se;
  real alphaP_Elex_Se;
  real alphaT_Elex_Pr;
  real alphaP_Elex_Pr;
  real alpha_CCEST;
  real alpha_CPST;
  real alpha_CCESPHouse;
  real alpha_CCESPPresident;
}
transformed parameters {
  vector[J_State] alphaT = muAlphaT + (sigmaAlphaT * alphaT_raw);
  matrix[K_DMTurnout,J_State] thetaT = rep_matrix(muThetaT,J_State) + diag_pre_multiply(tauThetaT,corrT) * thetaT_raw;
  vector[J_State] alphaP = muAlphaP + (sigmaAlphaP * alphaP_raw);
  matrix[K_DMPref,J_State] thetaP = rep_matrix(muThetaP,J_State) + diag_pre_multiply(tauThetaP,corrP) * thetaP_raw;
  matrix[N_Elections_House,2] elexProbs_Ho = elexPSFunction(ACS_State,to_vector(ACS_CVAP_WGT),alphaT,thetaT,DMTurnout_ACS,alphaP,thetaP,DMPref_ACS,N_Elections_House,Elections_House_State);
  vector[N_Elections_House] elexPT_Ho = col(elexProbs_Ho,1);
  vector[N_Elections_House] elexPP_Ho = col(elexProbs_Ho,2);
  matrix[N_Elections_Senate,2] elexProbs_Se = elexPSFunction(ACS_State,to_vector(ACS_CVAP_WGT),alphaT,thetaT,DMTurnout_ACS,alphaP,thetaP,DMPref_ACS,N_Elections_Senate,Elections_Senate_State);
  vector[N_Elections_Senate] elexPT_Se = col(elexProbs_Se,1);
  vector[N_Elections_Senate] elexPP_Se = col(elexProbs_Se,2);
  matrix[N_Elections_President,2] elexProbs_Pr = elexPSFunction(ACS_State,to_vector(ACS_CVAP_WGT),alphaT,thetaT,DMTurnout_ACS,alphaP,thetaP,DMPref_ACS,N_Elections_President,Elections_President_State);
  vector[N_Elections_President] elexPT_Pr = col(elexProbs_Pr,1);
  vector[N_Elections_President] elexPP_Pr = col(elexProbs_Pr,2);
}
model {
  muAlphaT ~ normal(0.4054651081081642,1);
  sigmaAlphaT ~ normal(0,0.4);
  alphaT_raw ~ normal(0,1);
  muThetaT ~ normal(0,1);
  tauThetaT ~ normal(0,0.4);
  corrT ~ lkj_corr_cholesky(1.0);
  to_vector(thetaT_raw) ~ normal(0,0.4);
  muAlphaP ~ normal(0,1);
  sigmaAlphaP ~ normal(0,0.4);
  alphaP_raw ~ normal(0,1);
  muThetaP ~ normal(0,1);
  tauThetaP ~ normal(0,0.4);
  corrP ~ lkj_corr_cholesky(1.0);
  to_vector(thetaP_raw) ~ normal(0,0.4);
  to_vector(Voted_ElexHo) ~ normal(elexPT_Ho .* to_vector(CVAP_Elex_Ho),sqrt(elexPT_Ho .* (1 - elexPT_Ho) .* to_vector(CVAP_Elex_Ho)));
  to_vector(DVotesInRace_Elex_Ho) ~ normal(elexPP_Ho .* to_vector(VotesInRace_Elex_Ho),sqrt(elexPP_Ho .* (1 - elexPP_Ho) .* to_vector(VotesInRace_Elex_Ho)));
  alphaT_Elex_Se ~ normal(0,0.5);
  alphaP_Elex_Se ~ normal(0,0.5);
  to_vector(Voted_ElexSe) ~ normal(elexPT_Se .* to_vector(CVAP_Elex_Se),sqrt(elexPT_Se .* (1 - elexPT_Se) .* to_vector(CVAP_Elex_Se)));
  to_vector(DVotesInRace_Elex_Se) ~ normal(elexPP_Se .* to_vector(VotesInRace_Elex_Se),sqrt(elexPP_Se .* (1 - elexPP_Se) .* to_vector(VotesInRace_Elex_Se)));
  alphaT_Elex_Pr ~ normal(0,0.5);
  alphaP_Elex_Pr ~ normal(0,0.5);
  to_vector(Voted_ElexPr) ~ normal(elexPT_Pr .* to_vector(CVAP_Elex_Pr),sqrt(elexPT_Pr .* (1 - elexPT_Pr) .* to_vector(CVAP_Elex_Pr)));
  to_vector(DVotesInRace_Elex_Pr) ~ normal(elexPP_Pr .* to_vector(VotesInRace_Elex_Pr),sqrt(elexPP_Pr .* (1 - elexPP_Pr) .* to_vector(VotesInRace_Elex_Pr)));
  alpha_CCEST ~ normal(0,0.5);
  vector[N_CCEST] mu_CCEST_v;
  for (n in 1:N_CCEST) {
    mu_CCEST_v[n] = alpha_CCEST + alphaT[CCEST_State[n]] + dot_product(DMTurnout_CCEST[n],thetaT[,CCEST_State[n]]);
  }
  Voted_CCES ~ binomial_logit(CVAP_CCES,mu_CCEST_v);
  alpha_CPST ~ normal(0,0.5);
  vector[N_CPS] mu_CPST_v;
  for (n in 1:N_CPS) {
    mu_CPST_v[n] = alpha_CPST + alphaT[CPS_State[n]] + dot_product(DMTurnout_CPS[n],thetaT[,CPS_State[n]]);
  }
  Voted_CPS ~ binomial_logit(CVAP_CPS,mu_CPST_v);
  alpha_CCESPHouse ~ normal(0,0.5);
  vector[N_CCESP_House] mu_CCESPHouse_v;
  for (n in 1:N_CCESP_House) {
    mu_CCESPHouse_v[n] = alpha_CCESPHouse + alphaP[CCESP_House_State[n]] + dot_product(DMPref_CCESP_House[n],thetaP[,CCESP_House_State[n]]);
  }
  DVotesInRace_CCES_House ~ binomial_logit(VotesInRace_CCES_House,mu_CCESPHouse_v);
  alpha_CCESPPresident ~ normal(0,0.5);
  vector[N_CCESP_President] mu_CCESPPresident_v;
  for (n in 1:N_CCESP_President) {
    mu_CCESPPresident_v[n] = alpha_CCESPPresident + alphaP[CCESP_President_State[n]] + dot_product(DMPref_CCESP_President[n],thetaP[,CCESP_President_State[n]]);
  }
  DVotesInRace_CCES_President ~ binomial_logit(VotesInRace_CCES_President,mu_CCESPPresident_v);
}
generated quantities {
  vector[N_Elections_House + N_Elections_House + N_CCESP_House + N_CPS + N_CCESP_President + N_Elections_Senate + N_Elections_Senate + N_CCEST + N_Elections_President + N_Elections_President] log_lik;
 {
    for (n in 1:N_Elections_House) {
      log_lik[n] = normal_lpdf(Voted_ElexHo[n] | elexPT_Ho[n] .* CVAP_Elex_Ho[n],sqrt(elexPT_Ho[n] .* (1 - elexPT_Ho[n]) .* CVAP_Elex_Ho[n]));
    }
  }
 {
    for (n in 1:N_Elections_House) {
      log_lik[n + N_Elections_House] = normal_lpdf(DVotesInRace_Elex_Ho[n] | elexPP_Ho[n] .* VotesInRace_Elex_Ho[n],sqrt(elexPP_Ho[n] .* (1 - elexPP_Ho[n]) .* VotesInRace_Elex_Ho[n]));
    }
  }
 {
    for (n in 1:N_CCESP_House) {
      log_lik[n + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(DVotesInRace_CCES_House[n] | VotesInRace_CCES_House[n],alpha_CCESPHouse + alphaP[CCESP_House_State[n]] + dot_product(DMPref_CCESP_House[n],thetaP[,CCESP_House_State[n]]));
    }
  }
 {
    for (n in 1:N_CPS) {
      log_lik[n + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(Voted_CPS[n] | CVAP_CPS[n],alpha_CPST + alphaT[CPS_State[n]] + dot_product(DMTurnout_CPS[n],thetaT[,CPS_State[n]]));
    }
  }
 {
    for (n in 1:N_CCESP_President) {
      log_lik[n + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(DVotesInRace_CCES_President[n] | VotesInRace_CCES_President[n],alpha_CCESPPresident + alphaP[CCESP_President_State[n]] + dot_product(DMPref_CCESP_President[n],thetaP[,CCESP_President_State[n]]));
    }
  }
 {
    for (n in 1:N_Elections_Senate) {
      log_lik[n + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = normal_lpdf(Voted_ElexSe[n] | elexPT_Se[n] .* CVAP_Elex_Se[n],sqrt(elexPT_Se[n] .* (1 - elexPT_Se[n]) .* CVAP_Elex_Se[n]));
    }
  }
 {
    for (n in 1:N_Elections_Senate) {
      log_lik[n + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = normal_lpdf(DVotesInRace_Elex_Se[n] | elexPP_Se[n] .* VotesInRace_Elex_Se[n],sqrt(elexPP_Se[n] .* (1 - elexPP_Se[n]) .* VotesInRace_Elex_Se[n]));
    }
  }
 {
    for (n in 1:N_CCEST) {
      log_lik[n + N_Elections_Senate + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = binomial_logit_lpmf(Voted_CCES[n] | CVAP_CCES[n],alpha_CCEST + alphaT[CCEST_State[n]] + dot_product(DMTurnout_CCEST[n],thetaT[,CCEST_State[n]]));
    }
  }
 {
    for (n in 1:N_Elections_President) {
      log_lik[n + N_CCEST + N_Elections_Senate + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = normal_lpdf(Voted_ElexPr[n] | elexPT_Pr[n] .* CVAP_Elex_Pr[n],sqrt(elexPT_Pr[n] .* (1 - elexPT_Pr[n]) .* CVAP_Elex_Pr[n]));
    }
  }
 {
    for (n in 1:N_Elections_President) {
      log_lik[n + N_Elections_President + N_CCEST + N_Elections_Senate + N_Elections_Senate + N_CCESP_President + N_CPS + N_CCESP_House + N_Elections_House + N_Elections_House] = normal_lpdf(DVotesInRace_Elex_Pr[n] | elexPP_Pr[n] .* VotesInRace_Elex_Pr[n],sqrt(elexPP_Pr[n] .* (1 - elexPP_Pr[n]) .* VotesInRace_Elex_Pr[n]));
    }
  }
}