I am trying to estimate the posterior distribution of the total success count of a hypergeometric distribution.
I have 39 farms (population) from which I am getting a random sample without replacement of 30 farms. I observe 15 successes and what to know my uncertainty around the number of successes in the population (a).
Since Stan doesn’t support discrete parameters, I left the parameters block empty and did the log density manipulation in the transformed parameters block. Does this approach sounds good to you? I am new to Stan and marginalization, so not sure if I am doing this correctly. Or do I need a continuous parameter anyway?
I am using a uniform prior, are there any other prior that are recommended for the hypergeometric distribution?
data {
int<lower=1> pop; // Population
int<lower=1, upper=pop> ss; // Sample size
int<lower=0, upper=ss> k1; // Number of successes observed
}
transformed data {
// The maximum number of successes is the population minus observed failures
int<lower=0, upper=pop> max_a = pop-(ss-k1);
// The number of successes can't be less than what observed in the sample
int<lower=0, upper=ss> min_a = k1;
// Number of possible values a can take
int<lower=1, upper=pop> len_a = max_a-min_a+1;
}
parameters {
}
transformed parameters {
vector[pop+1] lp; // We add 1 to include zero successes
for(a in 0:pop){
if (a < min_a)
lp[a+1] = negative_infinity();
else if (a > max_a)
lp[a+1] = negative_infinity();
else
lp[a+1] = hypergeometric_lpmf(k1 | ss, a, pop-a) + log(1.0/len_a);
}
}
model {
target += log_sum_exp(lp);
}
generated quantities{
int i;
simplex[pop+1] tp;
tp = softmax(lp);
i = categorical_rng(tp);
}
In case it helps, I am adding the output of the model for pop=39, ss=30, k1=15
tp[1] is (I think) the posterior probability of a = 0
tp[15] is the posterior probability of a = 14
and so on…
Inference for Stan model: anon_model_269b3e5d623b621b4f88db6edb173cfd.
4 chains, each with iter=10000; warmup=0; thin=1;
post-warmup draws per chain=10000, total post-warmup draws=40000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
lp[1] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[2] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[3] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[4] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[5] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[6] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[7] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[8] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[9] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[10] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[11] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[12] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[13] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[14] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[15] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[16] -7.39 0.0 0.0 -7.39 -7.39 -7.39 -7.39 -7.39 2 1.0
lp[17] -5.6 1.3e-15 1.8e-15 -5.6 -5.6 -5.6 -5.6 -5.6 2 1.0
lp[18] -4.51 6.3e-16 8.9e-16 -4.51 -4.51 -4.51 -4.51 -4.51 2 1.0
lp[19] -3.87 3.1e-16 4.4e-16 -3.87 -3.87 -3.87 -3.87 -3.87 2 1.0
lp[20] -3.56 9.4e-16 1.3e-15 -3.56 -3.56 -3.56 -3.56 -3.56 2 1.0
lp[21] -3.56 9.4e-16 1.3e-15 -3.56 -3.56 -3.56 -3.56 -3.56 2 1.0
lp[22] -3.87 3.1e-16 4.4e-16 -3.87 -3.87 -3.87 -3.87 -3.87 2 1.0
lp[23] -4.51 6.3e-16 8.9e-16 -4.51 -4.51 -4.51 -4.51 -4.51 2 1.0
lp[24] -5.6 1.3e-15 1.8e-15 -5.6 -5.6 -5.6 -5.6 -5.6 2 1.0
lp[25] -7.39 0.0 0.0 -7.39 -7.39 -7.39 -7.39 -7.39 2 1.0
lp[26] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[27] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[28] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[29] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[30] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[31] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[32] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[33] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[34] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[35] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[36] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[37] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[38] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[39] -inf nan nan -inf -inf -inf -inf -inf nan nan
lp[40] -inf nan nan -inf -inf -inf -inf -inf nan nan
i 20.49 8.3e-3 1.67 17.0 19.0 20.0 22.0 24.0 40190 1.0
tp[1] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[2] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[3] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[4] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[5] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[6] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[7] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[8] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[9] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[10] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[11] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[12] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[13] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[14] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[15] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[16] 4.8e-3 1.2e-18 1.7e-18 4.8e-3 4.8e-3 4.8e-3 4.8e-3 4.8e-3 2 1.0
tp[17] 0.03 1.2e-17 1.7e-17 0.03 0.03 0.03 0.03 0.03 2 1.0
tp[18] 0.08 9.8e-18 1.4e-17 0.08 0.08 0.08 0.08 0.08 2 1.0
tp[19] 0.16 3.9e-17 5.6e-17 0.16 0.16 0.16 0.16 0.16 2 1.0
tp[20] 0.22 3.9e-17 5.6e-17 0.22 0.22 0.22 0.22 0.22 2 1.0
tp[21] 0.22 3.9e-17 5.6e-17 0.22 0.22 0.22 0.22 0.22 2 1.0
tp[22] 0.16 3.9e-17 5.6e-17 0.16 0.16 0.16 0.16 0.16 2 1.0
tp[23] 0.08 9.8e-18 1.4e-17 0.08 0.08 0.08 0.08 0.08 2 1.0
tp[24] 0.03 1.2e-17 1.7e-17 0.03 0.03 0.03 0.03 0.03 2 1.0
tp[25] 4.8e-3 1.2e-18 1.7e-18 4.8e-3 4.8e-3 4.8e-3 4.8e-3 4.8e-3 2 1.0
tp[26] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[27] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[28] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[29] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[30] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[31] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[32] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[33] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[34] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[35] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[36] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[37] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[38] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[39] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
tp[40] 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
lp__ 0.0 nan 0.0 0.0 0.0 0.0 0.0 0.0 nan nan
Samples were drawn using Fixed_param at Sun Aug 16 20:40:50 2020.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
Yes your approach is correct, and quite clever at that! Essentially what you are doing is marginalising over K under a uniform prior on the “interval” I = [k, N-(n-k)]. I made a couple changes to your program so you don’t have to loop over the whole array 0:N, since you have positive probability only over I.
Stan program:
data {
int<lower=1> pop; // Population
int<lower=1, upper=pop> ss; // Sample size
int<lower=0, upper=ss> k1; // Number of successes observed
}
transformed data {
// The maximum number of successes is the population minus observed failures
int<lower=0, upper=pop> max_a = pop-(ss-k1);
// The number of successes can't be less than what observed in the sample
int<lower=0, upper=ss> min_a = k1;
// Number of possible values a can take
int<lower=1, upper=pop> len_a = max_a-min_a+1;
int<lower=min_a, upper=max_a> marginalisation_array[len_a];
for(a in 1:len_a) marginalisation_array[a] = min_a + a-1;
}
parameters {
}
transformed parameters {
vector[len_a] lp;
for(a in 1:len_a){
lp[a] = hypergeometric_lpmf(k1 | ss, marginalisation_array[a], pop-marginalisation_array[a]) -log(len_a);
}
}
model {
target += log_sum_exp(lp);
}
generated quantities{
int i;
simplex[len_a] tp;
tp = softmax(lp);
i = categorical_rng(tp);
}