Mixture of counts with millions of observations

I have a dataset that I would like to fit a mixture model to, but there are several million observations. The counts only span about 100 numbers, so each count is observed many times. For example, there are 5,000 observations with count 1, 10,000 with count 2, …, 7,000 with count 100. Instead of passing a vector of length 10 million to stan, is it possible to use the summary statistics for a mixture model, e.g. a mixture model on the histogram instead of the data used to create the histogram?

If you can write out your model here, it should be really easy to translate things to the “compressed” format.

1 Like

Thanks @maxbiostat. I simulated some data and included the code that I am starting from. I would like to be able to pass something like table(y) to stan, but not sure how to go about setting up the mixture model for this.

library(rstan)

N <- 5000
lambda <- c(5, 15) # poisson parameters
theta <- c(0.3, 0.7) # mixing proportions

set.seed(1)
z <- sample(1:2, N, replace=TRUE, prob=theta)
y <- rpois(N, lambda[z])

table(y)

stancode <- "
data {
    // data
    int N; // number observations
    int y[N]; // observations
}

parameters {
    ordered[2] lambda;
    simplex[2] theta;
}

model {
    real contributions[2];

    // prior
    lambda ~ exponential(0.1);
    theta ~ dirichlet(rep_vector(0.5, 2));

    for (n in 1:N) {
        for (k in 1:2) {
            contributions[k] = log(theta[k]) + poisson_lpmf(y[n] | lambda[k]);
        }

        target += log_sum_exp(contributions);
    }
}
"

model <- stan_model(model_code=stancode,
                    model_name="mixture")

stan_data <- list(N=N,
                  y=y)

results <- sampling(model,
                    data=stan_data,
                    chains=4,
                    cores=4)

if you store the data as observed values and counts, you could simply multiply the likelihood for each observed value by the corresponding count. It gives equivalent resuits and seems faster.

stancode_summary <- "
data {
    // data
    int N; // number unique observed values
    int y[N]; // observed values
    int count[N]; // counts
}

parameters {
    ordered[2] lambda;
    simplex[2] theta;
}

model {
    real contributions[2];

    // prior
    lambda ~ exponential(0.1);
    theta ~ dirichlet(rep_vector(0.5, 2));

    for (n in 1:N) {
        for (k in 1:2) {
            contributions[k] = log(theta[k]) + poisson_lpmf(y[n] | lambda[k]);
        }

        target += log_sum_exp(contributions) * count[n];
    }
}
"

model_summary <- stan_model(model_code=stancode_summary,
                    model_name="mixture_summary")

library(dplyr)
data_summary <- data.frame( y = y) %>% group_by(y) %>% summarise( count = n() )  

stan_data_summary <- list( N = nrow(data_summary),
                  y=data_summary$y,
                  count = data_summary$count)

results_summary <- sampling(model_summary,
                    data=stan_data_summary,
                    chains=4,
                    cores=4)

3 Likes

See if this works for you:

library(rstan)

N <- 5000
lambda <- c(5, 15) # poisson parameters
theta <- c(0.3, 0.7) # mixing proportions

set.seed(1)
z <- sample(1:2, N, replace=TRUE, prob=theta)
y <- rpois(N, lambda[z])

table(y)

compress_data <- function(x){
  raw <- table(x)
  vv <- as.numeric(names(raw))
  ns <- as.numeric(raw)
  return(list(v = vv, fs = ns, K = length(vv)))
}


stancode <- "
data {
    // data
    int K; // number unique observations
    int f[K]; // frequencies
    int v[K]; // unique observations
}

parameters {
    ordered[2] lambda;
    simplex[2] theta;
}

model {
    real contributions[2];

    // prior
    lambda ~ exponential(0.1);
    theta ~ dirichlet(rep_vector(0.5, 2));

    for (k in 1:K) {
        for (j in 1:2) {
            contributions[j] = log(theta[j]) + poisson_lpmf(v[k] | lambda[j]);
        }

        target += f[k] * log_sum_exp(contributions);
    }
}
"

model <- stan_model(model_code=stancode,
                    model_name="mixture")
comp.data <- compress_data(y)

stan_data <- list(K = comp.data$K,
                  f = comp.data$fs,
                  v = comp.data$v)

results <- sampling(model,
                    data=stan_data,
                    chains=4,
                    cores=4)

A quick and dirty test showed results were the same as for the “uncompressed” version, at a fraction of the time.

1 Like

Thank you both for your help!