Improving runtime & reduce_sum for a hierarchical model with a large number of groups

Hi y’all! I have a hierarchical regression model with a decent amount of data (n = 150,000), two categorical variables (x2, x3), and one continuous variable (x1). One grouping consists of around 10 categories (x2) and the second consists of around 2,500 categories (x3). Running this model for 2,000 iterations takes between 8 and 10 hours. I have attempted to use the reduce_sum function, but 1) I have not been able to get it implemented, and 2) I am unsure whether it will actually result in a faster runtime after reading this thread: reduce_sum_thread. I have been getting the error:

Error in stanc(file = file, model_code = model_code, model_name = model_name, : 0 Semantic error in 'string', line 22, column 21 to column 35: Index must be of type int or int[] or must be a range. Instead found type vector.

I have provided a reproducible example below (with a smaller simulated dataset). I would appreciate any and all suggestions for speeding up this model. Thank you!

This is the Stan code; the commented-out section in the model block is the original version before I attempted to implement reduce_sum.


functions {
  matrix scale_r_cor(matrix z, vector SD, matrix L) {
    return transpose(diag_pre_multiply(SD, L) * z);
  }
  
  
   real partial_sum(real[]  y_slice,
                    int start, int end,
                    vector r_1_1,
                    vector r_1_2,
                    vector r_2_1,
                    vector Z_1_1,
                    vector Z_1_2,
                    vector Z_2_1,
                    vector J_1,
                    vector J_2,
                    real sigma
                    
                  ){
        
        return normal_lpdf(y_slice | 
               r_1_1[J_1[start:end]] * Z_1_1[start:end] + 
               r_1_2[J_1[start:end]] * Z_1_2[start:end] + 
               r_2_1[J_2[start:end]] * Z_2_1[start:end], sigma);
        
                  }
}


data {
  // Overall Data
  int<lower=1> N;      // total number of observations
  vector[N] Y;         // response variable
  
  // data for group-level effects (Variable 1 & 2)
  int<lower=1> N_1;    // number of grouping levels
  int<lower=1> M_1;    // number of coefficients per level
  int<lower=1> J_1[N]; // grouping indicator per observation
  vector[N] Z_1_1;     // Intercept Group Level
  vector[N] Z_1_2;     // Variable 1 Group Level

  // data for group-level effects (Variable 3)
  int<lower=1> N_2;    // number of grouping levels
  int<lower=1> M_2;    // number of coefficients per level
  int<lower=1> J_2[N]; // grouping indicator per observation
  vector[N] Z_2_1;     // Intercept
  
  // Prior Information 
  vector[N_2] mu_prior;         // Prior Means for Variable 3
  real<lower=0> sigma_prior; // Prior SD for Variable 3 effects
  
  }


parameters {
  
  real Intercept;               // temporary intercept for centered predictors
  real<lower=0> sigma;          // dispersion parameter
  
  vector<lower=0>[M_1] sd_1;    // group-level standard deviations
  matrix[M_1, N_1] z_1;         // standardized group-level effects
  cholesky_factor_corr[M_1] L_1;// cholesky factor of correlation matrix
  
  vector<lower=0>[M_2] sd_2;    // group-level standard deviations
  vector[N_2] z_2[M_2];         // standardized group-level effects
  
}



transformed parameters {
  matrix[N_1, M_1] r_1;  // group-level effects
  vector[N_1] r_1_1;     // intercept effects
  vector[N_1] r_1_2;     // actual variable 1 group-level effects

  vector[N_2] r_2_1;     //  actual variable 3 group-level effects
  
  real lprior = 0;       // prior contributions to the log posterior
  
  // Computing group-level effects
  r_1 = scale_r_cor(z_1, sd_1, L_1);
  r_1_1 = r_1[, 1];
  r_1_2 = r_1[, 2];

  r_2_1 = (sd_2[1] * (z_2[1]));
  
  // Prior specifications for sigmas and intercepts
  lprior += student_t_lpdf(Intercept | 3, -0.2, 2.5); 
  
  lprior += student_t_lpdf(sigma | 3, 0, 2.5)
    - 1 * student_t_lccdf(0 | 3, 0, 2.5);
    
  lprior += student_t_lpdf(sd_1 | 3, 0, 2.5)
    - 3 * student_t_lccdf(0 | 3, 0, 2.5);
    
  lprior += lkj_corr_cholesky_lpdf(L_1 | 1);
  
  lprior += student_t_lpdf(sd_2 | 3, 0, 2.5)
    - 1 * student_t_lccdf(0 | 3, 0, 2.5);

}


model {
  
    // Initialize linear predictor term
    vector[N] mu = rep_vector(0.0, N);
    mu += Intercept;
  //  for (n in 1:N) {
  //    // add more terms to the linear predictor
  //    mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_1_2[J_1[n]] * Z_1_2[n] + r_2_1[J_2[n]] * Z_2_1[n];
  //  }
  //  target += normal_lpdf(Y | mu, sigma);
  
    int grainsize = 1;
    target += reduce_sum(partial_sum, Y,
                     grainsize, N,
                     r_1_1,
                     r_1_2,
                     r_2_1,
                     Z_1_1,
                     Z_1_2,
                     Z_2_1,
                     J_1,
                     J_2,
                     sigma
                     );
  
  
  // Prior Specification
  target += lprior;
  target += std_normal_lpdf(to_vector(z_1));
  target += normal_lpdf(z_2[1] | mu_prior, sigma_prior); // Setting Random Effect Priors
  
}


R code that simulates hierarchical data:

#--- load packages (needed for %>%, left_join, mutate, and stan)
library(dplyr)
library(rstan)

#--- set seed
set.seed(24)
#--- sample size
n = 10000 

#--- Getting coefficient, variance, and error for group 1 (k = 10)
true_beta2 = data.frame(
  beta2 = seq(from = -2,to = 2, length.out = 10), 
  var2 = seq(1,10,1),
  error2 = rnorm(10,0,.5)
  )
#--- Getting coefficient, variance, and error for group 2 (k = 200)
true_beta3 = data.frame(
  beta3 = seq(from = -3,to = 3, length.out = 200), 
  var3 = seq(1,200,1),
  error3 = rnorm(200, 0,.5)
  )

df <- data.frame(
      var1 = rnorm(n,0,1), # Variable 1 (continuous)
      var2 = sample(seq(1,10,1), n, replace = TRUE), # Variable 2 (categorical group 1)
      var3 = sample(seq(1,200,1), n, replace = TRUE), # Variable 3 (categorical group 2)
      error = rnorm(n,0,.5)
      )%>%
  left_join(
    true_beta2, 
    by = "var2"
    )%>%
  left_join(
    true_beta3, 
    by = "var3"
    )%>%
  mutate(
    y = var1*.5 + 
        var2*beta2 +
        var3*beta3 + 
        error + 
        error2 + 
        error3
  )
#--- Setting Priors
df_prior <- data.frame(
   beta3 = true_beta3$beta3, # keep the same order as var3 = 1, ..., 200 so mu_prior lines up with J_2
    error_prior = rnorm(200,0,.5)
  )%>%
  mutate(
    prior = beta3 + error_prior
  )


#--- Defining Stan Data
stan_data = list(
  
  #--- Overall Data
  N = dim(df)[1], # number of observations
  Y = df$y,       # response variable
  
  #--- Group Data
  N_1 = length(unique(df$var2)),# number of grouping levels for Variable 2
  M_1 = 2,                      # number of coefficients per level (Intercept + Variable 1)
  J_1 = df$var2,                # Variable 2 (Categorical)
  Z_1_1 = rep(1, dim(df)[1]),   # intercept
  Z_1_2 = df$var1,              # Variable 1 (Continuous)

  #--- Variable 3
  N_2 = length(unique(df$var3)),# number of grouping levels
  M_2 = 1,                      # number of coefficients per level
  J_2 = df$var3,                # grouping indicator per observation
  Z_2_1 = rep(1, dim(df)[1]),   # Specifying Intercept
  
  #--- Setting Priors for Var 3 
  #------ Defining Means
  mu_prior = df_prior$prior, #Prior means for coef of var 3
  #------ Defining SDs
  sigma_prior = .5 # prior SD for coef of var 3
  
)


# *Train Model --------------------------------------------------------------
stan_example_fit <- stan(
  file = "HierarchicalReduceSum.stan", # Stan program
  data = stan_data,        # named list of data
  iter = 1500, # number of total iterations per chain
  warmup = 100, # number of warm-up iterations per chain
  chains = 1, # number of Markov chains
  seed = 24 # setting seed for reproducibility
)

See above; the rest looks good (I did not run it). Please use a grainsize bigger than 1. As written, the grainsize recommended to the TBB scheduler is just 1, so the work is never chunked into larger pieces. As a starting point, I’d recommend setting it to (number of data rows) / (2 * number of cores).
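For example, assuming (my assumption, not stated above) a machine with 8 physical cores, the full n = 150,000 gives roughly 150,000 / (2 * 8) ≈ 9,400. You can also pass grainsize in as data rather than hard-coding it, which makes it easy to experiment without recompiling; a minimal sketch of just that piece (the rest of the data block stays the same):

data {
  // ... existing declarations ...
  int<lower=1> grainsize; // e.g. N / (2 * number of cores), computed on the R side
}

The `int grainsize = 1;` line in the model block then goes away and the value from the data block is used in the reduce_sum call instead.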

Be sure to take a look at the sufficient statistics trick to speed up the likelihood computation within each group. Video here (start at 44:20) and code here.

Thank you so much for the help! I removed the N and increased the grainsize, but I am unfortunately still getting the same error:

Semantic error in 'string', line 22, column 21 to column 35:

Index must be of type int or int[] or must be a range. Instead found type vector.

Could it be that I am specifying the argument types for the partial_sum function incorrectly?

The video and corresponding code are extremely helpful, thank you so much! I will definitely be working on integrating the sufficient statistics.


You need to make sure the argument types match. Right now Y is a vector, while in the partial_sum function it’s declared as a real array. Those don’t fit together.
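To expand on that: the specific line-22 error comes from J_1 and J_2 being declared as vectors inside partial_sum, since anything used as an index has to be an int or an array of ints. Also, if I remember the docs correctly, the argument that reduce_sum slices over has to be an array rather than a vector, so one option is to convert Y once with to_array_1d(). Here is an untested sketch of how the pieces could fit together (I also folded in the Intercept term from the commented-out loop so the threaded likelihood matches the original, and switched * to the elementwise .*); the rest of the program stays as it was:

functions {
  real partial_sum(real[] y_slice,
                   int start, int end,
                   real Intercept,
                   vector r_1_1, vector r_1_2, vector r_2_1,
                   vector Z_1_1, vector Z_1_2, vector Z_2_1,
                   int[] J_1, int[] J_2, // int arrays so they can be used as indices
                   real sigma) {
    return normal_lpdf(y_slice |
             Intercept
             + r_1_1[J_1[start:end]] .* Z_1_1[start:end]
             + r_1_2[J_1[start:end]] .* Z_1_2[start:end]
             + r_2_1[J_2[start:end]] .* Z_2_1[start:end],
             sigma);
  }
}

transformed data {
  real Y_arr[N] = to_array_1d(Y); // reduce_sum slices arrays, so convert the response vector once
}

model {
  int grainsize = 1000; // better: pass this in as data and set it to N / (2 * cores)
  target += reduce_sum(partial_sum, Y_arr, grainsize,
                       Intercept,
                       r_1_1, r_1_2, r_2_1,
                       Z_1_1, Z_1_2, Z_2_1,
                       J_1, J_2,
                       sigma);
  // priors (target += lprior; etc.) as before
}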

Hi @Rose, if you have a normal likelihood and normal priors on regression coefficients, you can speed things up considerably using something like this – GitHub - pgree/fastNoNo: Fast Normal-Normal models. The idea behind these algorithms is closely related to @mike-lawrence’s comment about sufficient statistics. The package currently only supports a couple of normal-normal models, but with a little customization those methods can probably address your model (if it’s indeed normal-normal).


I had an additional follow-up question about the sufficient statistics. I spent some time going through the code for the sufficient statistics for multiple regression, and at the moment it seems the framework is set up to handle multiple categorical or discrete variables (I believe the example you gave was predicting height from time of day). Is there a way to accommodate an additional continuous variable? Or are we then unable to split the data into unique groups to get sufficient statistics?
Here is a simple simulation I was hoping to test with the Stan code you provided from your talk.

x1 = rnorm(100,0,1)
x2 = sample(c(1,2,3),100, replace = TRUE)
x3 = sample(c(1,2,3,4), 100, replace = TRUE)
y = 2*x1 + .5*x2 + 1*x3 + rnorm(100,0,.1)

functions{
    real normal_sufficient_lpdf(
        matrix obs
        , vector mean
        , vector variance
    ){
        return(
            normal_lpdf(
                obs[,1]
                | mean
                , sqrt(variance) ./ obs[,3]
            )
            + gamma_lpdf(
                obs[,2]
                | obs[,4]
                , obs[,4] ./ variance
            )
        ) ;
    }

}
data {
    // n_x_cols: number of predictor-variables in x (as columns)
    int<lower=1> n_x_cols ;
    // n_x_rows: number of combinations (rows) of the predictor variables in x
    int<lower=1> n_x_rows ;
    // x: predictor matrix
    matrix[n_x_rows,n_x_cols] x ;
    // obs_mean: mean of observations for each row in x
    vector[n_x_rows] obs_mean ;
    // obs_var: variance of observations for each row in x
    vector<lower=0>[n_x_rows] obs_var ;
    // obs_n: count of observations for each row in x
    vector<lower=2>[n_x_rows] obs_n ;
}
transformed data{
    matrix[n_x_rows,4] obs ;
    matrix[n_x_cols,n_x_rows] x_transpose ; // so we can use `columns_dot_product()`, which is faster than `rows_dot_product()`
    obs[,1] = obs_mean ;
    obs[,2] = obs_var ;
    obs[,3] = sqrt(obs_n) ;
    obs[,4] = (obs_n-1.0)/2.0 ;
    x_transpose = transpose(x) ; // fill the transposed predictor matrix used in the model block
}
parameters {
    // mu_intercept: value for the mean at the intercept
    real mu_intercept ;
    // mu_beta: effect of each predictor on the mean
    vector[n_x_cols] mu_beta ;
    // eta_intercept: value for the log-variance at the intercept
    real eta_intercept ;
    // eta_beta: effect of each predictor on the log-variance
    vector[n_x_cols] eta_beta ;
}
model {
    // priors (adjust when using real data!!)
    mu_intercept ~ std_normal() ;
    mu_beta ~ std_normal() ;
    eta_intercept ~ std_normal() ;
    eta_beta ~ std_normal() ;
    // likelihood
    obs ~ normal_sufficient(
        (
            mu_intercept
            + transpose(
                columns_dot_product(
                    rep_matrix(mu_beta,n_x_rows)
                    , x_transpose
                )
            )
        )
        , exp(
            eta_intercept
            + transpose(
                columns_dot_product(
                    rep_matrix(eta_beta,n_x_rows)
                    , x_transpose
                )
            )
        )
    ) ;
}

The latter: multiple observations have to have the same values for all predictors in order to be collapsed into sufficient statistics. With a continuous predictor like x1, two rows will rarely share exactly the same predictor values, so there is typically nothing to collapse.