I am writing a Stan implementation of this factor analysis model (I call it Gao’s Model) and I would like to get some within-chain parallelism using reduce_sum. The basic model is Y = X \Lambda + \epsilon where
- Y is NxD observed data matrix
- X is NxK latent scores, X_{ij} \sim N(0,1)
- \Lambda is KxD loadings, with essentially horseshoe priors on its entries (which promote sparseness, providing rotational identifiability)
- \epsilon is NxD noise matrix, where each observation’s noise vector is IID N(0, diag(\psi_1, \cdots, \psi_D)).
An important feature for the performance of this model is that K is typically small while N and D are large, so the small K may dictate what kind of containers to use, whether to parallelize over K or N or D, etc.
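Spelled out to match the Stan code below (Gamma is in Stan’s shape–rate parameterization, and \Theta and \psi enter the normal calls as scales, i.e. standard deviations), the hierarchy is:

$$
\begin{aligned}
Y_{ij} \mid X, \Lambda, \psi &\sim N\big((X\Lambda)_{ij},\, \psi_j^2\big), &
X_{ik} &\sim N(0, 1),\\
\Lambda_{kj} &\sim N(0, \Theta_{kj}^2), &
\Theta_{kj} &\sim \text{Gamma}(a, \Delta_{kj}),\\
\Delta_{kj} &\sim \text{Gamma}(b, \phi_k), &
\phi_k &\sim \text{Gamma}(c, \tau_k),\\
\tau_k &\sim \text{Gamma}(d, \eta), &
\eta &\sim \text{Gamma}(e, \gamma),\\
\gamma &\sim \text{Gamma}(f, \nu), &
p(\psi_j) &\propto \psi_j^{-1/2},
\end{aligned}
$$

with a = b = c = d = e = f = 0.5 and \nu = 1 fixed in transformed data.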
I’ve tried several ways to incorporate reduce_sum here, and I’m profiling them on a 2017 MacBook Pro with 2 physical cores, so I’m running a single chain with 2 threads per chain and watching CPU usage and sampling iterations per second.
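Since CPU% and it/s are fairly blunt, one thing I may also try is wrapping the main pieces of the model block in profile statements (available in recent Stan releases, 2.26+ if I remember correctly) so I can see how much of the gradient time the Y likelihood accounts for versus the priors. A rough sketch, using the same variable names as the models below:

model {
  // ... priors and hyperpriors as in the models below ...
  profile("Y_likelihood") {
    for (i in 1:N) {
      Y[i] ~ normal(to_row_vector(X[i]) * Lambda, psi);
    }
  }
  profile("X_prior") {
    target += reduce_sum(partial_sum_x_lpdf, to_array_1d(X), grainsize);
  }
  target += -0.5 * sum(log(psi));
}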
Here are some of my models:
Serial Y, parallel flat X: About 7 it/s, about 120% CPU usage.
functions {
  real partial_sum_x_lpdf(array[] real x_slice,
                          int start,
                          int end) {
    return normal_lupdf(x_slice | 0, 1);
  }
}
data {
  int<lower=1> N;               // num observations
  int<lower=1> D;               // num features
  array[N, D] real Y;           // observed data
  int<lower=1> K;               // max number of latent dimensions
  int<lower=1> grainsize;
}
transformed data {
  real<lower=0> a = 0.5;
  real<lower=0> b = 0.5;
  real<lower=0> c = 0.5;
  real<lower=0> d = 0.5;
  real<lower=0> e_ = 0.5;
  real<lower=0> f = 0.5;
  real<lower=0> nu = 1.0;
}
parameters {
  array[N, K] real X;           // latent scores
  matrix[K, D] Lambda;          // needs to be a matrix for matrix multiplication
  matrix<lower=0>[K, D] Theta;
  matrix<lower=0>[K, D] Delta;
  vector<lower=0>[K] phi;
  vector<lower=0>[K] tau;
  real<lower=0> eta;
  real<lower=0> gam;
  vector<lower=0>[D] psi;
}
model {
  // Priors and hyperpriors
  for (k in 1:K) {
    Lambda[k] ~ normal(0.0, Theta[k]);  // i.e. Lambda[k,j] ~ N(0, Theta[k,j])
    Theta[k] ~ gamma(a, Delta[k]);      // i.e. Theta[k,j] ~ G(a, Delta[k,j])
    Delta[k] ~ gamma(b, phi[k]);        // i.e. Delta[k,j] ~ G(b, phi[k])
  }
  phi ~ gamma(c, tau);                  // i.e. phi[k] ~ G(c, tau[k])
  tau ~ gamma(d, eta);                  // i.e. tau[k] ~ G(d, eta)
  eta ~ gamma(e_, gam);
  gam ~ gamma(f, nu);
  // Likelihood
  for (i in 1:N) {
    Y[i] ~ normal(to_row_vector(X[i]) * Lambda, psi);
  }
  target += reduce_sum(partial_sum_x_lpdf, to_array_1d(X), grainsize);
  // Hard-code Jeffreys' prior on psi_1, ..., psi_D
  target += -0.5 * sum(log(psi));
}
Same model as above, but with 2D arrays instead of matrices where possible: About 6.5 it/s, CPU usage about 120% (so, slower with the same CPU usage).
functions {
  real partial_sum_x_lpdf(array[] real x_slice,
                          int start,
                          int end) {
    return normal_lupdf(x_slice | 0, 1);
  }
}
data {
  int<lower=1> N;               // num observations
  int<lower=1> D;               // num features
  array[N, D] real Y;           // observed data
  int<lower=1> K;               // max number of latent dimensions
  int<lower=1> grainsize;
}
transformed data {
  real<lower=0> a = 0.5;
  real<lower=0> b = 0.5;
  real<lower=0> c = 0.5;
  real<lower=0> d = 0.5;
  real<lower=0> e_ = 0.5;
  real<lower=0> f = 0.5;
  real<lower=0> nu = 1.0;
}
parameters {
  array[N, K] real X;           // latent scores
  matrix[K, D] Lambda;          // needs to be a matrix for matrix multiplication
  array[K, D] real<lower=0> Theta;
  array[K, D] real<lower=0> Delta;
  vector<lower=0>[K] phi;
  vector<lower=0>[K] tau;
  real<lower=0> eta;
  real<lower=0> gam;
  vector<lower=0>[D] psi;
}
model {
  // Priors and hyperpriors
  array[K, D] real LambdaArr = to_array_2d(Lambda);  // convert to array to make row access faster
  for (k in 1:K) {
    LambdaArr[k] ~ normal(0.0, Theta[k]);  // i.e. Lambda[k,j] ~ N(0, Theta[k,j])
    Theta[k] ~ gamma(a, Delta[k]);         // i.e. Theta[k,j] ~ G(a, Delta[k,j])
    Delta[k] ~ gamma(b, phi[k]);           // i.e. Delta[k,j] ~ G(b, phi[k])
  }
  phi ~ gamma(c, tau);                     // i.e. phi[k] ~ G(c, tau[k])
  tau ~ gamma(d, eta);                     // i.e. tau[k] ~ G(d, eta)
  eta ~ gamma(e_, gam);
  gam ~ gamma(f, nu);
  // Likelihood
  for (i in 1:N) {
    Y[i] ~ normal(to_row_vector(X[i]) * Lambda, psi);
  }
  target += reduce_sum(partial_sum_x_lpdf, to_array_1d(X), grainsize);
  // Hard-code Jeffreys' prior on psi_1, ..., psi_D
  target += -0.5 * sum(log(psi));
}
Same as above, but with reduce_sum on the Lambda, Theta, and Delta priors: About 5.8 it/s, CPU usage about 140% (so reduce_sum is not helping).
functions {
  real partial_sum_Lambda_lpdf(array[] real Lam_k_slice,
                               int start,
                               int end,
                               array[] real Theta_k) {
    return normal_lpdf(Lam_k_slice | 0, Theta_k[start:end]);
  }
  real partial_sum_Theta_lpdf(array[] real Theta_k_slice,
                              int start,
                              int end,
                              real shape_param,
                              array[] real Delta_k) {
    return gamma_lpdf(Theta_k_slice | shape_param, Delta_k[start:end]);
  }
  real partial_sum_Delta_lpdf(array[] real Delta_k_slice,
                              int start,
                              int end,
                              real shape_param,
                              real rate_param) {
    return gamma_lpdf(Delta_k_slice | shape_param, rate_param);
  }
  real partial_sum_x_lpdf(array[] real x_slice,
                          int start,
                          int end) {
    return normal_lupdf(x_slice | 0, 1);
  }
  real partial_sum_y_lpdf(array[] real y_slice,
                          int start,
                          int end,
                          row_vector X_i_Lambda,
                          vector psi) {
    return normal_lpdf(y_slice | X_i_Lambda[start:end], psi[start:end]);
  }
}
data {
  int<lower=1> N;               // num observations
  int<lower=1> D;               // num features
  array[N, D] real Y;
  int<lower=1> K;               // max number of latent dimensions
  int<lower=1> grainsize;
}
transformed data {
  real<lower=0> a = 0.5;
  real<lower=0> b = 0.5;
  real<lower=0> c = 0.5;
  real<lower=0> d = 0.5;
  real<lower=0> e_ = 0.5;
  real<lower=0> f = 0.5;
  real<lower=0> nu = 1.0;
}
parameters {
  array[N, K] real X;           // latent scores
  matrix[K, D] Lambda;          // needs to be a matrix for matrix multiplication
  array[K, D] real<lower=0> Theta;
  array[K, D] real<lower=0> Delta;
  vector<lower=0>[K] phi;
  vector<lower=0>[K] tau;
  real<lower=0> eta;
  real<lower=0> gam;
  vector<lower=0>[D] psi;
}
model {
  // Priors and hyperpriors
  array[K, D] real LambdaArr = to_array_2d(Lambda);
  for (k in 1:K) {
    target += reduce_sum(partial_sum_Lambda_lpdf, LambdaArr[k], grainsize, Theta[k]);
    target += reduce_sum(partial_sum_Theta_lpdf, Theta[k], grainsize, a, Delta[k]);
    target += reduce_sum(partial_sum_Delta_lpdf, Delta[k], grainsize, b, phi[k]);
  }
  phi ~ gamma(c, tau);                  // i.e. phi[k] ~ G(c, tau[k])
  tau ~ gamma(d, eta);                  // i.e. tau[k] ~ G(d, eta)
  eta ~ gamma(e_, gam);
  gam ~ gamma(f, nu);
  // Likelihood for Y
  for (i in 1:N) {
    Y[i] ~ normal(to_row_vector(X[i]) * Lambda, psi);
  }
  // Likelihood for X
  target += reduce_sum(partial_sum_x_lpdf, to_array_1d(X), grainsize);
  // Hard-code Jeffreys' prior on psi_1, ..., psi_D
  target += -0.5 * sum(log(psi));
}
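(An alternative I haven’t benchmarked for the prior block: skip reduce_sum there entirely and flatten the K x D priors into single vectorized statements. With Theta and Delta declared as matrices, as in the first model, something like the following should pair entries correctly, since to_vector flattens matrices column-major and rep_matrix(phi, D) is just a K x D matrix whose [k, j] entry is phi[k].)

  // vectorize the K*D prior terms in one statement each (Theta, Delta as matrices)
  to_vector(Lambda) ~ normal(0, to_vector(Theta));              // Lambda[k,j] ~ N(0, Theta[k,j])
  to_vector(Theta) ~ gamma(a, to_vector(Delta));                // Theta[k,j] ~ G(a, Delta[k,j])
  to_vector(Delta) ~ gamma(b, to_vector(rep_matrix(phi, D)));   // Delta[k,j] ~ G(b, phi[k])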
Parallel within each row of Y, parallel on flattened X: About 1.25 it/s, CPU usage about 180% (so, much worse).
functions {
  real partial_sum_x_lpdf(array[] real x_slice,
                          int start,
                          int end) {
    return normal_lupdf(x_slice | 0, 1);
  }
  real partial_sum_y_lpdf(array[] real y_slice,
                          int start,
                          int end,
                          row_vector X_i_Lambda,
                          vector psi) {
    return normal_lpdf(y_slice | X_i_Lambda[start:end], psi[start:end]);
  }
}
data {
  int<lower=1> N;               // num observations
  int<lower=1> D;               // num features
  array[N, D] real Y;
  int<lower=1> K;               // max number of latent dimensions
  int<lower=1> grainsize;
}
transformed data {
  real<lower=0> a = 0.5;
  real<lower=0> b = 0.5;
  real<lower=0> c = 0.5;
  real<lower=0> d = 0.5;
  real<lower=0> e_ = 0.5;
  real<lower=0> f = 0.5;
  real<lower=0> nu = 1.0;
}
parameters {
  array[N, K] real X;           // latent scores
  matrix[K, D] Lambda;          // needs to be a matrix for matrix multiplication
  matrix<lower=0>[K, D] Theta;
  matrix<lower=0>[K, D] Delta;
  vector<lower=0>[K] phi;
  vector<lower=0>[K] tau;
  real<lower=0> eta;
  real<lower=0> gam;
  vector<lower=0>[D] Psi;
}
model {
  // Priors and hyperpriors
  for (k in 1:K) {
    // target += reduce_sum(partial_sum_Lambda, Lambda[k], grainsize, Theta[k])
    Lambda[k] ~ normal(0.0, Theta[k]);  // i.e. Lambda[k,j] ~ N(0, Theta[k,j])
    // target += reduce_sum(partial_sum_Theta, Theta[k], grainsize, a, Delta[k])
    Theta[k] ~ gamma(a, Delta[k]);      // i.e. Theta[k,j] ~ G(a, Delta[k,j])
    Delta[k] ~ gamma(b, phi[k]);        // i.e. Delta[k,j] ~ G(b, phi[k])
  }
  phi ~ gamma(c, tau);                  // i.e. phi[k] ~ G(c, tau[k])
  tau ~ gamma(d, eta);                  // i.e. tau[k] ~ G(d, eta)
  eta ~ gamma(e_, gam);
  gam ~ gamma(f, nu);
  // Likelihood
  for (i in 1:N) {
    row_vector[D] X_i_Lambda = to_row_vector(X[i]) * Lambda;
    target += reduce_sum(partial_sum_y_lpdf, Y[i], grainsize, X_i_Lambda, Psi);
  }
  target += reduce_sum(partial_sum_x_lpdf, to_array_1d(X), grainsize);
  // Hard-code Jeffreys' prior on psi_1, ..., psi_D
  target += -0.5 * sum(log(Psi));
}
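One more variant I have sketched but not benchmarked yet: slicing Y over its rows (i.e. over N, which is large) and moving the whole per-row likelihood, plus the matching rows of the X prior, inside the partial sum, so that each reduce_sum task does a large chunk of work rather than just the cheap X prior. Rough, untested sketch (data, transformed data, and parameter declarations exactly as in the first model; partial_sum_rows_lpdf is just my name for it):

functions {
  real partial_sum_rows_lpdf(array[,] real Y_slice,
                             int start,
                             int end,
                             array[,] real X,
                             matrix Lambda,
                             vector psi) {
    real lp = 0;
    for (i in 1:(end - start + 1)) {
      int n = start + i - 1;                     // row index into the full data
      lp += normal_lupdf(Y_slice[i] | to_row_vector(X[n]) * Lambda, psi);
      lp += normal_lupdf(X[n] | 0, 1);           // prior for the matching latent scores
    }
    return lp;
  }
}
model {
  // priors and hyperpriors on Lambda, Theta, Delta, phi, tau, eta, gam as above
  target += reduce_sum(partial_sum_rows_lpdf, Y, grainsize, X, Lambda, psi);
  // Hard-code Jeffreys' prior on psi_1, ..., psi_D
  target += -0.5 * sum(log(psi));
}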
I’m able to get the reduce_sum tutorial to work, but I’m not getting much benefit from reduce_sum here. I think the fact that most of my values live in 2D containers makes it hard for me to see how to best take advantage of reduce_sum, vectorization, locality, etc. Once I have a good parallelization scheme I’d like to scale up to a cluster, but first I want to get at least decent 2-core parallelism on my laptop. Any help appreciated!