Hi!

In the course of the Stan parallel design RFC it was requested to share how the proposed parallel building blocks can be used in the Stan language by the user. This bit has been left out from the RFC since we do not want to introduce higher-level functions which we would need for this. Thus, I opted to go ahead anyways and assume that we will be able to adapt a Stan 3 parser easily to what I am suggesting in the RFC. In order to facilitate continued review I am sharing here a fully working Stan program which brings the adapter code in the form of calling a user-defined C++ function.

The example is a simulated hierarchical Poisson problem:

```
functions {
// Partial log-likelihood over the flat data-index range [start, end].
// Runs reduce over the index range start to end. Mapping from
// data-index set to group indices pre-stored in gidx.
real hierarchical_reduce(int start, int end, int[] y, vector log_lambda_group, int[] gidx) {
return poisson_log_lpmf(y[start:end]| log_lambda_group[gidx[start:end]]);
}
// Function defined in C++ (see user_header.hpp) which calls
// parallel_reduce_sum with a lambda functor that simply binds the
// arguments passed into it.
real parallel_hierarchical_reduce(int[] y, vector log_lambda_group, int[] gidx, int grainsize);
// map_rect alternative: one shard per group; log_lambda carries the
// group's log rate in its single element (beta and xr are unused).
vector mr_reduce(vector beta, vector log_lambda, real[] xr, int[] xi) {
real lp = poisson_log_lpmf(xi| log_lambda[1]);
return [lp]';
}
}
data {
int<lower=0> N;                // observations per group
int<lower=0> G;                // number of groups
int<lower=1> grainsize;        // TBB grainsize forwarded to parallel_reduce_sum
int<lower=0,upper=1> use_tbb;  // 1 => use the parallel TBB path
int<lower=0,upper=1> use_mr;   // 1 => use map_rect (only when use_tbb == 0)
}
transformed data {
// Ground-truth hyper-parameters used to simulate the data; compared
// against the fitted values in generated quantities.
real true_log_lambda = log(5.0);
real true_tau = log(10)/1.96;  // so exp(1.96 * true_tau) = 10
int y[N*G];       // flattened observations, group-major
int yg[G,N];      // same observations reshaped per group (map_rect shards)
real xr[G,0];     // empty real data per shard (map_rect signature requires it)
int gidx[N*G];    // group index of each flattened observation
int start[G];     // first flat index of each group
int end[G];       // last flat index of each group
vector[0] theta_dummy;  // empty shared-parameter vector for map_rect
print("Simulating problem size: G = ", G, "; N = ", N);
if (use_tbb) {
print("Using parallel TBB with grainsize = ", grainsize);
} else {
if(use_mr) {
print("Using map_rect code.");
} else {
print("Using serial code only.");
}
}
// Simulate: draw each group's rate lognormally, then Poisson counts.
for(g in 1:G) {
real lambda_group = lognormal_rng(true_log_lambda, true_tau);
int elem = 1;
start[g] = (g-1) * N + 1;
end[g] = g * N;
for(i in start[g]:end[g]) {
y[i] = poisson_rng(lambda_group);
yg[g,elem] = y[i];
gidx[i] = g;
elem += 1;
}
}
}
parameters {
real log_lambda;    // population mean of the group log-rates
real<lower=0> tau;  // between-group scale
vector[G] eta;      // standardized group offsets (non-centered parameterization)
}
model {
// Non-centered parameterization of the group log-rates.
vector[G] log_lambda_group = log_lambda + eta * tau;
real lpmf = 0;
if (use_tbb) {
// User-defined C++ function dispatching to the TBB-backed reducer.
lpmf = parallel_hierarchical_reduce(y, log_lambda_group, gidx, grainsize);
} else {
if(use_mr) {
// map_rect wants one vector of per-shard parameters per group.
vector[1] log_lambda_group_tilde[G];
for(g in 1:G)
log_lambda_group_tilde[g,1] = log_lambda_group[g];
lpmf = sum(map_rect(mr_reduce, theta_dummy, log_lambda_group_tilde, xr, yg));
} else {
// Serial reference implementation: vectorized over all observations.
lpmf = poisson_log_lpmf(y| log_lambda_group[gidx]);
}
}
target += lpmf;
// Priors: standard normals; tau is half-normal via its lower=0 constraint.
target += std_normal_lpdf(log_lambda);
target += std_normal_lpdf(tau);
target += std_normal_lpdf(eta);
}
generated quantities {
// Squared error of the recovered hyper-parameters vs. the simulation truth.
real msq_log_lambda = square(true_log_lambda - log_lambda);
real msq_tau = square(true_tau - tau);
}
```

The `user_header.hpp`

file contains this:

```
#include <stan/math.hpp>
#include <boost/iterator/counting_iterator.hpp>
#include <stan/math/prim/scal/functor/parallel_reduce_sum.hpp>
#include <stan/math/rev/scal/functor/parallel_reduce_sum.hpp>
namespace poisson_hierarchical_scale_model_namespace {
// Forward declaration of the user function from the Stan model's
// functions block; its definition is emitted by the Stan compiler into
// this same namespace.
template <typename T3__>
typename boost::math::tools::promote_args<T3__>::type
hierarchical_reduce(const int& start,
const int& end,
const std::vector<int>& y,
const Eigen::Matrix<T3__, Eigen::Dynamic, 1>& log_lambda_group,
const std::vector<int>& gidx, std::ostream* pstream__);
// Hand-written adapter that the Stan model declares (but does not
// define) as parallel_hierarchical_reduce. It forwards the whole
// 1..elems index range to stan::math::parallel_reduce_sum, which slices
// it into [start, end] chunks of roughly `grainsize` and sums the
// partial log-likelihoods returned by hierarchical_reduce.
template <typename T3>
inline typename boost::math::tools::promote_args<T3>::type
parallel_hierarchical_reduce(const std::vector<int>& y,
const Eigen::Matrix<T3, Eigen::Dynamic, 1>& log_lambda_group,
const std::vector<int>& gidx,
const int& grainsize,
std::ostream* pstream__) {
typedef boost::counting_iterator<int> count_iter;
typedef typename boost::math::tools::promote_args<T3>::type return_t;
const int elems = y.size();
typedef typename boost::math::tools::promote_args<T3>::type local_scalar_t__;
typedef local_scalar_t__ fun_return_scalar_t__;
// C++ only binds with a lambda the arguments of the user function;
// this adapter code would need to be generated by the Stan parser.
// NOTE(review): the [&] capture is only safe if parallel_reduce_sum
// joins all tasks before returning -- confirm on parallel-ad-tape-3.
return_t lpmf = stan::math::parallel_reduce_sum(
count_iter(1), count_iter(elems+1), return_t(0.0),
[&](int start, int end) {
return hierarchical_reduce(start, end, y, log_lambda_group, gidx, pstream__);
}, grainsize);
/* C++ only version
return_t lpmf = stan::math::parallel_reduce_sum(
count_iter(1), count_iter(elems+1), return_t(0.0),
[&](int start, int end) {
const std::size_t partial_elems = end-start+1;
std::vector<int> partial_data;
partial_data.insert(partial_data.end(), y.begin() + start - 1,
y.begin() + end);
std::vector<return_t> partial_log_lambda(partial_elems);
for(std::size_t i = 0; i != partial_elems; ++i)
partial_log_lambda[i] = log_lambda_group[gidx[start + i-1]-1];
return stan::math::poisson_log_lpmf(partial_data, partial_log_lambda);
}, grainsize);
*/
return lpmf;
}
}
```

The Stan data file which makes the above fully runnable with the stan-math branch `parallel-ad-tape-3`

is this:

```
G <- 400
N <- 10
grainsize <- 1100
use_tbb <- 1
use_mr <- 0
```

Now, the reason why it took me a while is that I really wanted to make sure that all of this speeds up Stan programs - and it does… but: I see about 2x speedup on 2 cores (great)… however, with more cores the speed degrades and with 4-6 cores the execution times explode. Clearly there is still some issue (probably with threads going to sleep or whatever).

In the meantime I also coded up the same example using the performance benchmark suite from @seantalts, see here. When running benchmarks there, I do get again very nice speedups:

```
1 core
BM_tbbM_median 467394 ns 467134 ns 1242
2 cores => 1.8x
BM_tbbM_median 255672 ns 255380 ns 2660
4 cores => 2.9x
BM_tbbM_median 161253 ns 160471 ns 4579
```

All in all this convinces me that we are on the right track - the weirdness which happens in a real Stan program needs to be sorted out, but a stripped down problem points in the correct direction.

@syclik I hope this clarifies how I think the user should be using the new parallel reduce facility. Bear in mind that the user-provided function must have start and end arguments as the first two arguments. All the remaining arguments are whatever is needed by the user to write down the sliced likelihood evaluation. The slicing is left intentionally to the user, who will know best how to subset their data structures in whatever way (and in addition, the Stan language has good support for subsetting which we do not have in stan-math).

Best,

Sebastian