Example model bernoulli.stan compiles with cmdstan 2.23.0 but does not sample when built with STAN_OPENCL=true on macOS 10.14.16 with Intel processor.
I’ve seen statements on a number of pages dealing with the GPU that it will only work with “suitable hardware (e.g. Nvidia or AMD gpu) that supports OpenCL 1.2”. Obviously my GPU is not Nvidia or AMD, but it supports OpenCL. If the GPU routines simply can’t be run with an Intel processor, don’t bother reading on - but could someone explain to me why that is?
Otherwise… I’m not sure whether my compilers are messed up or if there’s a bug or what. I installed cmdstan as explained on this page: https://github.com/stan-dev/cmdstan/wiki/Getting-Started-with-CmdStan. Before running “make build”, I added a file ‘./make/local’ with the lines
STAN_OPENCL=true
OPENCL_DEVICE_ID=1
OPENCL_PLATFORM_ID=0
“make/build” then seems to work fine (but I did notice the line ld: warning: directory not found for option ‘-L/Users/~/cmdstan/stan/lib/stan_math/lib/tbb’), and then “make examples/bernoulli/bernoulli” also works fine. However, when attempting to sample from the model with “examples/bernoulli/bernoulli sample data file=examples/bernoulli/bernoulli.data.R”, I get the following output:
libc++abi.dylib: terminating with uncaught exception of type std::__1::system_error: compile_kernel: bernoulli_logit_glm <program source>:1:904: error: call to '__cl_isfinite' is ambiguous
__kernel void bernoulli_logit_glm( __global double* logp_global, __global double* theta_derivative_global, __global double* theta_derivative_sum, const __global int* y_global, const __global double* x, const __global double* alpha, const __global double* beta, const int N, const int M, const int is_y_vector, const int is_alpha_vector, const int need_theta_derivative, const int need_theta_derivative_sum) { const int gid = get_global_id(0); const int lid = get_local_id(0); const int lsize = get_local_size(0); const int wg_id = get_group_id(0); __local double local_storage[LOCAL_SIZE_]; double logp = 0; double theta_derivative = 0; if (gid < N) { double ytheta = 0; for (int i = 0, j = 0; i < M; i++, j += N) { ytheta += x[j + gid] * beta[i]; } const int y = y_global[gid * is_y_vector]; const double sign = 2 * y - 1; ytheta += alpha[gid * is_alpha_vector]; ytheta *= sign; if (y > 1 || y < 0 || !isfinite(ytheta)) { logp = NAN; } const double exp_m_ytheta = exp(-ytheta); const double cutoff = 20.0; if (ytheta > cutoff) { logp -= exp_m_ytheta; theta_derivative = -exp_m_ytheta; } else if (ytheta < -cutoff) { logp += ytheta; theta_derivative = sign; } else { logp += -log1p(exp_m_ytheta); theta_derivative = sign * exp_m_ytheta / (exp_m_ytheta + 1); } if (need_theta_derivative) { theta_derivative_global[gid] = theta_derivative; } } local_storage[lid] = logp; barrier(CLK_LOCAL_MEM_FENCE); for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; step /= REDUCTION_STEP_SIZE) { if (lid < step) { for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { local_storage[lid] += local_storage[lid + step * i]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (lid == 0) { logp_global[wg_id] = local_storage[0]; } if (need_theta_derivative_sum) { barrier(CLK_LOCAL_MEM_FENCE); local_storage[lid] = theta_derivative; barrier(CLK_LOCAL_MEM_FENCE); for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; step /= REDUCTION_STEP_SIZE) { if (lid < step) { for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { local_storage[lid] += local_storage[lid + step * i]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (lid == 0) { theta_derivative_sum[wg_id] = local_storage[0]; } } }
^~~~~~~~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5080:27: note: expanded from macro 'isfinite'
#define isfinite(__x) __cl_isfinite(__x)
^~~~~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5077:20: note: candidate function
__CLFN_IL_1FD_MODD(__cl_isfinite);
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:598:51: note: expanded from macro '__CLFN_IL_1FD_MODD'
#define __CLFN_IL_1FD_MODD(name) int __OVERLOAD__ name(float x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5077:20: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:599:19: note: expanded from macro '__CLFN_IL_1FD_MODD'
int2 __OVERLOAD__ name(float2 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5077:20: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:600:19: note: expanded from macro '__CLFN_IL_1FD_MODD'
int3 __OVERLOAD__ name(float3 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5077:20: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:601:19: note: expanded from macro '__CLFN_IL_1FD_MODD'
int4 __OVERLOAD__ name(float4 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5077:20: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:602:19: note: expanded from macro '__CLFN_IL_1FD_MODD'
int8 __OVERLOAD__ name(float8 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:5077:20: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:603:20: note: expanded from macro '__CLFN_IL_1FD_MODD'
int16 __OVERLOAD__ name(float16 x);
^
<program source>:1:966: error: call to '__fast_relax_exp' is ambiguous
__kernel void bernoulli_logit_glm( __global double* logp_global, __global double* theta_derivative_global, __global double* theta_derivative_sum, const __global int* y_global, const __global double* x, const __global double* alpha, const __global double* beta, const int N, const int M, const int is_y_vector, const int is_alpha_vector, const int need_theta_derivative, const int need_theta_derivative_sum) { const int gid = get_global_id(0); const int lid = get_local_id(0); const int lsize = get_local_size(0); const int wg_id = get_group_id(0); __local double local_storage[LOCAL_SIZE_]; double logp = 0; double theta_derivative = 0; if (gid < N) { double ytheta = 0; for (int i = 0, j = 0; i < M; i++, j += N) { ytheta += x[j + gid] * beta[i]; } const int y = y_global[gid * is_y_vector]; const double sign = 2 * y - 1; ytheta += alpha[gid * is_alpha_vector]; ytheta *= sign; if (y > 1 || y < 0 || !isfinite(ytheta)) { logp = NAN; } const double exp_m_ytheta = exp(-ytheta); const double cutoff = 20.0; if (ytheta > cutoff) { logp -= exp_m_ytheta; theta_derivative = -exp_m_ytheta; } else if (ytheta < -cutoff) { logp += ytheta; theta_derivative = sign; } else { logp += -log1p(exp_m_ytheta); theta_derivative = sign * exp_m_ytheta / (exp_m_ytheta + 1); } if (need_theta_derivative) { theta_derivative_global[gid] = theta_derivative; } } local_storage[lid] = logp; barrier(CLK_LOCAL_MEM_FENCE); for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; step /= REDUCTION_STEP_SIZE) { if (lid < step) { for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { local_storage[lid] += local_storage[lid + step * i]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (lid == 0) { logp_global[wg_id] = local_storage[0]; } if (need_theta_derivative_sum) { barrier(CLK_LOCAL_MEM_FENCE); local_storage[lid] = theta_derivative; barrier(CLK_LOCAL_MEM_FENCE); for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; step /= REDUCTION_STEP_SIZE) { if (lid < step) { for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { local_storage[lid] += local_storage[lid + step * i]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (lid == 0) { theta_derivative_sum[wg_id] = local_storage[0]; } } }
^~~~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4544:22: note: expanded from macro 'exp'
#define exp(__x) __fast_relax_exp(__x)
^~~~~~~~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4542:30: note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_exp, native_exp, __cl_exp);
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:417:27: note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float __OVERLOAD__ _name(float x) { return _default_name(x); } \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4542:30: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:418:28: note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float2 __OVERLOAD__ _name(float2 x) { return _default_name(x); } \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4542:30: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:419:28: note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float3 __OVERLOAD__ _name(float3 x) { return _default_name(x); } \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4542:30: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:420:28: note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float4 __OVERLOAD__ _name(float4 x) { return _default_name(x); } \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4542:30: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:421:28: note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float8 __OVERLOAD__ _name(float8 x) { return _default_name(x); } \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4542:30: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:422:29: note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float16 __OVERLOAD__ _name(float16 x){ return _default_name(x); }
^
<program source>:1:1177: error: call to '__cl_log1p' is ambiguous
__kernel void bernoulli_logit_glm( __global double* logp_global, __global double* theta_derivative_global, __global double* theta_derivative_sum, const __global int* y_global, const __global double* x, const __global double* alpha, const __global double* beta, const int N, const int M, const int is_y_vector, const int is_alpha_vector, const int need_theta_derivative, const int need_theta_derivative_sum) { const int gid = get_global_id(0); const int lid = get_local_id(0); const int lsize = get_local_size(0); const int wg_id = get_group_id(0); __local double local_storage[LOCAL_SIZE_]; double logp = 0; double theta_derivative = 0; if (gid < N) { double ytheta = 0; for (int i = 0, j = 0; i < M; i++, j += N) { ytheta += x[j + gid] * beta[i]; } const int y = y_global[gid * is_y_vector]; const double sign = 2 * y - 1; ytheta += alpha[gid * is_alpha_vector]; ytheta *= sign; if (y > 1 || y < 0 || !isfinite(ytheta)) { logp = NAN; } const double exp_m_ytheta = exp(-ytheta); const double cutoff = 20.0; if (ytheta > cutoff) { logp -= exp_m_ytheta; theta_derivative = -exp_m_ytheta; } else if (ytheta < -cutoff) { logp += ytheta; theta_derivative = sign; } else { logp += -log1p(exp_m_ytheta); theta_derivative = sign * exp_m_ytheta / (exp_m_ytheta + 1); } if (need_theta_derivative) { theta_derivative_global[gid] = theta_derivative; } } local_storage[lid] = logp; barrier(CLK_LOCAL_MEM_FENCE); for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; step /= REDUCTION_STEP_SIZE) { if (lid < step) { for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { local_storage[lid] += local_storage[lid + step * i]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (lid == 0) { logp_global[wg_id] = local_storage[0]; } if (need_theta_derivative_sum) { barrier(CLK_LOCAL_MEM_FENCE); local_storage[lid] = theta_derivative; barrier(CLK_LOCAL_MEM_FENCE); for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; step /= REDUCTION_STEP_SIZE) { if (lid < step) { for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { local_storage[lid] += local_storage[lid + step * i]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (lid == 0) { theta_derivative_sum[wg_id] = local_storage[0]; } } }
^~~~~~~~~~~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4668:24: note: expanded from macro 'log1p'
#define log1p(__x) __cl_log1p(__x)
^~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4665:15: note: candidate function
__CLFN_FD_1FD(__cl_log1p);
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:367:48: note: expanded from macro '__CLFN_FD_1FD'
#define __CLFN_FD_1FD(name) float __OVERLOAD__ name(float x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4665:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:368:21: note: expanded from macro '__CLFN_FD_1FD'
float2 __OVERLOAD__ name(float2 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4665:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:369:21: note: expanded from macro '__CLFN_FD_1FD'
float3 __OVERLOAD__ name(float3 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4665:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:370:21: note: expanded from macro '__CLFN_FD_1FD'
float4 __OVERLOAD__ name(float4 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4665:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:371:21: note: expanded from macro '__CLFN_FD_1FD'
float8 __OVERLOAD__ name(float8 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4665:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:372:22: note: expanded from macro '__CLFN_FD_1FD'
float16 __OVERLOAD__ name(float16 x);
^
: Unknown error -11
Abort trap: 6
_____
Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/c++/4.2.1
Apple clang version 11.0.0 (clang-1100.0.33.17)
Target: x86_64-apple-darwin18.7.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin