Gradients/Log Prob in External C++ CUDA

Greetings! I’m currently facing a fascinating challenge that stems from my thirst for knowledge. As a self-taught individual, I find myself grappling with the intricate realms of advanced mathematics employed by Stan. Please forgive me if my explanation resembles a mathematical primer for fifth graders—just trying to keep it light-hearted!

To dive into the technicalities, I’ve successfully integrated two functions into the Transformed data section of the .stan file, which compiles and runs smoothly. However, I’m encountering difficulties while attempting to create two additional functions that replace the linear trend and normal_id_glm functions within the model. Each time I test these functions, I encounter the line search error. Interestingly, when I employ if statements for testing, the values produced by the original functions match the values I input through my cuda functions. I thought it was an issue with precision but as they numbers are equal I am a bit confused. How I tested (not exactly what I did, but you get the deal):

for (int i = 0; 1:T; i++){
     if (og_trend[i] == cuda_trend[i]){
          print("not good"
}}

Now, my intuition tells me that I should be returning stan::math::var_value as the type in my functions. However, when I attempt to compile with that type in the vector or matrix, an error occurs. It seems that I am unable to place a var_val into an Eigen::matrix, even though I suspect I should use var(new precomp_v_vari(f, x.vi_, dfdx_)).

If you happen to have any helpful tips or recommend any books that can steer me in the right direction, I would be absolutely thrilled.
Stan file:

functions {
  //vector cuStan_model(real k, real m, vector delta, real tau, real sigma_obs, vector beta, vector sigmas, vector y, matrix X_sa, matrix X_sm, vector trend);
  vector cuStan_model(vector beta, matrix X_sa, matrix X_sm, vector trend);
  void cuStan_lt(vector mine, vector stan);
  matrix cuStan_chp_mat(vector t, vector t_change, int T, int S);
  matrix cuStan_xmake(matrix X, vector s_x, int T);
  matrix get_changepoint_matrix(vector t, vector t_change, int T, int S) {
    // Assumes t and t_change are sorted.
    matrix[T, S] A;
    row_vector[S] a_row;
    int cp_idx;
    // Start with an empty matrix.
    A = rep_matrix(0, T, S);
    a_row = rep_row_vector(0, S);
    cp_idx = 1;
    // Fill in each row of A.
    for (i in 1:T) {
      while ((cp_idx <= S) && (t[i] >= t_change[cp_idx])) {
        a_row[cp_idx] = 1;
        cp_idx = cp_idx + 1;
      }
      A[i] = a_row;
    }
    return A;
  }
  // Logistic trend functions
  vector logistic_gamma(real k, real m, vector delta, vector t_change, int S) {
    vector[S] gamma;  // adjusted offsets, for piecewise continuity
    vector[S + 1] k_s;  // actual rate in each segment
    real m_pr;
    // Compute the rate in each segment
    k_s = append_row(k, k + cumulative_sum(delta));
    // Piecewise offsets
    m_pr = m; // The offset in the previous segment
    for (i in 1:S) {
      gamma[i] = (t_change[i] - m_pr) * (1 - k_s[i] / k_s[i + 1]);
      m_pr = m_pr + gamma[i];  // update for the next segment
    }
    return gamma;
  }

  vector logistic_trend(
    real k,
    real m,
    vector delta,
    vector t,
    vector cap,
    matrix A,
    vector t_change,
    int S
  ) {
    vector[S] gamma;

    gamma = logistic_gamma(k, m, delta, t_change, S);
    return cap .* inv_logit((k + A * delta) .* (t - (m + A * gamma)));
  }

  // Linear trend function

  vector linear_trend(
    real k,
    real m,
    vector delta,
    vector t,
    matrix A,
    vector t_change
  ) {
    //return cuStan_lt(k, m, delta, t, A, t_change);
    return (k + A * delta) .* t + (m + A * (-t_change .* delta));
  }
  vector linear_trend_map(
    vector all_params,
    vector t,
    data array[] real temp,
    data array[] int ii
  ) {
    real k = all_params[1];
    real m = all_params[2];
    vector[1] ret;
    
    ret[1] = (k + t[2]) * t[1] + (m + t[3]);
    //ret[1] = (k) * t[1] + (m);

    return ret;
  }
  vector delt_val_map(
    vector all_params1,
    vector t,
    data array[] real temp,
    data array[] int ii
  ) {
    int S = num_elements(temp);
    real delta_val1 = 0.0;
    real delta_val2 = 0.0;
    for (j in 1:S){
      delta_val1 = (temp[j] * all_params1[j]) + delta_val1;
      delta_val2 = (temp[j] * (all_params1[j] * (-(all_params1[S+j])))) + delta_val2;
    }
    vector[2] ret;
    ret[1] = delta_val1;
    ret[2] = delta_val2;
    return ret;
  }
  vector mapping_trend(
    real k,
    real m,
    vector delta,
    vector t,
    matrix A,
    vector t_change,
    int T,
    int S,
    data array[,] int count,
    data array[,] real tempy
  ){
    int SS = S*2;
    int TT = T*2;
    vector[SS] all_params1 = rep_vector(0, SS);
    for (i in 1:S){
      all_params1[i] = delta[i];
      all_params1[i+S] = t_change[i];
    }
    array[T] vector[3] tt;
    for (i in 1:T){
      tt[i] = rep_vector(0.0, 3);
    }
    
    vector[TT] det_vals = map_rect(delt_val_map, all_params1, tt, tempy, count);
    vector[2] all_params2;
    all_params2[1] = k;
    all_params2[2] = m;
    int f = 1;
    for (i in 1:T) {
      tt[i][1] = t[i];
      tt[i][2] = det_vals[f];
      f = f + 1;
      tt[i][3] = det_vals[f];
      f = f + 1;
    }
    //print("finished mapping");
    return map_rect(linear_trend_map, all_params2, tt, tempy, count);
  }
  
  // Flat trend function
  vector flat_trend(
    real m,
    int T
  ) {
    return rep_vector(m, T);
  } 

  array[,] int get_count_map(int T)
  {
    array[T,T] int count_map;
    for (i in 1:T){
      for (j in 1:T){
          count_map[i,j] = i;
      }
    }
    return count_map;
  }
}

data {
  int T;                // Number of time periods
  int<lower=1> K;       // Number of regressors
  vector[T] t;          // Time
  vector[T] cap;        // Capacities for logistic trend
  vector[T] y;          // Time series
  int S;                // Number of changepoints
  vector[S] t_change;   // Times of trend changepoints
  matrix[T,K] X;        // Regressors
  vector[K] sigmas;     // Scale on seasonality prior
  real<lower=0> tau;    // Scale on changepoints prior
  int trend_indicator;  // 0 for linear, 1 for logistic, 2 for flat
  vector[K] s_a;        // Indicator of additive features
  vector[K] s_m;        // Indicator of multiplicative features

}

transformed data {
  matrix[T, S] A = cuStan_chp_mat(t, t_change, T, S);
  matrix[T, K] X_sa = cuStan_xmake(X, s_a, T);//X .* rep_matrix(s_a', T); // big resorce hog
  matrix[T, K] X_sm = cuStan_xmake(X, s_m, T);//X .* rep_matrix(s_m', T); // big resorce hog
}

parameters {
  real k;                   // Base trend growth rate
  real m;                   // Trend offset
  vector[S] delta;          // Trend rate adjustments
  real<lower=0> sigma_obs;  // Observation noise
  vector[K] beta;           // Regressor coefficients
}

transformed parameters {
  vector[T] trend;
  if (trend_indicator == 0) {
    trend = linear_trend(k, m, delta, t, A, t_change); //
  } else if (trend_indicator == 1) {
    trend = logistic_trend(k, m, delta, t, cap, A, t_change, S);
  } else if (trend_indicator == 2) {
    trend = flat_trend(m, T);
  }
}

model {
  //priors
  k ~ normal(0, 5);
  m ~ normal(0, 5);
  delta ~ double_exponential(0, tau);
  sigma_obs ~ normal(0, 0.5);
  beta ~ normal(0, sigmas);
  //likelihood
  y ~ normal_id_glm(X_sa, trend .* (1 + X_sm * beta), beta, sigma_obs );
  //cuStan_lt(cuStan_model(beta, X_sa, X_sm, trend), (trend .* (1 + X_sm * beta)));
  //y ~ normal(cuStan_model(beta, X_sa, X_sm, trend), sigma_obs); //this: y ~ Normal(α + x⋅β, σ) α=trend .* (1 + X_sm * beta), x=X_sa, β=beta, σ=sigma_obs
}

User header:

#include <stan/model/model_header.hpp>
#include <Eigen/core>
#include <stan/math.hpp>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <windows.h>
#include <fstream>
#include <string>
#include <iomanip>
#include <stan/math/rev/fun.hpp>
#include <mutex>
#include <processthreadsapi.h>
#define DLLIMPORT __declspec(dllimport)
DLLIMPORT void likelihood(double* d_X_sa, double* d_X_sm, double* d_beta, double* d_trend, double* result, int num_rows, int num_cols);
DLLIMPORT void linear_trendh(double k, double m, double* delta_ptr, double* t_ptr, double* A_ptr, double* t_change_ptr, double* result_ptr, int T, int S, double* delta_val_1, double* delta_val_2);
DLLIMPORT void elementwize(double* d_X, double* d_s_a, double* d_result, int size);
DLLIMPORT void chek_pont(double* d_tz, double* d_t_change, double* d_A, int T, int S);
#ifndef STAN_CUDA_DLL
#define STAN_CUDA_DLL
  HMODULE cudaDll = LoadLibrary("cuda_func_help.dll");
#endif  
int CSV_NUM = 0;
Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>> modeld(const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& beta, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X_sa, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X_sm, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& trend) {
  // Summary of equation: α + x⋅β || α = (trend .* (1 + X_sm * beta)) || x = X_sa || β = beta
  void (*likelihood)(double*, double*, double*, double*, double*, int, int) = (void (*)(double*, double*, double*, double*, double*, int, int))GetProcAddress(cudaDll, "likelihood");
  const int T = X_sa.rows();
  const int K = beta.rows();
  // Allocate device memory
  double* d_X_sa; //size = T*K
  double* d_X_sm; //size = T*K
  double* d_beta; //size = K
  double* d_trend; //size = T
  double* result_ptr;
  cudaMalloc((void**)&d_X_sa, T * K * sizeof(double));
  cudaMalloc((void**)&d_X_sm, T * K * sizeof(double));
  cudaMalloc((void**)&d_beta, K * sizeof(double));
  cudaMalloc((void**)&d_trend, T * sizeof(double));
  cudaMalloc((void**)&result_ptr, T * sizeof(double));
  // Copy data from host to device
  double* X_sm_arr = new double [T * K];
  double* X_sa_arr = new double [T * K];
  double* beta_arr = new double [K];
  double* trend_arr = new double [T];
  for (int i = 0; i < T; i++) {
    for (int j = 0; j < K; j++) {
      X_sa_arr[i * K + j] = X_sa(i, j).val();
      X_sm_arr[i * K + j] = X_sm(i, j).val();
    }
    trend_arr[i] = trend(i, 0).val();
  }
  for (int i = 0; i < K; i++) {
    beta_arr[i] = beta(i, 0).val();
  }
  // Copy data from host to device
  cudaMemcpy(d_X_sa, X_sa_arr, T * K * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_X_sm, X_sm_arr, T * K * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_beta, beta_arr, K * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_trend, trend_arr, T * sizeof(double), cudaMemcpyHostToDevice);
  // Launch the likelihood kernel
  likelihood(d_X_sa, d_X_sm, d_beta, d_trend, result_ptr, T, K);
  cudaDeviceSynchronize();
  // Copy the results back to the host.
  double* temp = new double [T];
  cudaMemcpy(temp, result_ptr, T * sizeof(double), cudaMemcpyDeviceToHost);
  Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>> to_normal = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>>(temp, T, 1);
  cudaFree(0);
  cudaFree(d_X_sa);
  cudaFree(d_X_sm);
  cudaFree(d_beta);
  cudaFree(d_trend);
  delete[] X_sa_arr;
  delete[] X_sm_arr;
  delete[] beta_arr;
  delete[] trend_arr;
  return to_normal;
}
void linear_trendd(Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1> mine, Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1> stan){
    std::cout << "linear_trendd" << std::endl;
    const int T = stan.rows();
    for (int i = 0; i < T; i++) {
        if (mine(i, 0).val() != stan(i, 0).val()) {
            std::cout << "mine: " << mine(i,0).val() << " stan: " << stan(i,0).val() << std::endl;
        }
    }
    
    /*
    void (*linear_trendh)(double, double, double*, double*, double*, double*, double*, int, int, double*, double*) = (void (*)(double, double, double*, double*, double*, double*, double*, int, int, double*, double*))GetProcAddress(cudaDll, "linear_trendh");    
    //cudaFree(0);
    int T = t.rows(); //A rows
    int S = delta.rows();// A cols
    const int T_C = T;
    const int S_C = S;
    double* delta_ptr;
    double* t_ptr;
    double* A_ptr;
    double* t_change_ptr;
    double* result_ptr;
    double* d_v_1ptr;
    double* d_v_2ptr;
    cudaMalloc((void**)&delta_ptr, S * sizeof(double));
    cudaMalloc((void**)&t_ptr, T * sizeof(double));
    cudaMalloc((void**)&d_v_1ptr, T * sizeof(double));
    cudaMalloc((void**)&d_v_2ptr, T * sizeof(double));
    cudaMalloc((void**)&A_ptr, T * S * sizeof(double));
    cudaMalloc((void**)&t_change_ptr, S * sizeof(double));
    cudaMalloc((void**)&result_ptr, T * sizeof(double));
    double* t_arr = new double [T_C];
    double* t_c_arr = new double [S_C];
    double* A_arr = new double [T_C * S_C];;
    double* delt_arr = new double [S_C];
    double* result = new double [T_C];
    for (int j = 0; j < T_C; j++) {
        t_arr[j] = t(j, 0).val();
    }
    for (int i=0; i < T_C; i++) {
        for (int j = 0; j < S_C; j++) {
            A_arr[i * S_C + j] = A(i, j).val(); //i = rows, j = cols
        }
    }
    for (int j = 0; j < S_C; j++) {
        delt_arr[j] = delta(j, 0).val();
    }
    for (int j = 0; j < S_C; j++) {
        t_c_arr[j] = t_change(j, 0).val();
    }
    double k_val = k.val();
    double m_val = m.val();
    cudaMemcpy(delta_ptr, delt_arr, S * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(t_ptr, t_arr, T * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(A_ptr, A_arr, T * S * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(t_change_ptr, t_c_arr, S * sizeof(double), cudaMemcpyHostToDevice);
    //run kernel
    linear_trendh(k_val, m_val, delta_ptr, t_ptr, A_ptr, t_change_ptr, result_ptr, T, S, d_v_1ptr, d_v_2ptr);
    cudaDeviceSynchronize();
    // Copy the results back to the host.
    double* temp = new double [T_C];

    cudaMemcpy(temp, result_ptr, T * sizeof(double), cudaMemcpyDeviceToHost);
    Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>> trendz = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>>(temp, T_C, 1);
    // Free device memory

    cudaFree(0);
    cudaFree(delta_ptr);
    cudaFree(t_ptr);
    cudaFree(A_ptr);
    cudaFree(t_change_ptr);
    cudaFree(result_ptr);
    delete[] t_arr;
    delete[] t_c_arr;
    delete[] A_arr;
    delete[] delt_arr;
    return trendz;*/
  
}
Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>> Xmaker(Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X, Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& s_a, const int& T) {
  void (*elementwize)(double*, double*, double*, int) = (void (*)(double*, double*, double*, int))GetProcAddress(cudaDll, "elementwize");
  const int size = X.size();
  const int T_C = T;
  const int K_C = X.cols();
  // Allocate device memory
  double* d_X;
  double* d_s_a;
  double* d_result;
  //std::cout << "Size needed in kb:" << ((T_C * K_C * sizeof(double) + K_C * sizeof(double) + T_C * K_C * sizeof(double)) / 1024) << std::endl;  
  cudaMalloc((void**)&d_X, T_C * K_C * sizeof(double));
  cudaMalloc((void**)&d_s_a, T_C * K_C * sizeof(double));
  cudaMalloc((void**)&d_result, T_C * K_C * sizeof(double));
  // Copy data from host to device
  double* X_arr = new double [T_C * K_C];
  double* s_a_arr = new double [T_C * K_C];
    // Initialize the array.
  for (int i = 0; i < T_C; i++) {
      for (int j = 0; j < K_C; j++) {
          X_arr[i * K_C + j] = X(i, j).val();
      }
  }
    // Initialize the array.
  for (int i = 0; i < T_C; i++) {
    for (int j = 0; j < K_C; j++) {
      s_a_arr[i * K_C + j] = s_a(j, 0).val();
    }
  }
  cudaMemcpy(d_X, X_arr, T_C * K_C * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_s_a, s_a_arr, K_C * sizeof(double), cudaMemcpyHostToDevice);
  // Launch the element-wise multiplication kernel
  elementwize(d_X, d_s_a, d_result, (T_C * K_C));
  
  // Map the device memory to Eigen::Matrix
  Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> x_x(T_C, K_C);
  cudaMemcpy(x_x.data(), d_result, (T_C * K_C), cudaMemcpyDeviceToHost);
  double* r_data = new double [T_C * K_C];
  //switch from rowwise to colwise because of eigen
  for (int i = 0; i < T_C; i++) {
    for (int j = 0; j < K_C; j++) {
      r_data[i * K_C + j] = x_x(i, j);
    }
  }
  // Copy the result back to the host
  Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>> resultHost = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>>(r_data, T_C, K_C);
  // Free device memory
  cudaFree(d_X);
  cudaFree(d_s_a);
  cudaFree(d_result);
  delete[] s_a_arr;
  delete[] X_arr;
  return resultHost;
}
Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>> get_changepoint_matrixx(Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& t, Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& t_change, const int& T, const int& S){
  void (*chek_pont)(double*, double*, double*, int, int) = (void (*)(double*, double*, double*, int, int))GetProcAddress(cudaDll, "chek_pont");  
  Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> A(T, S);
  const int T_C = T;
  const int S_C = S;
  // Allocate device memory
  double* d_tz;
  double* d_t_change;
  double* d_A;
  //std::cout << "Size needed in kb:" << ((T_C * sizeof(double) + S_C * sizeof(double) + S_C * sizeof(double))/1024) << std::endl;
  cudaMalloc((void**)&d_tz, T_C * sizeof(double));
  cudaMalloc((void**)&d_t_change, S_C * sizeof(double));
  cudaMalloc((void**)&d_A, T_C * S_C * sizeof(double));
  //maybe can get rid of this
  double* t_arr = new double [T_C];
  double* t_c_arr = new double [S_C];
    // Initialize the array.
  for (int j = 0; j < T_C; j++) {
      t_arr[j] = t(j, 0).val();
  }
     // Initialize the array.
  for (int j = 0; j < S_C; j++) {
      t_c_arr[j] = t_change(j, 0).val();
  }
  // Copy data from host to device
  cudaMemcpy(d_tz, t_arr, T_C * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_t_change, t_c_arr, S_C * sizeof(double), cudaMemcpyHostToDevice);
  // Launch the kernel
  chek_pont(d_tz, d_t_change, d_A, T_C, S_C);
  double* temp = new double [T_C * S_C];

  // Copy result from device to host
  cudaMemcpy(temp, d_A, T_C * S_C * sizeof(double), cudaMemcpyDeviceToHost);
  
  double* col_wise_data = new double [T_C * S_C];
  //switch from rowwise to colwise because of eigen
  for (int i = 0; i < T_C; i++) {
    for (int j = 0; j < S_C; j++) {
      col_wise_data[j * T_C + i] = temp[i * S_C + j];
    }
  }
  // Map the device memory to Eigen::Matrix 
  Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>> Ab = Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>>(col_wise_data, T_C, S_C);
  // Free device memory

  cudaFree(d_tz);
  cudaFree(d_t_change);
  cudaFree(d_A);
  delete[] t_arr;
  delete[] t_c_arr;
  delete[] temp;
  return Ab;
}

namespace prophet_model_namespace {
  //Eigen::Map<Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>> cuStan_model(const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& beta, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X_sa, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X_sm, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& trend, std::ostream *pstream__ = nullptr) {
      //return Eigen::Map<Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>>(beta.data(), beta.rows(), 1);
    //research auto diff, gradient, hessian, wrt, log prob, & var(new precomp_v_vari(f, x.vi_, dfdx_))
    //}
  Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, 1>, 0, Eigen::Stride<0, 0>> cuStan_model(const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& beta, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X_sa, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>& X_sm, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& trend, std::ostream *pstream__ = nullptr) {
    return modeld(beta, X_sa, X_sm, trend);
  }

  void cuStan_lt(Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1> mine, Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1> stan, std::ostream *pstream__ = nullptr) {
      linear_trendd(mine, stan); // compiled, just copy and test //
    }

  Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>> cuStan_chp_mat(Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>&& t, Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>&& t_change, const int& T, const int& S, std::ostream *pstream__ = nullptr) {
      return get_changepoint_matrixx(t, t_change, T, S);
    }

  Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>, 0, Eigen::Stride<0, 0>> cuStan_xmake(Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, Eigen::Dynamic>&& X, Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>&& s_a, const int& T, std::ostream *pstream__ = nullptr) {
      return Xmaker(X, s_a, T);
    }
}

Cuda library:


#include <cmath>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#define DLLEXPORT __declspec(dllexport)
//DLLEXPORT __declspec(dllimport)

// Declare the functions in the DLL
__global__ void computeLogProb(float* input, float* output) {
    // Compute logarithm of the probability distribution
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    output[idx] = logf(input[idx]);
}

// CUDA kernel to compute the gradients of the logarithm of the probability distribution
__global__ void computeLogProbGrad(float* input, float* grad) {
    // Compute gradients of the logarithm of the probability distribution
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    grad[idx] = 1.0f / input[idx];
}
__global__ void likelihood_all(double* X_sm, double* X_sa, double* beta, double* trend, double* result, int T, int K) {
    //printf("Started kernal likelihood");
    // α + x⋅β || α=trend .* (1 + X_sm * beta), x=X_sa, β=beta
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < T) {
        double sum = 0.0;
        double dum = 0.0;
        for (int j = 0; j < K; ++j) {
            sum += X_sm[i * K + j] * beta[j];
            dum += X_sa[i * K + j] * beta[j];
        }
        __syncthreads();
        result[i] = (trend[i] * (sum+1)) + dum;
        
    }
    //printf("Ended kernal likelihood1");
}
__global__ void linear_trend_kernel(double k, double m, double* delta, double* t, double* A, double* t_change, double* result, int T, int S, double* delta_val_1, double* delta_val_2) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < T) {
        double delta_val_11 = 0.0;
        double delta_val_22 = 0.0;
        int indexA = 0;
        for (int j = 0; j < S; j++) {
            indexA = A[i * S + j];
            if (indexA == 1) {
                delta_val_11 += (indexA * delta[j]);
                delta_val_22 += (indexA * (delta[j] * (-t_change[j])));
            }
        }
        result[i] = ((k + delta_val_11) * t[i]) + (m + delta_val_22);
        __syncthreads();

    }
}
__global__ void linear_trend_kernel_help(double k, double m, double* delta, double* t, double* A, double* t_change, double* result, int T, int S, double* delta_val_1, double* delta_val_2) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j < (T * S)) {
        int i = j / S;
        int h = j-i*S;
        delta_val_1[i] = 0.0;
        delta_val_2[i] = 0.0;
        delta_val_1[i] += A[j] * delta[h];
        delta_val_2[i] += A[j] * (delta[h] * (-t_change[h]));
        if (i>=T){
            printf("i: %i", i);
        }
        if (h>=S){
            printf("h: %i", h);
        }
        __syncthreads();
    }
}
__global__ void get_changepoint_matrix_kernel(double* t, double* t_change, double* A, int T, int S, double* new_row) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < T) {
      A[i] = 0.0;
      __shared__ int cp_idx;
      cp_idx = 1;
        while ((cp_idx <= S) && (t[i] >= t_change[cp_idx])) {
          A[i * S + cp_idx] = 1.0;
          cp_idx++;
          __syncthreads();
        }
    }

}
__global__ void elementwise_mult_kernel(double* a, double* b, double* c, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        c[i] = a[i] * b[i];
        __syncthreads();
    }
}



extern "C" DLLEXPORT void likelihood(double* d_X_sa, double* d_X_sm, double* d_beta, double* d_trend, double* result, int T, int K) {
    // Set CUDA grid and block dimensions
    int threads_per_block = 512;
    int num_blocks = T/threads_per_block + (T % threads_per_block == 0 ? 0:1);
    likelihood_all << <num_blocks, threads_per_block >> > (d_X_sm, d_X_sa, d_beta, d_trend, result, T, K);
    float* output = new float[N];
    float* grad = new float[N];
    float* devInput, *devOutput, *devGrad;
    cudaMalloc((void**)&devInput, N * sizeof(float));
    cudaMalloc((void**)&devOutput, N * sizeof(float));
    cudaMalloc((void**)&devGrad, N * sizeof(float));

    // Transfer data from CPU to GPU
    cudaMemcpy(devInput, input, N * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the CUDA kernel for computing the logarithm of the probability distribution
    computeLogProb<<<numBlocks, blockSize>>>(devInput, devOutput);

    // Launch the CUDA kernel for computing the gradients
    computeLogProbGrad<<<numBlocks, blockSize>>>(devInput, devGrad);

    // Transfer results from GPU to CPU
    cudaMemcpy(output, devOutput, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(grad, devGrad, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Clean up memory on the CPU and GPU
    delete[] input;
    delete[] output;
    delete[] grad;
    cudaFree(devInput);
    cudaFree(devOutput);
    cudaFree(devGrad);
    
}
extern "C" DLLEXPORT void linear_trendh(double k, double m, double* delta_ptr, double* t_ptr, double* A_ptr, double* t_change_ptr, double* result_ptr, int T, int S, double* delta_val_1, double* delta_val_2) {
    int threads_per_block = 512;
    int num_blocks = T/threads_per_block + (T % threads_per_block == 0 ? 0:1);
    cudaStream_t stream1;
    cudaStreamCreate(&stream1);
    cudaDeviceSynchronize();
    linear_trend_kernel <<<num_blocks, threads_per_block, 0, stream1 >>>(k, m, delta_ptr, t_ptr, A_ptr, t_change_ptr, result_ptr, T, S, delta_val_1, delta_val_2);
    cudaStreamSynchronize(stream1);
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
    }
    // Destroy the streams
    cudaStreamDestroy(stream1);
}
extern "C" DLLEXPORT void elementwize(double* d_X, double* d_s_a, double* d_result, int size) {
    // Set CUDA grid and block dimensions
    int threads_per_block = 256;
    int num_blocks = (size + threads_per_block - 1) / threads_per_block;
    elementwise_mult_kernel << <num_blocks, threads_per_block >> > (d_X, d_s_a, d_result, size);
}
extern "C" DLLEXPORT void chek_pont(double* d_tz, double* d_t_change, double* d_A, int T, int S) {
    int threads_per_block = 256;
    int num_blocks = (T + threads_per_block - 1) / threads_per_block;
    double* tenmp;
    cudaMalloc((void**)&tenmp, S * sizeof(double));
    get_changepoint_matrix_kernel << <num_blocks, threads_per_block >> > (d_tz, d_t_change, d_A, T, S, tenmp);
    cudaFree(tenmp);
}

If I am following correctly, I think the problem is indeed that you are not propagating derivatives correctly through these functions. A lot of ways of getting this wrong result in the program failing to compile, which is what I would normally expect here.

You should be able to construct a matrix of var_value<double>. It looks like cuStan_lt is already doing so

Thanks for the reply! I’m just a bit confused on how I can further understand derivatives inside of C++ and how to properly return them. When I made the c++ functions, stanc would only compile if the function returned a Eigen::map<Eigen::matrix<double… type but fails to compile if I return a Eigen::map<Eigen::matrix<var_value… type. Am I misunderstanding the function declaration and should be using templates? Or is there something else I am missing?

Thanks

Stan requires derivatives (var_value<double> types) but also expects the function to be callable without autodiff (plain double types).

Looks like the function signature in Stan is

vector cuStan_model(vector beta, data matrix X_sa, data matrix X_sm, vector trend);

Then you need to define two overloads for the C++ function

Eigen::Matrix<double, Eigen::Dynamic, 1> cuStan_model(const Eigen::Matrix<double, Eigen::Dynamic, 1>& beta, const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>& X_sa, const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>& X_sm, const Eigen::Matrix<double, Eigen::Dynamic, 1>& trend, std::ostream *pstream__ = nullptr);

Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1> cuStan_model(const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& beta, const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>& X_sa, const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>& X_sm, const Eigen::Matrix<stan::math::var_value<double>, Eigen::Dynamic, 1>& trend, std::ostream *pstream__ = nullptr);

The single function you have written compiles, I think, because C++ compiler inserts automatic Matrix<double> to Matrix<var> conversions that do not propagate derivatives but allow the single function to substitute for both expected overloads.

2 Likes

Everything @nhuurre said is correct. There is a bit more information on this in the CmdStan user’s guide: 24 Using external C++ code | CmdStan User’s Guide

Thank you so much! I’m a little new to this, so I’m sorry if these questions are a bit dumb, but does Stan use Reverse accumulation Auto diff? The function below is to replace the normal_id_glm function (without doing the normal part), do I need to calculate the auto diff for all the variables that I use for it or just the result? And if I do need to calculate for all the variables, can I use “&” in function declaration and modify the original var_value that Stan passes or are all vars const from Stan?

__global__ void likelihood_all(double* X_sm, double* X_sa, double* beta, double* trend, double* result, int T, int K) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < T) {
        double sum = 0.0;
        double dum = 0.0;
        for (int j = 0; j < K; ++j) {
            sum += X_sm[i * K + j] * beta[j];
            dum += X_sa[i * K + j] * beta[j];
        }
        __syncthreads();
        result[i] = (trend[i] * (sum+1)) + dum;
        
    }
}

I took a look at that page a few times but I’m not sure how I can do the following code as a double variable instead of the var_value:

template <typename EigVec, stan::require_eigen_vector_t<EigVec> * = nullptr,
          stan::require_not_st_var<EigVec> * = nullptr>
inline double my_dot_self(const EigVec &x, std::ostream *pstream__)
{
    auto x_ref = stan::math::to_ref(x);
    double sum = 0.0;
    for (int i = 0; i < x.size(); ++i)
    {
        sum += x_ref.coeff(i) * x_ref.coeff(i);
    }
    return sum;
}

template <typename EigVec, stan::require_eigen_vt<stan::is_var, EigVec> * = nullptr>
inline stan::math::var my_dot_self(const EigVec &v, std::ostream *pstream__)
{
    // (1) put v into our memory arena
    stan::arena_t<EigVec> arena_v(v);
    // (2) calculate forward pass using
    // (3) the .val() method for matrices of var types
    stan::math::var res = my_dot_self(arena_v.val(), pstream__);
    // (4) Place a callback for the reverse pass on the callback stack.
    stan::math::reverse_pass_callback(
        [res, arena_v]() mutable
        { arena_v.adj() += 2.0 * res.adj() * arena_v.val(); });
    return res;
}

if I take more vars in for my calculation, how do I make a reverse pass callback for all of them? for this calc:
α + x⋅β
with α = vec[T], x = mat[T,K}, & β = vec[K]
what would it look like to make a reverse pass? The only examples I have found use 1 input and 1 output so I don’t understand how it scales with multiple inputs.

Thanks for your help!

You should propagate derivatives to both inputs by having both captured. It uses a slightly older style, but atan2 might be a good example: math/stan/math/rev/fun/atan2.hpp at develop · stan-dev/math · GitHub

You can do the same thing with reverse_pass_callback