From ef33bcc2c491ea5c17ca23ee27ede0016ec4e95c Mon Sep 17 00:00:00 2001
From: nychiang
Date: Mon, 13 Mar 2023 14:39:25 -0700
Subject: [PATCH] test, move reduce to cuda kernel

---
 src/LinAlg/VectorCudaKernels.cu  | 63 ++++++++++++++++++++++++++++++------
 src/LinAlg/VectorCudaKernels.hpp |  6 ++---
 2 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/src/LinAlg/VectorCudaKernels.cu b/src/LinAlg/VectorCudaKernels.cu
index 29fb6984b..872a36667 100644
--- a/src/LinAlg/VectorCudaKernels.cu
+++ b/src/LinAlg/VectorCudaKernels.cu
@@ -394,6 +394,32 @@ __global__ void add_linear_damping_term_cu(int n, double* data, const double* ix
 
-/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
-__global__ void is_posive_w_pattern_cu(int n, double* data, const double* vd, const double* id)
+/** @brief per-block count of elements with vd[i] > 0.0 and id[i] == 1.0; each block stores its partial sum in data[blockIdx.x] */
+__global__ void is_posive_w_pattern_cu(int n, int* data, const double* vd, const double* id)
+{
+  extern __shared__ int shared_sum[];
+  const int num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int sum = 0;
+  for (int i = tid; i < n; i += num_threads) {
+    sum += (id[i] == 1.0 && vd[i] > 0.0) ? 1 : 0;
+  }
+  shared_sum[threadIdx.x] = sum;
+  __syncthreads();
+
+  for(int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if(threadIdx.x < s) {
+      shared_sum[threadIdx.x] += shared_sum[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+
+  // Store the result for this block in global memory
+  if(threadIdx.x == 0) {
+    data[blockIdx.x] = shared_sum[0];
+  }
+}
+
+/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
+__global__ void is_posive_w_pattern_cu_back(int n, double* data, const double* vd, const double* id)
 {
   const int num_threads = blockDim.x * gridDim.x;
   const int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -914,13 +940,30 @@ void add_linear_damping_term_kernel(int n_local,
 }
 
 /** @brief Checks if selected elements of `this` are positive */
-void is_posive_w_pattern_kernel(int n_local,
-                                double* yd,
+int is_posive_w_pattern_kernel(int n_local,
+//                              double* yd,
                                 const double* xd,
                                 const double* id)
 {
   int num_blocks = (n_local+block_size-1)/block_size;
-  is_posive_w_pattern_cu<<<num_blocks,block_size>>>(n_local, yd, xd, id);
+//  is_posive_w_pattern_cu<<<num_blocks,block_size>>>(n_local, yd, xd, id);
+
+  int *h_retval = (int*)malloc(num_blocks*sizeof(int));
+  int *d_retval;
+  cudaMalloc(&d_retval, num_blocks*sizeof(int));
+
+  is_posive_w_pattern_cu<<<num_blocks, block_size, block_size*sizeof(int)>>>(n_local, d_retval, xd, id);
+  cudaDeviceSynchronize();
+  cudaMemcpy(h_retval, d_retval, num_blocks*sizeof(int), cudaMemcpyDeviceToHost);
+
+  int sum_result = 0;
+  for(int i = 0; i < num_blocks; i++) {
+    sum_result += h_retval[i];
+  }
+  free(h_retval);
+  cudaFree(d_retval);
+
+  return sum_result;
 }
 
@@ ... @@
-  thrust::device_vector<double> v_temp(n);
-  double* dv_ptr = thrust::raw_pointer_cast(v_temp.data());
+//  thrust::device_vector<double> v_temp(n);
+//  double* dv_ptr = thrust::raw_pointer_cast(v_temp.data());
+//  return thrust::reduce(thrust::device, v_temp.begin(), v_temp.end(), (int)0, thrust::plus<int>());
 
-  hiop::cuda::is_posive_w_pattern_kernel(n, dv_ptr, d1, id);
-
-  return thrust::reduce(thrust::device, v_temp.begin(), v_temp.end(), (int)0, thrust::plus<int>());
+  int irev = hiop::cuda::is_posive_w_pattern_kernel(n, d1, id);
+  return irev;
 }
 
 /** @brief compute min(d1) for selected elements*/
diff --git a/src/LinAlg/VectorCudaKernels.hpp b/src/LinAlg/VectorCudaKernels.hpp
index 3d8cf6cba..00e8d4870 100644
--- a/src/LinAlg/VectorCudaKernels.hpp
+++ b/src/LinAlg/VectorCudaKernels.hpp
@@ -159,8 +159,8 @@ void add_linear_damping_term_kernel(int n_local,
                                     double ct);
 
-/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
-void is_posive_w_pattern_kernel(int n_local,
-                                double* yd,
+/** @brief returns the number of elements with xd[i] > 0.0 among those with id[i] == 1.0 */
+int is_posive_w_pattern_kernel(int n_local,
+//                              double* yd,
                                 const double* xd,
                                 const double* id);
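
Note: a minimal host-side sanity check for the reworked reduction path is sketched below. It is illustrative only and not part of the patch; the include path, the main() harness, and the test sizes/values are assumptions. It relies only on the int hiop::cuda::is_posive_w_pattern_kernel(int, const double*, const double*) interface declared above.

// hypothetical standalone harness (file name and values are illustrative)
#include <cstdio>
#include <cuda_runtime.h>
#include "VectorCudaKernels.hpp"   // assumed include path for the declaration above

int main()
{
  const int n = 1000;
  double hx[n];
  double hid[n];
  int expected = 0;
  for(int i = 0; i < n; i++) {
    hx[i]  = (i % 3 == 0) ? 1.0 : -1.0;  // every third entry is positive
    hid[i] = (i % 2 == 0) ? 1.0 :  0.0;  // pattern selects the even entries
    expected += (hid[i] == 1.0 && hx[i] > 0.0) ? 1 : 0;
  }

  double* dx  = nullptr;
  double* did = nullptr;
  cudaMalloc(&dx,  n*sizeof(double));
  cudaMalloc(&did, n*sizeof(double));
  cudaMemcpy(dx,  hx,  n*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(did, hid, n*sizeof(double), cudaMemcpyHostToDevice);

  // device-side per-block counting followed by the host sum of the block results
  const int got = hiop::cuda::is_posive_w_pattern_kernel(n, dx, did);
  std::printf("expected %d, got %d\n", expected, got);

  cudaFree(dx);
  cudaFree(did);
  return (got == expected) ? 0 : 1;
}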