From ef33bcc2c491ea5c17ca23ee27ede0016ec4e95c Mon Sep 17 00:00:00 2001
From: nychiang
Date: Mon, 13 Mar 2023 14:39:25 -0700
Subject: [PATCH] test, move reduce to cuda kernel

---
 src/LinAlg/VectorCudaKernels.cu  | 63 ++++++++++++++++++++++++++++++------
 src/LinAlg/VectorCudaKernels.hpp |  6 ++---
 2 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/src/LinAlg/VectorCudaKernels.cu b/src/LinAlg/VectorCudaKernels.cu
index 29fb6984b..872a36667 100644
--- a/src/LinAlg/VectorCudaKernels.cu
+++ b/src/LinAlg/VectorCudaKernels.cu
@@ -394,6 +394,32 @@ __global__ void add_linear_damping_term_cu(int n, double* data, const double* ix
 
-/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
-__global__ void is_posive_w_pattern_cu(int n, double* data, const double* vd, const double* id)
+/** @brief per-block count of elements with vd[i] > 0.0 and id[i] == 1.0; each block stores its partial sum in data[blockIdx.x] */
+__global__ void is_posive_w_pattern_cu(int n, int* data, const double* vd, const double* id)
+{
+  extern __shared__ int shared_sum[];
+  const int num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int sum = 0;
+  for (int i = tid; i < n; i += num_threads) {
+    sum += (id[i] == 1.0 && vd[i] > 0.0) ? 1 : 0;
+  }
+  shared_sum[threadIdx.x] = sum;
+  __syncthreads();
+
+  for(int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if(threadIdx.x < s) {
+      shared_sum[threadIdx.x] += shared_sum[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+
+  // Store the result for this block in global memory
+  if(threadIdx.x == 0) {
+    data[blockIdx.x] = shared_sum[0];
+  }
+}
+
+/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
+__global__ void is_posive_w_pattern_cu_back(int n, double* data, const double* vd, const double* id)
 {
   const int num_threads = blockDim.x * gridDim.x;
   const int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -914,13 +940,30 @@ void add_linear_damping_term_kernel(int n_local,
 }
 
 /** @brief Checks if selected elements of `this` are positive */
-void is_posive_w_pattern_kernel(int n_local,
-                                double* yd,
+int is_posive_w_pattern_kernel(int n_local,
+//                              double* yd,
                                 const double* xd,
                                 const double* id)
 {
   int num_blocks = (n_local+block_size-1)/block_size;
-  is_posive_w_pattern_cu<<<num_blocks,block_size>>>(n_local, yd, xd, id);
+//  is_posive_w_pattern_cu<<<num_blocks,block_size>>>(n_local, yd, xd, id);
+
+  int *h_retval = (int*)malloc(num_blocks*sizeof(int));
+  int *d_retval;
+  cudaMalloc(&d_retval, num_blocks*sizeof(int));
+
+  is_posive_w_pattern_cu<<<num_blocks, block_size, block_size*sizeof(int)>>>(n_local, d_retval, xd, id);
+  cudaDeviceSynchronize();
+  cudaMemcpy(h_retval, d_retval, num_blocks*sizeof(int), cudaMemcpyDeviceToHost);
+
+  int sum_result = 0;
+  for(int i = 0; i < num_blocks; i++) {
+    sum_result += h_retval[i];
+  }
+  free(h_retval);
+  cudaFree(d_retval);
+
+  return sum_result;
 }
 
@@ ... @@
-  thrust::device_vector<double> v_temp(n);
-  double* dv_ptr = thrust::raw_pointer_cast(v_temp.data());
+//  thrust::device_vector<double> v_temp(n);
+//  double* dv_ptr = thrust::raw_pointer_cast(v_temp.data());
+//  return thrust::reduce(thrust::device, v_temp.begin(), v_temp.end(), (int)0, thrust::plus<int>());
 
-  hiop::cuda::is_posive_w_pattern_kernel(n, dv_ptr, d1, id);
-
-  return thrust::reduce(thrust::device, v_temp.begin(), v_temp.end(), (int)0, thrust::plus<int>());
+  int irev = hiop::cuda::is_posive_w_pattern_kernel(n, d1, id);
+  return irev;
 }
 
 /** @brief compute min(d1) for selected elements*/
diff --git a/src/LinAlg/VectorCudaKernels.hpp b/src/LinAlg/VectorCudaKernels.hpp
index 3d8cf6cba..00e8d4870 100644
--- a/src/LinAlg/VectorCudaKernels.hpp
+++ b/src/LinAlg/VectorCudaKernels.hpp
@@ -159,8 +159,8 @@ void add_linear_damping_term_kernel(int n_local,
                                     double ct);
 
-/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
-void is_posive_w_pattern_kernel(int n_local,
-                                double* yd,
+/** @brief returns the number of elements with xd[i] > 0.0 among those with id[i] == 1.0 */
+int is_posive_w_pattern_kernel(int n_local,
+//                              double* yd,
                                 const double* xd,
                                 const double* id);
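
Note: a minimal host-side sanity check for the reworked reduction path is sketched below. It is illustrative only and not part of the patch; the include path, the main() harness, and the test sizes/values are assumptions. It relies only on the int hiop::cuda::is_posive_w_pattern_kernel(int, const double*, const double*) interface declared above.

// hypothetical standalone harness (file name and values are illustrative)
#include <cstdio>
#include <cuda_runtime.h>
#include "VectorCudaKernels.hpp"   // assumed include path for the declaration above

int main()
{
  const int n = 1000;
  double hx[n];
  double hid[n];
  int expected = 0;
  for(int i = 0; i < n; i++) {
    hx[i]  = (i % 3 == 0) ? 1.0 : -1.0;  // every third entry is positive
    hid[i] = (i % 2 == 0) ? 1.0 :  0.0;  // pattern selects the even entries
    expected += (hid[i] == 1.0 && hx[i] > 0.0) ? 1 : 0;
  }

  double* dx  = nullptr;
  double* did = nullptr;
  cudaMalloc(&dx,  n*sizeof(double));
  cudaMalloc(&did, n*sizeof(double));
  cudaMemcpy(dx,  hx,  n*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(did, hid, n*sizeof(double), cudaMemcpyHostToDevice);

  // device-side per-block counting followed by the host sum of the block results
  const int got = hiop::cuda::is_posive_w_pattern_kernel(n, dx, did);
  std::printf("expected %d, got %d\n", expected, got);

  cudaFree(dx);
  cudaFree(did);
  return (got == expected) ? 0 : 1;
}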