Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[STF] Implement a reduce algorithm over CUB #3122

Draft
wants to merge 29 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ab1c73f
Copy the existing 08-cub-reduce example in another file indicating it…
caugonnet Dec 11, 2024
cac9438
Start to implement a reduce method over CUB
caugonnet Dec 11, 2024
a012450
Save WIP : start to implement transform_reduce on top of CUB
caugonnet Dec 11, 2024
b023269
Do the reduction part as well
caugonnet Dec 11, 2024
c920e13
clang-format
caugonnet Dec 11, 2024
c3468c2
Better types in transform_reduce
caugonnet Dec 11, 2024
a9db17c
Minor code improvements
caugonnet Dec 12, 2024
023954a
compute shape size once
caugonnet Dec 12, 2024
90db5a5
Merge branch 'main' into stf_cub_reduce
caugonnet Dec 12, 2024
2794242
Explain the algorithm and rename scalar to scalar_view
caugonnet Dec 12, 2024
38c820b
Remove some piece of code intended to use ->* in transform_reduce
caugonnet Dec 12, 2024
e9d4d3e
Get build to work
andralex Dec 12, 2024
616ff99
Use chaining of operator->* in transform_reduce
andralex Dec 13, 2024
493371f
Merge branch 'main' into stf_cub_reduce
caugonnet Dec 16, 2024
30dae91
Implement an example of exclusive scan over slices
caugonnet Dec 16, 2024
afa8a21
clang-format
caugonnet Dec 16, 2024
88baa83
improve reduce example to take a logical data of a slice only
caugonnet Dec 16, 2024
f56775f
fix some constness issue
caugonnet Dec 16, 2024
2469994
fix some constness issue
caugonnet Dec 16, 2024
318eb7b
Merge branch 'main' into stf_cub_reduce
caugonnet Dec 16, 2024
0963c9f
Implement transform_exclusive_scan
caugonnet Dec 16, 2024
26b41e1
ReduceOpWrapper -> LambdaOpWrapper
caugonnet Dec 16, 2024
3b6ccfe
Use the ->* operator of the task to support graphs
caugonnet Dec 17, 2024
fc33c3a
clang-format
caugonnet Dec 17, 2024
649fde6
WIP : Try to move CUB algorithms to utilities
caugonnet Dec 17, 2024
872cd4d
Temporary experiment to use scopes again
caugonnet Dec 17, 2024
b9156bb
Save WIP
caugonnet Dec 17, 2024
fdc91a2
Make things work, first pass
andralex Dec 18, 2024
20ab981
Merge branch 'NVIDIA:main' into stf_cub_reduce
caugonnet Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions cudax/examples/stf/08-cub-exclusive-scan.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
* @brief Example of an exclusive scan implemented using CUB
*/

#include <cub/cub.cuh>

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

//! Device-callable adapter that wraps an arbitrary binary functor so it can
//! be handed to CUB device-wide algorithms. The wrapped functor is stored by
//! value and invoked from device code.
template <typename BinaryOp>
struct OpWrapper
{
  OpWrapper(BinaryOp _op)
      : op(mv(_op))
  {}

  //! Forward the call to the wrapped functor.
  template <typename T>
  __device__ __forceinline__ T operator()(const T& lhs, const T& rhs) const
  {
    return op(lhs, rhs);
  }

  BinaryOp op; // the user-provided binary operation
};

template <typename Ctx, typename InT, typename OutT, typename BinaryOp>
void exclusive_scan(
Ctx& ctx, logical_data<slice<InT>> in_data, logical_data<slice<OutT>> out_data, BinaryOp&& op, OutT init_val)
{
size_t nitems = in_data.shape().size();

// Determine temporary device storage requirements
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveScan(
d_temp_storage,
temp_storage_bytes,
(InT*) nullptr,
(OutT*) nullptr,
OpWrapper<BinaryOp>(op),
init_val,
in_data.shape().size(),
0);

auto ltemp = ctx.logical_data(shape_of<slice<char>>(temp_storage_bytes));

ctx.task(in_data.read(), out_data.write(), ltemp.write())
->*[&op, init_val, nitems, temp_storage_bytes](cudaStream_t stream, auto d_in, auto d_out, auto d_temp) {
size_t d_temp_size = shape(d_temp).size();
cub::DeviceScan::ExclusiveScan(
(void*) d_temp.data_handle(),
d_temp_size,
(InT*) d_in.data_handle(),
(OutT*) d_out.data_handle(),
OpWrapper<BinaryOp>(op),
init_val,
nitems,
stream);
};
}

/**
 * @brief Build a random input, run the CUB-based exclusive scan through the
 * given context type, and check the result against a host reference.
 */
template <typename Ctx>
void run()
{
  Ctx ctx;

  const size_t N = 1024 * 16;

  ::std::vector<int> X(N);
  ::std::vector<int> out(N);
  ::std::vector<int> ref_out(N);

  // Fill the input with pseudo-random values while accumulating the host
  // reference: ref_out[i] is the sum of all X[j] with j < i (exclusive sum).
  int running = 0;
  for (size_t ind = 0; ind < N; ind++)
  {
    X[ind]       = rand() % N;
    ref_out[ind] = running;
    running += X[ind];
  }

  auto lX   = ctx.logical_data(X.data(), {N});
  auto lout = ctx.logical_data(out.data(), {N});

  // Exclusive prefix sum with + as the combination operator and 0 as the
  // initial value.
  exclusive_scan(
    ctx,
    lX,
    lout,
    [] __device__(const int& a, const int& b) {
      return a + b;
    },
    0);

  ctx.finalize();

  // After finalize() the write-back to `out` has completed: validate.
  for (size_t i = 0; i < N; i++)
  {
    _CCCL_ASSERT(ref_out[i] == out[i], "Incorrect result");
  }
}

int main()
{
  // Exercise the example with the stream backend; the graph backend is kept
  // disabled here (presumably not supported yet for this example — confirm).
  run<stream_ctx>();
  // run<graph_ctx>();
}
95 changes: 95 additions & 0 deletions cudax/examples/stf/08-cub-reduce-lowlevel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
* @brief Example of a reduction implemented using CUB kernels
*/

#include <thrust/device_vector.h>

#include <cuda/experimental/stf.cuh>

#include <vector>

using namespace cuda::experimental::stf;

/**
 * @brief Sum-reduction kernel: each thread accumulates a grid-stride partial
 * sum of `values`, the block combines those partials with cub::BlockReduce,
 * and thread 0 of each block writes one result into `partials`.
 *
 * @tparam BLOCK_THREADS number of threads per block (must match the launch)
 * @param values   input slice to reduce
 * @param partials output slice, one entry per block (indexed by blockIdx.x)
 * @param nelems   number of elements of `values` to reduce
 */
template <int BLOCK_THREADS, typename T>
__global__ void reduce(slice<const T> values, slice<T> partials, size_t nelems)
{
  using namespace cub;
  typedef BlockReduce<T, BLOCK_THREADS> BlockReduceT;

  auto thread_id = BLOCK_THREADS * blockIdx.x + threadIdx.x;

  // Local reduction: grid-stride loop so any grid size covers all elements
  T local_sum = 0;
  for (size_t ind = thread_id; ind < nelems; ind += blockDim.x * gridDim.x)
  {
    local_sum += values(ind);
  }

  // Shared scratch space required by BlockReduce
  __shared__ typename BlockReduceT::TempStorage temp_storage;

  // Per-thread tile data; only thread 0 receives the valid block-wide sum
  T result = BlockReduceT(temp_storage).Sum(local_sum);

  if (threadIdx.x == 0)
  {
    partials(blockIdx.x) = result;
  }
}

template <typename Ctx>
void run()
{
Ctx ctx;

const size_t N = 1024 * 16;
const size_t BLOCK_SIZE = 128;
const size_t num_blocks = 32;

int *X, ref_tot;

X = new int[N];
ref_tot = 0;

for (size_t ind = 0; ind < N; ind++)
{
X[ind] = rand() % N;
ref_tot += X[ind];
}

auto values = ctx.logical_data(X, {N});
auto partials = ctx.logical_data(shape_of<slice<int>>(num_blocks));
auto result = ctx.logical_data(shape_of<slice<int>>(1));

ctx.task(values.read(), partials.write(), result.write())->*[&](auto stream, auto values, auto partials, auto result) {
// reduce values into partials
reduce<BLOCK_SIZE, int><<<num_blocks, BLOCK_SIZE, 0, stream>>>(values, partials, N);

// reduce partials on a single block into result
reduce<BLOCK_SIZE, int><<<1, BLOCK_SIZE, 0, stream>>>(partials, result, num_blocks);
};

ctx.host_launch(result.read())->*[&](auto p) {
if (p(0) != ref_tot)
{
fprintf(stderr, "INCORRECT RESULT: p sum = %d, ref tot = %d\n", p(0), ref_tot);
abort();
}
};

ctx.finalize();
}

int main()
{
  // Run the example against both the stream and the graph backends.
  run<stream_ctx>();
  run<graph_ctx>();
}
100 changes: 59 additions & 41 deletions cudax/examples/stf/08-cub-reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,49 +10,73 @@

/**
* @file
* @brief Example of reduction implementing using CUB kernels
* @brief Example of reduction implementing using CUB
*/

#include <thrust/device_vector.h>
#include <cub/cub.cuh>

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

template <int BLOCK_THREADS, typename T>
__global__ void reduce(slice<const T> values, slice<T> partials, size_t nelems)
template <typename BinaryOp>
struct OpWrapper
{
using namespace cub;
typedef BlockReduce<T, BLOCK_THREADS> BlockReduceT;
OpWrapper(BinaryOp _op)
: op(mv(_op)) {};

auto thread_id = BLOCK_THREADS * blockIdx.x + threadIdx.x;

// Local reduction
T local_sum = 0;
for (size_t ind = thread_id; ind < nelems; ind += blockDim.x * gridDim.x)
template <typename T>
__device__ __forceinline__ T operator()(const T& a, const T& b) const
{
local_sum += values(ind);
return op(a, b);
}

__shared__ typename BlockReduceT::TempStorage temp_storage;

// Per-thread tile data
T result = BlockReduceT(temp_storage).Sum(local_sum);
BinaryOp op;
};

if (threadIdx.x == 0)
{
partials(blockIdx.x) = result;
}
template <typename T, typename Ctx, typename BinaryOp>
auto reduce(Ctx& ctx, logical_data<slice<T>> data, BinaryOp&& op, T init_val)
{
auto result = ctx.logical_data(shape_of<scalar_view<T>>());

// Determine temporary device storage requirements
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(
d_temp_storage,
temp_storage_bytes,
(T*) nullptr,
(T*) nullptr,
data.shape().size(),
OpWrapper<BinaryOp>(op),
init_val,
0);

auto ltemp = ctx.logical_data(shape_of<slice<char>>(temp_storage_bytes));

ctx.task(data.read(), result.write(), ltemp.write())
->*[&op, init_val, temp_storage_bytes](cudaStream_t stream, auto d_data, auto d_result, auto d_temp) {
size_t d_temp_size = shape(d_temp).size();
cub::DeviceReduce::Reduce(
(void*) d_temp.data_handle(),
d_temp_size,
(T*) d_data.data_handle(),
(T*) d_result.addr,
shape(d_data).size(),
OpWrapper<BinaryOp>(op),
init_val,
stream);
};

return result;
}

template <typename Ctx>
void run()
{
Ctx ctx;

const size_t N = 1024 * 16;
const size_t BLOCK_SIZE = 128;
const size_t num_blocks = 32;
const size_t N = 1024 * 16;

int *X, ref_tot;

Expand All @@ -65,31 +89,25 @@ void run()
ref_tot += X[ind];
}

auto values = ctx.logical_data(X, {N});
auto partials = ctx.logical_data(shape_of<slice<int>>(num_blocks));
auto result = ctx.logical_data(shape_of<slice<int>>(1));

ctx.task(values.read(), partials.write(), result.write())->*[&](auto stream, auto values, auto partials, auto result) {
// reduce values into partials
reduce<BLOCK_SIZE, int><<<num_blocks, BLOCK_SIZE, 0, stream>>>(values, partials, N);
auto values = ctx.logical_data(X, {N});

// reduce partials on a single block into result
reduce<BLOCK_SIZE, int><<<1, BLOCK_SIZE, 0, stream>>>(partials, result, num_blocks);
};
// int should be deduced from "values"...
auto lresult = reduce(
ctx,
values,
[] __device__(const int& a, const int& b) {
return a + b;
},
0);

ctx.host_launch(result.read())->*[&](auto p) {
if (p(0) != ref_tot)
{
fprintf(stderr, "INCORRECT RESULT: p sum = %d, ref tot = %d\n", p(0), ref_tot);
abort();
}
};
int result = ctx.wait(lresult);
_CCCL_ASSERT(result == ref_tot, "Incorrect result");

ctx.finalize();
}

int main()
{
run<stream_ctx>();
run<graph_ctx>();
// run<graph_ctx>();
}
Loading