Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[STF] Implement a reduce algorithm over CUB #3122

Draft
wants to merge 29 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ab1c73f
Copy the existing 08-cub-reduce example in another file indicating it…
caugonnet Dec 11, 2024
cac9438
Start to implement a reduce method over CUB
caugonnet Dec 11, 2024
a012450
Save WIP : start to implement transform_reduce on top of CUB
caugonnet Dec 11, 2024
b023269
Do the reduction part as well
caugonnet Dec 11, 2024
c920e13
clang-format
caugonnet Dec 11, 2024
c3468c2
Better types in transform_reduce
caugonnet Dec 11, 2024
a9db17c
Minor code improvements
caugonnet Dec 12, 2024
023954a
compute shape size once
caugonnet Dec 12, 2024
90db5a5
Merge branch 'main' into stf_cub_reduce
caugonnet Dec 12, 2024
2794242
Explain the algorithm and rename scalar to scalar_view
caugonnet Dec 12, 2024
38c820b
Remove some piece of code intended to use ->* in transform_reduce
caugonnet Dec 12, 2024
e9d4d3e
Get build to work
andralex Dec 12, 2024
616ff99
Use chaining of operator->* in transform_reduce
andralex Dec 13, 2024
493371f
Merge branch 'main' into stf_cub_reduce
caugonnet Dec 16, 2024
30dae91
Implement an example of exclusive scan over slices
caugonnet Dec 16, 2024
afa8a21
clang-format
caugonnet Dec 16, 2024
88baa83
improve reduce example to take a logical data of a slice only
caugonnet Dec 16, 2024
f56775f
fix some constness issue
caugonnet Dec 16, 2024
2469994
fix some constness issue
caugonnet Dec 16, 2024
318eb7b
Merge branch 'main' into stf_cub_reduce
caugonnet Dec 16, 2024
0963c9f
Implement transform_exclusive_scan
caugonnet Dec 16, 2024
26b41e1
ReduceOpWrapper -> LambdaOpWrapper
caugonnet Dec 16, 2024
3b6ccfe
Use the ->* operator of the task to support graphs
caugonnet Dec 17, 2024
fc33c3a
clang-format
caugonnet Dec 17, 2024
649fde6
WIP : Try to move CUB algorithms to utilities
caugonnet Dec 17, 2024
872cd4d
Temporary experiment to use scopes again
caugonnet Dec 17, 2024
b9156bb
Save WIP
caugonnet Dec 17, 2024
fdc91a2
Make things work, first pass
andralex Dec 18, 2024
20ab981
Merge branch 'NVIDIA:main' into stf_cub_reduce
caugonnet Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions cudax/examples/stf/08-cub-exclusive-scan.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
* @brief Example of an exclusive scan implemented using CUB
*/

#include <cub/cub.cuh>

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

//! Device-callable adapter that wraps an arbitrary binary functor so it can
//! be handed to CUB device-wide algorithms. The wrapped functor is stored by
//! value and invoked from device code.
template <typename BinaryOp>
struct OpWrapper
{
  OpWrapper(BinaryOp _op)
      : op(mv(_op))
  {}

  //! Forward the call to the wrapped functor.
  template <typename T>
  __device__ __forceinline__ T operator()(const T& lhs, const T& rhs) const
  {
    return op(lhs, rhs);
  }

  BinaryOp op; // the user-provided binary operation
};

template <typename Ctx, typename InT, typename OutT, typename BinaryOp>
void exclusive_scan(
Ctx& ctx, logical_data<slice<InT>> in_data, logical_data<slice<OutT>> out_data, BinaryOp&& op, OutT init_val)
{
size_t nitems = in_data.shape().size();

// Determine temporary device storage requirements
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveScan(
d_temp_storage,
temp_storage_bytes,
(InT*) nullptr,
(OutT*) nullptr,
OpWrapper<BinaryOp>(op),
init_val,
in_data.shape().size(),
0);

auto ltemp = ctx.logical_data(shape_of<slice<char>>(temp_storage_bytes));

ctx.task(in_data.read(), out_data.write(), ltemp.write())
->*[&op, init_val, nitems, temp_storage_bytes](cudaStream_t stream, auto d_in, auto d_out, auto d_temp) {
size_t d_temp_size = shape(d_temp).size();
cub::DeviceScan::ExclusiveScan(
(void*) d_temp.data_handle(),
d_temp_size,
(InT*) d_in.data_handle(),
(OutT*) d_out.data_handle(),
OpWrapper<BinaryOp>(op),
init_val,
nitems,
stream);
};
}

/**
 * @brief Build a random input, run the CUB-based exclusive scan through the
 * given context type, and check the result against a host reference.
 */
template <typename Ctx>
void run()
{
  Ctx ctx;

  const size_t N = 1024 * 16;

  ::std::vector<int> X(N);
  ::std::vector<int> out(N);
  ::std::vector<int> ref_out(N);

  // Fill the input with pseudo-random values while accumulating the host
  // reference: ref_out[i] is the sum of all X[j] with j < i (exclusive sum).
  int running = 0;
  for (size_t ind = 0; ind < N; ind++)
  {
    X[ind]       = rand() % N;
    ref_out[ind] = running;
    running += X[ind];
  }

  auto lX   = ctx.logical_data(X.data(), {N});
  auto lout = ctx.logical_data(out.data(), {N});

  // Exclusive prefix sum with + as the combination operator and 0 as the
  // initial value.
  exclusive_scan(
    ctx,
    lX,
    lout,
    [] __device__(const int& a, const int& b) {
      return a + b;
    },
    0);

  ctx.finalize();

  // After finalize() the write-back to `out` has completed: validate.
  for (size_t i = 0; i < N; i++)
  {
    _CCCL_ASSERT(ref_out[i] == out[i], "Incorrect result");
  }
}

int main()
{
  // Exercise the example with the stream backend; the graph backend is kept
  // disabled here (presumably not supported yet for this example — confirm).
  run<stream_ctx>();
  // run<graph_ctx>();
}
95 changes: 95 additions & 0 deletions cudax/examples/stf/08-cub-reduce-lowlevel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
* @brief Example of a reduction implemented using CUB kernels
*/

#include <thrust/device_vector.h>

#include <cuda/experimental/stf.cuh>

#include <vector>

using namespace cuda::experimental::stf;

/**
 * @brief Sum-reduction kernel: each thread accumulates a grid-stride partial
 * sum of `values`, the block combines those partials with cub::BlockReduce,
 * and thread 0 of each block writes one result into `partials`.
 *
 * @tparam BLOCK_THREADS number of threads per block (must match the launch)
 * @param values   input slice to reduce
 * @param partials output slice, one entry per block (indexed by blockIdx.x)
 * @param nelems   number of elements of `values` to reduce
 */
template <int BLOCK_THREADS, typename T>
__global__ void reduce(slice<const T> values, slice<T> partials, size_t nelems)
{
  using namespace cub;
  typedef BlockReduce<T, BLOCK_THREADS> BlockReduceT;

  auto thread_id = BLOCK_THREADS * blockIdx.x + threadIdx.x;

  // Local reduction: grid-stride loop so any grid size covers all elements
  T local_sum = 0;
  for (size_t ind = thread_id; ind < nelems; ind += blockDim.x * gridDim.x)
  {
    local_sum += values(ind);
  }

  // Shared scratch space required by BlockReduce
  __shared__ typename BlockReduceT::TempStorage temp_storage;

  // Per-thread tile data; only thread 0 receives the valid block-wide sum
  T result = BlockReduceT(temp_storage).Sum(local_sum);

  if (threadIdx.x == 0)
  {
    partials(blockIdx.x) = result;
  }
}

template <typename Ctx>
void run()
{
Ctx ctx;

const size_t N = 1024 * 16;
const size_t BLOCK_SIZE = 128;
const size_t num_blocks = 32;

int *X, ref_tot;

X = new int[N];
ref_tot = 0;

for (size_t ind = 0; ind < N; ind++)
{
X[ind] = rand() % N;
ref_tot += X[ind];
}

auto values = ctx.logical_data(X, {N});
auto partials = ctx.logical_data(shape_of<slice<int>>(num_blocks));
auto result = ctx.logical_data(shape_of<slice<int>>(1));

ctx.task(values.read(), partials.write(), result.write())->*[&](auto stream, auto values, auto partials, auto result) {
// reduce values into partials
reduce<BLOCK_SIZE, int><<<num_blocks, BLOCK_SIZE, 0, stream>>>(values, partials, N);

// reduce partials on a single block into result
reduce<BLOCK_SIZE, int><<<1, BLOCK_SIZE, 0, stream>>>(partials, result, num_blocks);
};

ctx.host_launch(result.read())->*[&](auto p) {
if (p(0) != ref_tot)
{
fprintf(stderr, "INCORRECT RESULT: p sum = %d, ref tot = %d\n", p(0), ref_tot);
abort();
}
};

ctx.finalize();
}

int main()
{
  // Run the example against both the stream and the graph backends.
  run<stream_ctx>();
  run<graph_ctx>();
}
100 changes: 59 additions & 41 deletions cudax/examples/stf/08-cub-reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,49 +10,73 @@

/**
* @file
* @brief Example of reduction implementing using CUB kernels
* @brief Example of reduction implementing using CUB
*/

#include <thrust/device_vector.h>
#include <cub/cub.cuh>

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

template <int BLOCK_THREADS, typename T>
__global__ void reduce(slice<const T> values, slice<T> partials, size_t nelems)
template <typename BinaryOp>
struct OpWrapper
{
using namespace cub;
typedef BlockReduce<T, BLOCK_THREADS> BlockReduceT;
OpWrapper(BinaryOp _op)
: op(mv(_op)) {};

auto thread_id = BLOCK_THREADS * blockIdx.x + threadIdx.x;

// Local reduction
T local_sum = 0;
for (size_t ind = thread_id; ind < nelems; ind += blockDim.x * gridDim.x)
template <typename T>
__device__ __forceinline__ T operator()(const T& a, const T& b) const
{
local_sum += values(ind);
return op(a, b);
}

__shared__ typename BlockReduceT::TempStorage temp_storage;

// Per-thread tile data
T result = BlockReduceT(temp_storage).Sum(local_sum);
BinaryOp op;
};

if (threadIdx.x == 0)
{
partials(blockIdx.x) = result;
}
template <typename T, typename Ctx, typename BinaryOp>
auto reduce(Ctx& ctx, logical_data<slice<T>> data, BinaryOp&& op, T init_val)
{
auto result = ctx.logical_data(shape_of<scalar_view<T>>());

// Determine temporary device storage requirements
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(
d_temp_storage,
temp_storage_bytes,
(T*) nullptr,
(T*) nullptr,
data.shape().size(),
OpWrapper<BinaryOp>(op),
init_val,
0);

auto ltemp = ctx.logical_data(shape_of<slice<char>>(temp_storage_bytes));

ctx.task(data.read(), result.write(), ltemp.write())
->*[&op, init_val, temp_storage_bytes](cudaStream_t stream, auto d_data, auto d_result, auto d_temp) {
size_t d_temp_size = shape(d_temp).size();
cub::DeviceReduce::Reduce(
(void*) d_temp.data_handle(),
d_temp_size,
(T*) d_data.data_handle(),
(T*) d_result.addr,
shape(d_data).size(),
OpWrapper<BinaryOp>(op),
init_val,
stream);
};

return result;
}

template <typename Ctx>
void run()
{
Ctx ctx;

const size_t N = 1024 * 16;
const size_t BLOCK_SIZE = 128;
const size_t num_blocks = 32;
const size_t N = 1024 * 16;

int *X, ref_tot;

Expand All @@ -65,31 +89,25 @@ void run()
ref_tot += X[ind];
}

auto values = ctx.logical_data(X, {N});
auto partials = ctx.logical_data(shape_of<slice<int>>(num_blocks));
auto result = ctx.logical_data(shape_of<slice<int>>(1));

ctx.task(values.read(), partials.write(), result.write())->*[&](auto stream, auto values, auto partials, auto result) {
// reduce values into partials
reduce<BLOCK_SIZE, int><<<num_blocks, BLOCK_SIZE, 0, stream>>>(values, partials, N);
auto values = ctx.logical_data(X, {N});

// reduce partials on a single block into result
reduce<BLOCK_SIZE, int><<<1, BLOCK_SIZE, 0, stream>>>(partials, result, num_blocks);
};
// int should be deduced from "values"...
auto lresult = reduce(
ctx,
values,
[] __device__(const int& a, const int& b) {
return a + b;
},
0);

ctx.host_launch(result.read())->*[&](auto p) {
if (p(0) != ref_tot)
{
fprintf(stderr, "INCORRECT RESULT: p sum = %d, ref tot = %d\n", p(0), ref_tot);
abort();
}
};
int result = ctx.wait(lresult);
_CCCL_ASSERT(result == ref_tot, "Incorrect result");

ctx.finalize();
}

int main()
{
run<stream_ctx>();
run<graph_ctx>();
// run<graph_ctx>();
}
Loading