Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use programmatic dependent launch in CUB merge sort #3114

Merged
merged 2 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/merge_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ void keys(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: why is this change needed? Did merge become synchronous?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Merge sort now has calls to cudaGridDependencySynchronize, which cause the benchmark to crash if I were to use no_batch.

dispatch_t::Dispatch(
temp_storage,
temp_size,
Expand Down
9 changes: 9 additions & 0 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ struct AgentBlockSort
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(OffsetT tile_base, int num_remaining)
{
ValueT items_local[ITEMS_PER_THREAD];

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

_CCCL_IF_CONSTEXPR (!KEYS_ONLY)
{
_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
Expand All @@ -198,6 +201,7 @@ struct AgentBlockSort
}

CTA_SYNC();
_CCCL_PDL_TRIGGER_NEXT_LAUNCH();

_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
{
Expand Down Expand Up @@ -336,6 +340,8 @@ struct AgentPartition
const OffsetT keys2_beg = keys1_end;
const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size));

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

// The last partition (which is one-past-the-last-tile) is only to mark the end of keys1_end for the merge stage
if (partition_idx + 1 == num_partitions)
{
Expand Down Expand Up @@ -535,6 +541,8 @@ struct AgentMerge
const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

// load keys1 & keys2
KeyT keys_local[ITEMS_PER_THREAD];
if (ping)
Expand Down Expand Up @@ -576,6 +584,7 @@ struct AgentMerge
}

CTA_SYNC();
_CCCL_PDL_TRIGGER_NEXT_LAUNCH();

// use binary search in shared memory
// to find merge path for each of thread
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO

/**
* @brief Kernel that copies data from a batch of given source buffers to their corresponding
* destination buffer. If a buffer's size is to large to be copied by a single thread block, that
* destination buffer. If a buffer's size is too large to be copied by a single thread block, that
* buffer is put into a queue of buffers that will get picked up later on, where multiple blocks
* collaborate on each of these buffers. All other buffers get copied straight away.
 * collaborate on each of these buffers. All other buffers get copied straight away.
*
* @param input_buffer_it [in] Iterator providing the pointers to the source memory buffers
* @param output_buffer_it [in] Iterator providing the pointers to the destination memory buffers
Expand Down
6 changes: 4 additions & 2 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ __launch_bounds__(
VSmemHelperT::discard_temp_storage(temp_storage);
}

// TODO(bgruber): if we put a call to cudaTriggerProgrammaticLaunchCompletion inside this kernel, the tests fail with
// cudaErrorIllegalAddress.
template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
bool ping,
Expand Down Expand Up @@ -618,7 +620,7 @@ struct DispatchMergeSort : SelectedPolicy

// Partition
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
partition_grid_size, threads_per_partition_block, 0, stream)
partition_grid_size, threads_per_partition_block, 0, stream, true)
.doit(DeviceMergeSortPartitionKernel<KeyIteratorT, OffsetT, CompareOpT, KeyT>,
ping,
d_output_keys,
Expand All @@ -645,7 +647,7 @@ struct DispatchMergeSort : SelectedPolicy

// Merge
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream)
static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true)
.doit(
DeviceMergeSortMergeKernel<MaxPolicyT,
KeyInputIteratorT,
Expand Down
53 changes: 53 additions & 0 deletions libcudacxx/include/cuda/std/__cccl/cuda_capabilities.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// Feature-detection helpers for optional CUDA capabilities, currently
// programmatic dependent launch (PDL). The _CCCL_PDL_* macros expand to
// nothing when PDL is unavailable, so call sites need no #if guards.

#ifndef __CCCL_CUDA_CAPABILITIES
#define __CCCL_CUDA_CAPABILITIES

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

// CUDA headers might not be present when using NVRTC, see NVIDIA/cccl#2095 for detail
#if !_CCCL_COMPILER(NVRTC)
#  include <cuda_runtime_api.h>
#endif // !_CCCL_COMPILER(NVRTC)

#include <nv/target>

// True, when programmatic dependent launch is available, otherwise false.
// PDL needs the CUDA 11.8+ toolkit for the launch attribute/runtime APIs.
#define _CCCL_HAS_PDL _CCCL_CUDACC_AT_LEAST(11, 8)
#if _CCCL_HAS_PDL
// Waits for the previous kernel to complete (when it reaches its final membar). Should be put before the first global
// memory access in a kernel. Only emitted for SM90+; a no-op on older architectures.
#  define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaGridDependencySynchronize();)
// Allows the subsequent kernel in the same stream to launch. Can be put anywhere in a kernel.
// Heuristic(ahendriksen): put it after the last load.
#  define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaTriggerProgrammaticLaunchCompletion();)
#else
// PDL unavailable: both hooks compile away to nothing.
#  define _CCCL_PDL_GRID_DEPENDENCY_SYNC()
#  define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()
#endif // _CCCL_HAS_PDL

#endif // __CCCL_CUDA_CAPABILITIES
29 changes: 27 additions & 2 deletions thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <thrust/system/cuda/config.h>

#include <cuda/cmath>
#include <cuda/std/__cccl/cuda_capabilities.h>

THRUST_NAMESPACE_BEGIN

Expand All @@ -53,19 +54,43 @@ struct _CCCL_VISIBILITY_HIDDEN triple_chevron
dim3 const grid;
dim3 const block;
Size const shared_mem;
bool const dependent_launch;
cudaStream_t const stream;

/// Constructs a launcher for the given launch configuration.
/// @param grid_ Grid dimensions for the kernel launch.
/// @param block_ Block dimensions for the kernel launch.
/// @param shared_mem_ Dynamic shared memory in bytes (default 0).
/// @param stream_ Stream to launch on (default: the null stream).
/// @param dependent_launch Launches the kernel using programmatic dependent launch if available.
THRUST_RUNTIME_FUNCTION triple_chevron(
  dim3 grid_, dim3 block_, Size shared_mem_ = 0, cudaStream_t stream_ = nullptr, bool dependent_launch = false)
    : grid(grid_)
    , block(block_)
    , shared_mem(shared_mem_)
    , dependent_launch(dependent_launch)
    , stream(stream_)
{}

/// Launches kernel @p k on the configured grid/block/stream with @p args.
/// Uses programmatic dependent launch (PDL) when requested and available,
/// otherwise falls back to a plain triple-chevron launch.
/// @return The CUDA error state after the launch (launch-config errors only;
///         async execution errors surface at a later synchronizing call).
template <class K, class... Args>
cudaError_t _CCCL_HOST doit_host(K k, Args const&... args) const
{
// NOTE(review): the capabilities header defines _CCCL_HAS_PDL; presumably
// _THRUST_HAS_PDL is derived from it elsewhere — confirm.
#if _THRUST_HAS_PDL
  if (dependent_launch)
  {
    // Opt in to programmatic stream serialization so this kernel may begin
    // launching while its predecessor on the stream is still completing.
    cudaLaunchAttribute attribute[1];
    attribute[0].id                                         = cudaLaunchAttributeProgrammaticStreamSerialization;
    attribute[0].val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t config{};
    config.gridDim          = grid;
    config.blockDim         = block;
    config.dynamicSmemBytes = shared_mem;
    config.stream           = stream;
    config.attrs            = attribute;
    config.numAttrs         = 1;
    cudaLaunchKernelEx(&config, k, args...);
  }
  else
#endif // _THRUST_HAS_PDL
  {
    k<<<grid, block, shared_mem, stream>>>(args...);
  }
  // Peek (rather than get) so the sticky error state is left intact for callers.
  return cudaPeekAtLastError();
}

Expand Down
Loading