Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use programmatic dependent launch in CUB merge sort #3114

Merged
merged 2 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/merge_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ void keys(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: why is this change needed? Did merge become synchronous?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Merge sort now has calls to cudaGridDependencySynchronize, which cause the benchmark to crash if I were to use no_batch.

dispatch_t::Dispatch(
temp_storage,
temp_size,
Expand Down
9 changes: 9 additions & 0 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ struct AgentBlockSort
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(OffsetT tile_base, int num_remaining)
{
ValueT items_local[ITEMS_PER_THREAD];

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

_CCCL_IF_CONSTEXPR (!KEYS_ONLY)
{
_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
Expand All @@ -198,6 +201,7 @@ struct AgentBlockSort
}

CTA_SYNC();
_CCCL_PDL_TRIGGER_NEXT_LAUNCH();

_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
{
Expand Down Expand Up @@ -336,6 +340,8 @@ struct AgentPartition
const OffsetT keys2_beg = keys1_end;
const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size));

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

// The last partition (which is one-past-the-last-tile) is only to mark the end of keys1_end for the merge stage
if (partition_idx + 1 == num_partitions)
{
Expand Down Expand Up @@ -535,6 +541,8 @@ struct AgentMerge
const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

// load keys1 & keys2
KeyT keys_local[ITEMS_PER_THREAD];
if (ping)
Expand Down Expand Up @@ -576,6 +584,7 @@ struct AgentMerge
}

CTA_SYNC();
_CCCL_PDL_TRIGGER_NEXT_LAUNCH();

// use binary search in shared memory
// to find merge path for each of thread
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO

/**
* @brief Kernel that copies data from a batch of given source buffers to their corresponding
* destination buffer. If a buffer's size is to large to be copied by a single thread block, that
* destination buffer. If a buffer's size is too large to be copied by a single thread block, that
* buffer is put into a queue of buffers that will get picked up later on, where multiple blocks
* collaborate on each of these buffers. All other buffers get copied straight away.
 * collaborate on each of these buffers. All other buffers get copied straight away.
*
* @param input_buffer_it [in] Iterator providing the pointers to the source memory buffers
* @param output_buffer_it [in] Iterator providing the pointers to the destination memory buffers
Expand Down
6 changes: 4 additions & 2 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ __launch_bounds__(
VSmemHelperT::discard_temp_storage(temp_storage);
}

// TODO(bgruber): if we put a call to cudaTriggerProgrammaticLaunchCompletion inside this kernel, the tests fail with
// cudaErrorIllegalAddress.
template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
bool ping,
Expand Down Expand Up @@ -618,7 +620,7 @@ struct DispatchMergeSort : SelectedPolicy

// Partition
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
partition_grid_size, threads_per_partition_block, 0, stream)
partition_grid_size, threads_per_partition_block, 0, stream, true)
.doit(DeviceMergeSortPartitionKernel<KeyIteratorT, OffsetT, CompareOpT, KeyT>,
ping,
d_output_keys,
Expand All @@ -645,7 +647,7 @@ struct DispatchMergeSort : SelectedPolicy

// Merge
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream)
static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true)
.doit(
DeviceMergeSortMergeKernel<MaxPolicyT,
KeyInputIteratorT,
Expand Down
53 changes: 53 additions & 0 deletions libcudacxx/include/cuda/std/__cccl/cuda_capabilities.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// Feature-detection helpers for optional CUDA capabilities, currently
// programmatic dependent launch (PDL). The _CCCL_PDL_* macros expand to
// nothing when PDL is unavailable, so call sites need no #if guards.

#ifndef __CCCL_CUDA_CAPABILITIES
#define __CCCL_CUDA_CAPABILITIES

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

// CUDA headers might not be present when using NVRTC, see NVIDIA/cccl#2095 for detail
#if !_CCCL_COMPILER(NVRTC)
#  include <cuda_runtime_api.h>
#endif // !_CCCL_COMPILER(NVRTC)

#include <nv/target>

// True, when programmatic dependent launch is available, otherwise false.
// PDL needs the CUDA 11.8+ toolkit for the launch attribute/runtime APIs.
#define _CCCL_HAS_PDL _CCCL_CUDACC_AT_LEAST(11, 8)
#if _CCCL_HAS_PDL
// Waits for the previous kernel to complete (when it reaches its final membar). Should be put before the first global
// memory access in a kernel. Only emitted for SM90+; a no-op on older architectures.
#  define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaGridDependencySynchronize();)
// Allows the subsequent kernel in the same stream to launch. Can be put anywhere in a kernel.
// Heuristic(ahendriksen): put it after the last load.
#  define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaTriggerProgrammaticLaunchCompletion();)
#else
// PDL unavailable: both hooks compile away to nothing.
#  define _CCCL_PDL_GRID_DEPENDENCY_SYNC()
#  define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()
#endif // _CCCL_HAS_PDL

#endif // __CCCL_CUDA_CAPABILITIES
29 changes: 27 additions & 2 deletions thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <thrust/system/cuda/config.h>

#include <cuda/cmath>
#include <cuda/std/__cccl/cuda_capabilities.h>

THRUST_NAMESPACE_BEGIN

Expand All @@ -53,19 +54,43 @@ struct _CCCL_VISIBILITY_HIDDEN triple_chevron
dim3 const grid;
dim3 const block;
Size const shared_mem;
bool const dependent_launch;
cudaStream_t const stream;

/// Constructs a launcher for the given launch configuration.
/// @param grid_ Grid dimensions for the kernel launch.
/// @param block_ Block dimensions for the kernel launch.
/// @param shared_mem_ Dynamic shared memory in bytes (default 0).
/// @param stream_ Stream to launch on (default: the null stream).
/// @param dependent_launch Launches the kernel using programmatic dependent launch if available.
THRUST_RUNTIME_FUNCTION triple_chevron(
  dim3 grid_, dim3 block_, Size shared_mem_ = 0, cudaStream_t stream_ = nullptr, bool dependent_launch = false)
    : grid(grid_)
    , block(block_)
    , shared_mem(shared_mem_)
    , dependent_launch(dependent_launch)
    , stream(stream_)
{}

/// Launches kernel @p k on the configured grid/block/stream with @p args.
/// Uses programmatic dependent launch (PDL) when requested and available,
/// otherwise falls back to a plain triple-chevron launch.
/// @return The CUDA error state after the launch (launch-config errors only;
///         async execution errors surface at a later synchronizing call).
template <class K, class... Args>
cudaError_t _CCCL_HOST doit_host(K k, Args const&... args) const
{
// NOTE(review): the capabilities header defines _CCCL_HAS_PDL; presumably
// _THRUST_HAS_PDL is derived from it elsewhere — confirm.
#if _THRUST_HAS_PDL
  if (dependent_launch)
  {
    // Opt in to programmatic stream serialization so this kernel may begin
    // launching while its predecessor on the stream is still completing.
    cudaLaunchAttribute attribute[1];
    attribute[0].id                                         = cudaLaunchAttributeProgrammaticStreamSerialization;
    attribute[0].val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t config{};
    config.gridDim          = grid;
    config.blockDim         = block;
    config.dynamicSmemBytes = shared_mem;
    config.stream           = stream;
    config.attrs            = attribute;
    config.numAttrs         = 1;
    cudaLaunchKernelEx(&config, k, args...);
  }
  else
#endif // _THRUST_HAS_PDL
  {
    k<<<grid, block, shared_mem, stream>>>(args...);
  }
  // Peek (rather than get) so the sticky error state is left intact for callers.
  return cudaPeekAtLastError();
}

Expand Down
Loading