Optimize non fixed size segmented reduce for small segments using max_segment_size#7718
Optimize non fixed size segmented reduce for small segments using max_segment_size #7718 — srinivasyadav18 wants to merge 3 commits into NVIDIA:main from
Conversation
This comment has been minimized.
This comment has been minimized.
😬 CI Workflow Results — 🟥 Finished in 2h 44m: Pass: 37%/104 | Total: 4d 10h | Max: 2h 44m | Hits: 89%/39834. See results here. |
bernhardmgruber
left a comment
There was a problem hiding this comment.
I think this PR is massively complicated by the fact that the segmented reduction dispatch was already refactored to the new tuning API, and the fixed size segmented dispatch was not. I strongly suggest to refactor the fixed size dispatch first (#7641) and then rebase this PR.
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | ||
| // SPDX-License-Identifier: BSD-3 |
There was a problem hiding this comment.
Critical: Please use the correct license header. See https://github.com/NVIDIA/cccl/wiki/Cpp-Coding-Guidelines. Applies to all new files.
| using value_types = nvbench::type_list<int32_t, int64_t, float, double>; | ||
| using op_t = cub::detail::arg_max; | ||
| using some_offset_types = nvbench::type_list<int32_t>; |
There was a problem hiding this comment.
Critical: Please apply the build time optimization as documented here: https://nvidia.github.io/cccl/unstable/cub/tuning.html#nvbench-attributes. Applies to variable_sum.cu as well.
| [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> segmented_reduce_policy | ||
| { | ||
| constexpr auto policies = | ||
| policy_selector{classify_type<AccumT>, classify_op<ReductionOpT>, int{sizeof(OffsetT)}, int{sizeof(AccumT)}}; | ||
| return policies(arch); | ||
| using fs = typename policy_hub<AccumT, OffsetT, ReductionOpT>::MaxPolicy; | ||
| using rp = typename fs::ReducePolicy; | ||
| using sp = typename fs::SmallReducePolicy; | ||
| using mp = typename fs::MediumReducePolicy; | ||
| const auto base = reduce::agent_reduce_policy{ | ||
| rp::BLOCK_THREADS, rp::ITEMS_PER_THREAD, rp::VECTOR_LOAD_LENGTH, rp::BLOCK_ALGORITHM, rp::LOAD_MODIFIER}; | ||
| return segmented_reduce_policy{ | ||
| base, | ||
| agent_warp_reduce_policy{ | ||
| base.block_threads, sp::WARP_THREADS, sp::ITEMS_PER_THREAD, sp::VECTOR_LOAD_LENGTH, sp::LOAD_MODIFIER}, | ||
| agent_warp_reduce_policy{ | ||
| base.block_threads, mp::WARP_THREADS, mp::ITEMS_PER_THREAD, mp::VECTOR_LOAD_LENGTH, mp::LOAD_MODIFIER}}; | ||
| } |
There was a problem hiding this comment.
Critical: This is breaking the tuning API design, since it decouples the policy_selector_from_types from its corresponding policy_selector. The former must always be implemented in terms of the latter.
There was a problem hiding this comment.
This should also fix the policy mismatch error you are seeing in the c parallel and python tests
| ctk_path, | ||
| "-rdc=true", | ||
| "-dlto", | ||
| "-DCUB_DISABLE_CDP", |
There was a problem hiding this comment.
Important: you need to add "-default-device" to be able to compile the new lambda you added to the kernel, see transform.cu for example
| } | ||
| }; | ||
|
|
||
| using dispatch_t = cub::detail::reduce::DispatchFixedSizeSegmentedReduce< |
There was a problem hiding this comment.
Nit: it seems that this alias was useful, I would reintroduce it
|
|
||
| // Generate input data | ||
| thrust::device_vector<T> in = generate(elements); | ||
| thrust::device_vector<output_t> out(num_segments); |
There was a problem hiding this comment.
Nit:
| thrust::device_vector<output_t> out(num_segments); | |
| thrust::device_vector<output_t> out(num_segments, thrust::no_init); |
| auto get_in = [&] { | ||
| if constexpr (is_argmin || is_argmax) | ||
| { | ||
| return d_indexed_in; | ||
| } | ||
| else | ||
| { | ||
| return d_raw_in; | ||
| } | ||
| }; | ||
|
|
||
| using input_it_t = decltype(get_in()); | ||
| input_it_t d_in = get_in(); |
There was a problem hiding this comment.
| auto get_in = [&] { | |
| if constexpr (is_argmin || is_argmax) | |
| { | |
| return d_indexed_in; | |
| } | |
| else | |
| { | |
| return d_raw_in; | |
| } | |
| }; | |
| using input_it_t = decltype(get_in()); | |
| input_it_t d_in = get_in(); | |
| auto d_in = [&] { | |
| if constexpr (is_argmin || is_argmax) | |
| { | |
| return d_indexed_in; | |
| } | |
| else | |
| { | |
| return d_raw_in; | |
| } | |
| }(); |
|
|
||
| // Create wrapped iterator for argmin/argmax operations | ||
| [[maybe_unused]] auto d_indexed_in = thrust::make_transform_iterator( | ||
| thrust::counting_iterator<::cuda::std::int64_t>{0}, |
There was a problem hiding this comment.
Suggestion: if possible
| thrust::counting_iterator<::cuda::std::int64_t>{0}, | |
| cuda::counting_iterator<::cuda::std::int64_t>{0}, |
| {}, | ||
| guaranteed_max_seg_size); | ||
|
|
||
| thrust::device_vector<nvbench::uint8_t> temp(temp_size); |
There was a problem hiding this comment.
| thrust::device_vector<nvbench::uint8_t> temp(temp_size); | |
| thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init); |
| [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> segmented_reduce_policy | ||
| { | ||
| constexpr auto policies = | ||
| policy_selector{classify_type<AccumT>, classify_op<ReductionOpT>, int{sizeof(OffsetT)}, int{sizeof(AccumT)}}; | ||
| return policies(arch); | ||
| using fs = typename policy_hub<AccumT, OffsetT, ReductionOpT>::MaxPolicy; | ||
| using rp = typename fs::ReducePolicy; | ||
| using sp = typename fs::SmallReducePolicy; | ||
| using mp = typename fs::MediumReducePolicy; | ||
| const auto base = reduce::agent_reduce_policy{ | ||
| rp::BLOCK_THREADS, rp::ITEMS_PER_THREAD, rp::VECTOR_LOAD_LENGTH, rp::BLOCK_ALGORITHM, rp::LOAD_MODIFIER}; | ||
| return segmented_reduce_policy{ | ||
| base, | ||
| agent_warp_reduce_policy{ | ||
| base.block_threads, sp::WARP_THREADS, sp::ITEMS_PER_THREAD, sp::VECTOR_LOAD_LENGTH, sp::LOAD_MODIFIER}, | ||
| agent_warp_reduce_policy{ | ||
| base.block_threads, mp::WARP_THREADS, mp::ITEMS_PER_THREAD, mp::VECTOR_LOAD_LENGTH, mp::LOAD_MODIFIER}}; | ||
| } |
There was a problem hiding this comment.
This should also fix the policy mismatch error you are seeing in the c parallel and python tests
Description
closes #6898
Checklist