Skip to content

Commit

Permalink
Remove all warm-up runs from Thrust benchmarks (#2838)
Browse the repository at this point in the history
NVBench handles warm-up runs.

Fixes: #2836
  • Loading branch information
bernhardmgruber authored Nov 15, 2024
1 parent 1dd5bc7 commit f4d358a
Show file tree
Hide file tree
Showing 34 changed files with 4 additions and 70 deletions.
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/adjacent_difference/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::adjacent_difference(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/adjacent_difference/custom.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), output.begin(), custom_op<T>{42});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::adjacent_difference(policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), custom_op<T>{42});
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/adjacent_difference/in_place.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), vec.begin(), vec.end(), vec.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::adjacent_difference(policy(alloc, launch), vec.begin(), vec.end(), vec.begin());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/copy/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::copy(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::copy(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/copy/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(selected_elements);

caching_allocator_t alloc;
thrust::copy_if(policy(alloc), input.cbegin(), input.cend(), output.begin(), select_op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::copy_if(policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), select_op);
});
Expand Down
1 change: 0 additions & 1 deletion thrust/benchmarks/bench/equal/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ static void benchmark(nvbench::state& state, nvbench::type_list<T>)
// of `elements` corresponds to the
// actual elements read in an early
// exit
do_not_optimize(thrust::equal(policy(alloc), a.begin(), a.end(), b.begin()));
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
do_not_optimize(thrust::equal(policy(alloc, launch), a.begin(), a.end(), b.begin()));
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/fill/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::fill(policy(alloc), output.begin(), output.end(), T{42});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::fill(policy(alloc, launch), output.begin(), output.end(), T{42});
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/for_each/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)

square_t<T> op{};
caching_allocator_t alloc;
thrust::for_each(policy(alloc), in.begin(), in.end(), op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::for_each(policy(alloc, launch), in.begin(), in.end(), op);
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/inner_product/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(1);

caching_allocator_t alloc;
thrust::inner_product(policy(alloc), lhs.begin(), lhs.end(), rhs.begin(), T{0});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::inner_product(policy(alloc, launch), lhs.begin(), lhs.end(), rhs.begin(), T{0});
});
Expand Down
3 changes: 0 additions & 3 deletions thrust/benchmarks/bench/merge/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::merge(
policy(alloc), in.cbegin(), in.cbegin() + elements_in_lhs, in.cbegin() + elements_in_lhs, in.cend(), out.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::merge(
policy(alloc, launch),
Expand Down
8 changes: 0 additions & 8 deletions thrust/benchmarks/bench/partition/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::partition_copy(
policy(alloc),
input.cbegin(),
input.cend(),
output.begin(),
thrust::make_reverse_iterator(output.begin() + elements),
select_op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::partition_copy(
policy(alloc, launch),
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/reduce/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(1);

caching_allocator_t alloc;
do_not_optimize(thrust::reduce(policy(alloc), in.begin(), in.end()));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
do_not_optimize(thrust::reduce(policy(alloc, launch), in.begin(), in.end()));
});
Expand Down
3 changes: 0 additions & 3 deletions thrust/benchmarks/bench/reduce/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,6 @@ static void basic(nvbench::state& state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<ValueT>(unique_keys);

caching_allocator_t alloc;
thrust::reduce_by_key(
policy(alloc), in_keys.begin(), in_keys.end(), in_vals.begin(), out_keys.begin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::reduce_by_key(
policy(alloc, launch), in_keys.begin(), in_keys.end(), in_vals.begin(), out_keys.begin(), out_vals.begin());
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<ValueT>(elements);

caching_allocator_t alloc;
thrust::exclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::exclusive_scan_by_key(policy(alloc, launch), keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/scan/exclusive/max.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::exclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin(), T{}, max_t{});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::exclusive_scan(policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), T{}, max_t{});
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/scan/exclusive/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::exclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::exclusive_scan(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/scan/inclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<ValueT>(elements);

caching_allocator_t alloc;
thrust::inclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::inclusive_scan_by_key(policy(alloc, launch), keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/scan/inclusive/max.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::inclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin(), max_t{});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::inclusive_scan(policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), max_t{});
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/scan/inclusive/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::inclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::inclusive_scan(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
});
Expand Down
1 change: 1 addition & 0 deletions thrust/benchmarks/bench/set_operations/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T>, OpT op)
thrust::sort(input.begin() + elements_in_A, input.end());

caching_allocator_t alloc;
// not a warm-up run, we need to run once to determine the size of the output
const auto result_ends =
op(policy(alloc),
input.cbegin(),
Expand Down
1 change: 1 addition & 0 deletions thrust/benchmarks/bench/set_operations/by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ static void basic(nvbench::state& state, nvbench::type_list<KeyT, ValueT>, OpT o
thrust::sort(in_keys.begin() + elements_in_A, in_keys.end());

caching_allocator_t alloc;
// not a warm-up run, we need to run once to determine the size of the output
auto result_ends = op(
policy(alloc),
in_keys.cbegin(),
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/shuffle/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)

auto do_engine = [&](auto&& engine_constructor) {
caching_allocator_t alloc;
thrust::shuffle(policy(alloc), data.begin(), data.end(), engine_constructor());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::shuffle(policy(alloc, launch), data.begin(), data.end(), engine_constructor());
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::sort(policy(alloc), vec.begin(), vec.end());

state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
vec = input;
timer.start();
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/sort/keys_custom.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
thrust::sort(policy(alloc), vec.begin(), vec.end(), less_t{});

state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
vec = input;
timer.start();
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/sort/pairs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ static void basic(nvbench::state& state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<ValueT>(elements);

caching_allocator_t alloc;
thrust::sort_by_key(policy(alloc), keys.begin(), keys.end(), vals.begin());

state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
keys = in_keys;
vals = in_vals;
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/sort/pairs_custom.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ static void basic(nvbench::state& state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<ValueT>(elements);

caching_allocator_t alloc;
thrust::sort_by_key(policy(alloc), keys.begin(), keys.end(), vals.begin(), less_t{});

state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
keys = in_keys;
vals = in_vals;
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/tabulate/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)

caching_allocator_t alloc;
seg_size_t<T> op{thrust::raw_pointer_cast(input.data())};
thrust::tabulate(policy(alloc), output.begin(), output.end(), op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::tabulate(policy(alloc, launch), output.begin(), output.end(), op);
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/transform/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ template <typename... Args>
void bench_transform(nvbench::state& state, Args&&... args)
{
caching_allocator_t alloc; // transform shouldn't allocate, but let's be consistent
thrust::transform(policy(alloc), ::cuda::std::forward<Args>(args)...); // warmup (queries and caches occupancy)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::transform(policy(alloc, launch), ::cuda::std::forward<Args>(args)...);
});
Expand Down
2 changes: 0 additions & 2 deletions thrust/benchmarks/bench/transform_reduce/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(1);

caching_allocator_t alloc;
do_not_optimize(thrust::transform_reduce(policy(alloc), in.begin(), in.end(), square_t<T>{}, T{}, thrust::plus<T>{}));

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
do_not_optimize(
thrust::transform_reduce(policy(alloc, launch), in.begin(), in.end(), square_t<T>{}, T{}, thrust::plus<T>{}));
Expand Down
1 change: 1 addition & 0 deletions thrust/benchmarks/bench/unique/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
thrust::device_vector<T> output(elements);

caching_allocator_t alloc;
// not a warm-up run, we need to run once to determine the size of the output
const auto new_end = thrust::unique_copy(policy(alloc), input.cbegin(), input.cend(), output.begin());
const std::size_t unique_items = thrust::distance(output.begin(), new_end);

Expand Down
1 change: 1 addition & 0 deletions thrust/benchmarks/bench/unique/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ static void basic(nvbench::state& state, nvbench::type_list<KeyT, ValueT>)
thrust::device_vector<ValueT> out_vals(elements);

caching_allocator_t alloc;
// not a warm-up run, we need to run once to determine the size of the output
const auto [new_key_end, new_val_end] = thrust::unique_by_key_copy(
policy(alloc), in_keys.cbegin(), in_keys.cend(), in_vals.cbegin(), out_keys.begin(), out_vals.begin());

Expand Down
3 changes: 0 additions & 3 deletions thrust/benchmarks/bench/vectorized_search/base.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_element_count(needles);

caching_allocator_t alloc;
thrust::binary_search(
policy(alloc), data.begin(), data.begin() + elements, data.begin() + elements, data.end(), result.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::binary_search(
policy(alloc, launch), data.begin(), data.begin() + elements, data.begin() + elements, data.end(), result.begin());
Expand Down
3 changes: 0 additions & 3 deletions thrust/benchmarks/bench/vectorized_search/lower_bound.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_element_count(needles);

caching_allocator_t alloc;
thrust::lower_bound(
policy(alloc), data.begin(), data.begin() + elements, data.begin() + elements, data.end(), result.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::lower_bound(
policy(alloc, launch), data.begin(), data.begin() + elements, data.begin() + elements, data.end(), result.begin());
Expand Down
3 changes: 0 additions & 3 deletions thrust/benchmarks/bench/vectorized_search/upper_bound.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
state.add_element_count(needles);

caching_allocator_t alloc;
thrust::upper_bound(
policy(alloc), data.begin(), data.begin() + elements, data.begin() + elements, data.end(), result.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::upper_bound(
policy(alloc, launch), data.begin(), data.begin() + elements, data.begin() + elements, data.end(), result.begin());
Expand Down

0 comments on commit f4d358a

Please sign in to comment.