Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add microbenchmarks for various components #1722

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF)
# Build-system feature toggles; each option gates a module or subdirectory.
option(GINKGO_BUILD_TESTS "Generate build files for unit tests" ON)
option(GINKGO_BUILD_EXAMPLES "Build Ginkgo's examples" ON)
option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON)
# OFF by default: the microbenchmarks fetch an external GPU benchmarking
# framework at configure time and are only added from benchmark/CMakeLists.txt,
# so they additionally require GINKGO_BUILD_BENCHMARKS=ON.
option(GINKGO_BUILD_MICROBENCHMARKS "Build Ginkgo's microbenchmarks (requires GINKGO_BUILD_BENCHMARKS)" OFF)
option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON)
option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP})
option(GINKGO_BUILD_MPI "Compile the MPI module" ${GINKGO_HAS_MPI})
Expand Down
3 changes: 3 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ add_subdirectory(tools)
# Benchmark-internal tests are only built together with the main test suite.
if (GINKGO_BUILD_TESTS)
add_subdirectory(test)
endif()
# GPU microbenchmarks are opt-in (separate option, OFF by default) since they
# fetch an external benchmarking framework at configure time.
if(GINKGO_BUILD_MICROBENCHMARKS)
add_subdirectory(gpu-microbenchmarks)
endif()

configure_file(run_all_benchmarks.sh run_all_benchmarks.sh COPYONLY)

Expand Down
40 changes: 40 additions & 0 deletions benchmark/gpu-microbenchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# The fetched gpubench/nvbench framework compiles each benchmark for exactly
# one GPU backend, so CUDA and HIP are mutually exclusive here.
if(GINKGO_BUILD_CUDA AND GINKGO_BUILD_HIP)
message(FATAL_ERROR "gpubench doesn't support CUDA and HIP at the same time")
endif()
if(NOT (GINKGO_BUILD_CUDA OR GINKGO_BUILD_HIP))
message(FATAL_ERROR "gpubench only supports CUDA or HIP")
endif()
# GPU_LANG is the CMake LANGUAGE used for the *.gpu.cpp sources; USE_HIP
# selects the HIP code paths (see the #ifdef USE_HIP blocks in the sources).
if(GINKGO_BUILD_CUDA)
set(GPU_LANG CUDA)
set(USE_HIP OFF)
else()
set(GPU_LANG HIP)
set(USE_HIP ON)
endif()
# Download the benchmarking framework at configure time, pinned to a commit.
message(STATUS "Fetching external gpubench")
include(FetchContent)
FetchContent_Declare(
gpubench
GIT_REPOSITORY https://github.com/upsj/gpubench.git
GIT_TAG 0a5ebdc5aedd3fe5be6b5defeedd35bf43efd231
)
# Populate manually instead of FetchContent_MakeAvailable so the subdirectory
# can be added with EXCLUDE_FROM_ALL (gpubench targets are built on demand
# only). NOTE(review): this Populate/add_subdirectory pattern is deprecated in
# newer CMake in favor of MakeAvailable with EXCLUDE_FROM_ALL — confirm the
# project's minimum CMake version before switching.
FetchContent_GetProperties(gpubench)
if(NOT gpubench_POPULATED)
FetchContent_Populate(gpubench)
add_subdirectory(${gpubench_SOURCE_DIR} ${gpubench_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()

# Registers one single-source GPU microbenchmark.
#
# Compiles <name>.gpu.cpp as CUDA or HIP (whichever backend is enabled),
# links it against nvbench and Ginkgo, and defines the macros the sources
# expect: GKO_COMPILING_<LANG>, GKO_DEVICE_NAMESPACE and, for HIP builds,
# USE_HIP.
function(add_benchmark name)
    set(targetname ${name}-microbench)
    string(TOLOWER ${GPU_LANG} GPU_LANG_LOWER)
    add_executable(${targetname} ${name}.gpu.cpp)
    set_source_files_properties(${name}.gpu.cpp PROPERTIES LANGUAGE ${GPU_LANG})
    target_link_libraries(${targetname} PRIVATE nvbench::main ginkgo)
    target_include_directories(${targetname} PRIVATE ${PROJECT_SOURCE_DIR})
    # --extended-lambda enables the __device__ lambdas used by the sources.
    # -lineinfo is an nvcc-only flag, so it must be restricted to CUDA
    # compilation as well: passed to hipcc/clang it would be parsed as the
    # linker request "-l ineinfo" and break HIP builds.
    target_compile_options(${targetname} PRIVATE
        $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda;-lineinfo>)
    # The sources pick the matching Thrust execution policy via
    # "#ifdef USE_HIP", so the CMake variable has to be forwarded as a
    # compile definition — otherwise HIP builds silently take the CUDA path.
    target_compile_definitions(${targetname} PRIVATE
        GKO_COMPILING_${GPU_LANG}
        GKO_DEVICE_NAMESPACE=${GPU_LANG_LOWER}
        $<$<BOOL:${USE_HIP}>:USE_HIP>)
endfunction(add_benchmark name)

# One call per <name>.gpu.cpp source in this directory.
add_benchmark(memory)
add_benchmark(sorting)
add_benchmark(bitvector)
247 changes: 247 additions & 0 deletions benchmark/gpu-microbenchmarks/bitvector.gpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
// SPDX-FileCopyrightText: 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include <cstdint>

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>

#include <nvbench/nvbench.hpp>

#include <ginkgo/core/base/intrinsics.hpp>
#include <ginkgo/core/base/math.hpp>
#ifdef USE_HIP
#include <thrust/system/hip/detail/execution_policy.h>
#else
#include <thrust/system/cuda/detail/execution_policy.h>
#endif

/**
 * Returns a mask with the `lane` least significant bits set, i.e. a mask
 * selecting all bit positions strictly below `lane`.
 * Requires 0 <= lane < number of bits in MaskType (lane == bit width would
 * shift by the full type width, which is undefined behavior).
 */
template <typename MaskType>
__device__ MaskType prefix_mask(int lane)
{
    const auto bits_below_lane = MaskType{1} << lane;
    return bits_below_lane - MaskType{1};
}

// Problem-size axis: powers of two 2^16 .. 2^28 in steps of four (2^2).
const auto sizes = nvbench::range<int, std::int64_t>(16, 28, 2);
// Threads per threadblock used by every custom kernel below.
const auto threadblock_size = 512;

// Bitmask storage words and rank (prefix-count) index types to benchmark.
using mask_types = nvbench::type_list<std::uint32_t, std::uint64_t>;
using rank_types = nvbench::type_list<std::int32_t, std::int64_t>;


/**
 * Bitvector rank kernel: for each global bit index i, writes the number of
 * set bits before i, computed as the precomputed rank of i's mask word plus
 * the popcount of that word restricted to bits below i.
 * One thread handles one bit; threads beyond `size` exit early.
 */
template <typename MaskType, typename RankType>
__global__ void compute_ranks(const MaskType* __restrict__ masks,
                              const RankType* __restrict__ ranks,
                              RankType* __restrict__ out, int size)
{
    const auto global_id = threadIdx.x + blockIdx.x * blockDim.x;
    if (global_id >= size) {
        return;
    }
    constexpr auto bits_per_word = CHAR_BIT * sizeof(MaskType);
    const auto word_id = global_id / bits_per_word;
    const auto bit_id = global_id % bits_per_word;
    const auto bits_below = masks[word_id] & prefix_mask<MaskType>(bit_id);
    out[global_id] = ranks[word_id] + gko::detail::popcount(bits_below);
}

/**
 * nvbench driver for the rank kernel: benchmarks computing the rank of every
 * bit position of an all-ones bitvector of `size` bits, then validates the
 * result against the expected identity sequence on the host.
 */
template <typename MaskType, typename RankType>
void rank_operation(nvbench::state& state,
nvbench::type_list<MaskType, RankType>)
{
const auto size = static_cast<std::size_t>(state.get_int64("size"));

constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
const auto block_count = gko::ceildiv(size, block_size);
// All-ones masks: every bit is set, so rank(i) == i for the validation below.
thrust::device_vector<MaskType> masks(block_count, ~MaskType{});
thrust::device_vector<RankType> ranks(block_count, 0);
// Precomputed per-word ranks: word w starts after w * block_size set bits.
thrust::sequence(ranks.begin(), ranks.end(), RankType{});
thrust::for_each(ranks.begin(), ranks.end(),
[] __device__(RankType & rank) { rank *= block_size; });
thrust::device_vector<RankType> output(size, 0);
const auto num_threadblocks = gko::ceildiv(size, threadblock_size);

// Throughput reporting: one output element per bit, full read of both inputs.
state.add_element_count(size, "Items");
state.add_global_memory_reads<MaskType>(block_count, "Masks");
state.add_global_memory_reads<RankType>(block_count, "Ranks");
state.add_global_memory_writes<RankType>(size, "OutSize");

// NOTE(review): `size` narrows std::size_t -> int at the kernel call; fine
// for the 2^28 maximum of the size axis, but worth an explicit cast.
state.exec([&](nvbench::launch& launch) {
compute_ranks<<<num_threadblocks, threadblock_size, 0,
launch.get_stream()>>>(
thrust::raw_pointer_cast(masks.data()),
thrust::raw_pointer_cast(ranks.data()),
thrust::raw_pointer_cast(output.data()), size);
});
// compare to reference
auto ref = thrust::host_vector<RankType>(size);
thrust::sequence(ref.begin(), ref.end(), RankType{});
// NOTE(review): std::cout relies on <iostream> being pulled in transitively
// by the thrust/nvbench headers — consider including it explicitly.
if (ref != output) {
std::cout << "FAIL\n";
}
}

// Register for all mask x rank type combinations over the size axis.
NVBENCH_BENCH_TYPES(rank_operation, NVBENCH_TYPE_AXES(mask_types, rank_types))
.set_type_axes_names({"mask", "rank"})
.add_int64_power_of_two_axis("size", sizes);

//

/**
 * nvbench driver benchmarking thrust::lower_bound as the baseline approach
 * for bitvector select: searches every element of a sorted identity sequence
 * for itself, so the expected output equals the input sequence.
 */
template <typename RankType>
void binary_search_operation(nvbench::state& state,
nvbench::type_list<RankType>)
{
const auto size = static_cast<std::size_t>(state.get_int64("size"));

// ranks == queries == [0, 1, ..., size-1]; exact-match lower_bound therefore
// returns each element's own index.
thrust::device_vector<RankType> ranks(size, 0);
thrust::sequence(ranks.begin(), ranks.end(), RankType{});
auto queries = ranks;
thrust::device_vector<RankType> output(size, 0);

state.add_element_count(size, "Items");
state.add_global_memory_reads<RankType>(size, "Ranks");
state.add_global_memory_reads<RankType>(size, "Queries");
state.add_global_memory_writes<RankType>(size, "OutSize");

// exec_tag::sync: thrust may synchronize internally, so nvbench must time
// with explicit synchronization instead of stream-ordered measurement.
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
#ifdef USE_HIP
auto policy = thrust::hip::par.on(launch.get_stream());
#else
auto policy = thrust::cuda::par.on(launch.get_stream());
#endif
thrust::lower_bound(policy, ranks.begin(), ranks.end(), queries.begin(),
queries.end(), output.begin());
});
// Validate: result of searching the identity sequence for itself.
if (output != ranks) {
std::cout << "FAIL\n";
}
}

NVBENCH_BENCH_TYPES(binary_search_operation, NVBENCH_TYPE_AXES(rank_types))
.add_int64_power_of_two_axis("size", sizes);

//

/**
 * Bitvector select kernel: for each index i, finds the position of the
 * (rank+1)-th set bit inside i's mask word via a bisection over the word,
 * halving the candidate range and comparing the popcount of the lower half
 * against the requested rank in each step.
 * One thread handles one query; threads beyond `size` do nothing.
 */
template <typename MaskType>
__global__ void compute_select(const MaskType* __restrict__ masks,
                               int* __restrict__ out, int size)
{
    const auto i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < size) {
        constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
        int offset = 0;
        const auto mask = masks[i / block_size];
        // Use the global index, not threadIdx.x: the original expression only
        // produced the correct rank because the threadblock size (512) happens
        // to be a multiple of every mask width. Deriving it from i removes
        // that hidden coupling to the launch configuration.
        const auto rank = i % block_size;
        for (int range_size = block_size; range_size > 1; range_size /= 2) {
            const auto mid = offset + range_size / 2;
            const auto half_count =
                gko::detail::popcount(mask & prefix_mask<MaskType>(mid));
            // Keep the upper half iff the lower half holds too few set bits.
            offset = half_count <= rank ? mid : offset;
        }
        out[i] = offset;
    }
}

/**
 * nvbench driver for the select kernel on an all-ones bitvector: with every
 * bit set, select(rank) inside a word is simply rank itself, which the host
 * validation reproduces with a sequence taken modulo the word width.
 */
template <typename MaskType>
void select_operation(nvbench::state& state, nvbench::type_list<MaskType>)
{
const auto size = static_cast<std::size_t>(state.get_int64("size"));

constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
const auto block_count = gko::ceildiv(size, block_size);
// All bits set: the k-th set bit of each word is at position k.
thrust::device_vector<MaskType> masks(block_count, ~MaskType{});
thrust::device_vector<int> output(size, 0);
const auto num_threadblocks = gko::ceildiv(size, threadblock_size);

state.add_element_count(size, "Items");
state.add_global_memory_reads<MaskType>(block_count, "Masks");
state.add_global_memory_writes<int>(size, "OutSize");

state.exec([&](nvbench::launch& launch) {
compute_select<<<num_threadblocks, threadblock_size, 0,
launch.get_stream()>>>(
thrust::raw_pointer_cast(masks.data()),
thrust::raw_pointer_cast(output.data()), size);
});
// Host reference: expected result is i % block_size for every i.
// block_size is constexpr, so the capture-less lambda may read it.
auto ref = thrust::host_vector<int>(size);
thrust::sequence(ref.begin(), ref.end(), 0);
thrust::for_each(ref.begin(), ref.end(),
[](int& rank) { rank %= block_size; });
if (ref != output) {
std::cout << "FAIL\n";
// Dump the first mismatching region; output[i] is an element-wise
// device access, acceptable here since it only runs on failure.
for (int i = 0; i < 50; i++) {
std::cout << ref[i] << ' ' << output[i] << '\n';
}
}
}

NVBENCH_BENCH_TYPES(select_operation, NVBENCH_TYPE_AXES(mask_types))
.add_int64_power_of_two_axis("size", sizes);

//

/**
 * Variant of the select kernel for a half-populated bitvector (every second
 * bit set): each mask word holds block_size/2 set bits, so each word answers
 * block_size/2 select queries. The bisection over the word is identical to
 * compute_select.
 * One thread handles one query; threads beyond `size` do nothing.
 */
template <typename MaskType>
__global__ void compute_select_even(const MaskType* __restrict__ masks,
                                    int* __restrict__ out, int size)
{
    const auto i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < size) {
        constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
        int offset = 0;
        const auto mask = masks[i / (block_size / 2)];
        // Use the global index, not threadIdx.x: the original expression only
        // produced the correct rank because the threadblock size (512) happens
        // to be a multiple of every half mask width. Deriving it from i
        // removes that hidden coupling to the launch configuration.
        const auto rank = i % (block_size / 2);
        for (int range_size = block_size; range_size > 1; range_size /= 2) {
            const auto mid = offset + range_size / 2;
            const auto half_count =
                gko::detail::popcount(mask & prefix_mask<MaskType>(mid));
            // Keep the upper half iff the lower half holds too few set bits.
            offset = half_count <= rank ? mid : offset;
        }
        out[i] = offset;
    }
}

/**
 * nvbench driver for the select kernel on a half-populated bitvector: every
 * even bit position is set, so the k-th set bit inside a word sits at
 * position 2 * (k % (block_size / 2)), which the host validation reproduces.
 * Only size / 2 queries exist (one per set bit).
 */
template <typename MaskType>
void select_even_operation(nvbench::state& state, nvbench::type_list<MaskType>)
{
// Allocate input data:
const auto size = static_cast<std::size_t>(state.get_int64("size"));

constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
const auto block_count = gko::ceildiv(size, block_size);
// Build a word with every even bit set (0b...010101).
// NOTE(review): `i < block_size` compares signed int against the unsigned
// constexpr block_size — harmless here, but a std::size_t loop index would
// silence the sign-compare warning.
MaskType mask{};
for (int i = 0; i < block_size; i += 2) {
mask |= MaskType{1} << i;
}
thrust::device_vector<MaskType> masks(block_count, mask);
// One query per set bit, i.e. half the bit count.
thrust::device_vector<int> output(size / 2, 0);
const auto num_threadblocks = gko::ceildiv(size / 2, threadblock_size);

state.add_element_count(size, "Items");
state.add_global_memory_reads<MaskType>(block_count, "Masks");
state.add_global_memory_writes<int>(size / 2, "OutSize");

state.exec([&](nvbench::launch& launch) {
compute_select_even<<<num_threadblocks, threadblock_size, 0,
launch.get_stream()>>>(
thrust::raw_pointer_cast(masks.data()),
thrust::raw_pointer_cast(output.data()), size / 2);
});
// Host reference: select(rank) == 2 * (rank % (block_size / 2)).
// block_size is constexpr, so the capture-less lambda may read it.
auto ref = thrust::host_vector<int>(size / 2);
thrust::sequence(ref.begin(), ref.end(), 0);
thrust::for_each(ref.begin(), ref.end(),
[](int& rank) { rank = (rank % (block_size / 2)) * 2; });
if (ref != output) {
std::cout << "FAIL\n";
// Dump the first mismatching region; only runs on failure.
for (int i = 0; i < 50; i++) {
std::cout << ref[i] << ' ' << output[i] << '\n';
}
}
}

NVBENCH_BENCH_TYPES(select_even_operation, NVBENCH_TYPE_AXES(mask_types))
.add_int64_power_of_two_axis("size", sizes);
Loading
Loading