Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Catch2 Benchmarking #1723

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,15 @@ option(alpaka_BUILD_BENCHMARKS "Build the benchmarks" OFF)
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
option(alpaka_ENABLE_WERROR "Treat all warnings as errors." OFF)
option(BUILD_TESTING "Build the testing tree." OFF)
option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF)
include(CTest)
endif()

option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)

include(CMakeDependentOption)
cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF)
cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF)
cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARK" OFF)

################################################################################
# Internal variables.
Expand Down Expand Up @@ -154,7 +155,7 @@ if(alpaka_BUILD_BENCHMARKS)
endif()

# Only build the tests if alpaka is the top-level project and BUILD_TESTING is ON
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND BUILD_TESTING)
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND (BUILD_TESTING OR alpaka_BUILD_BENCHMARK))
add_subdirectory("test/")
endif()

Expand Down
12 changes: 6 additions & 6 deletions cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ else()
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>,$<COMPILE_LANGUAGE:CUDA>>:SHELL:-Xcompiler -Og>"
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>"
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:MSVC>>:SHELL:/Od>")

target_link_options(alpaka INTERFACE "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>>:SHELL:-Og>"
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>")
endif()
Expand Down Expand Up @@ -358,7 +358,7 @@ endif()
if(alpaka_ACC_GPU_CUDA_ENABLE)
# Save the user-defined host compiler (if any)
set(_alpaka_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER})

check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
Expand Down Expand Up @@ -619,9 +619,9 @@ if(alpaka_ACC_SYCL_ENABLE)
list(JOIN alpaka_SYCL_TARGETS "," alpaka_SYCL_TARGETS_CONCAT)
alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")
target_link_options(alpaka INTERFACE "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")

#-----------------------------------------------------------------------------------------------------------------
# Determine actual hardware to compile for
# Determine actual hardware to compile for
if(alpaka_SYCL_ONEAPI_CPU)
set(alpaka_SYCL_ONEAPI_CPU_ISA "avx2" CACHE STRING "Intel ISA to compile for")
set_property(CACHE alpaka_SYCL_ONEAPI_CPU_ISA PROPERTY STRINGS "sse4.2;avx;avx2;avx512")
Expand Down Expand Up @@ -663,7 +663,7 @@ if(alpaka_ACC_SYCL_ENABLE)
PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw")
# If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler.
string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}")

target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU")
endif()

Expand Down Expand Up @@ -781,7 +781,7 @@ if(TARGET alpaka)

# the alpaka library itself
# SYSTEM avoids showing warnings produced by alpaka when used in user applications.
if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
else()
target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
Expand Down
118 changes: 118 additions & 0 deletions include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#pragma once

#include <alpaka/alpaka.hpp>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#endif

#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#endif
Comment on lines +14 to +20
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dislike those. Can't we just have a prelude in alpaka.hpp after BoostPredef that checks those in one place?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As long as it takes ALPAKA_HOST_ONLY into account.


#include <alpaka/test/Check.hpp>
#include <alpaka/test/queue/Queue.hpp>

#include <catch2/benchmark/catch_benchmark.hpp>

#include <string>
#include <utility>

namespace alpaka::test
{
//! The fixture for benchmarking the execution of a kernel on a given accelerator.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
//! The fixture for executing a kernel on a given accelerator.
//! The fixture for benchmarking the execution of a kernel on a given accelerator.

template<typename TAcc>
class KernelExecutionBenchmarkFixture
{
public:
using Acc = TAcc;
using Dim = alpaka::Dim<Acc>;
using Idx = alpaka::Idx<Acc>;
using Device = Dev<Acc>;
using Platform = alpaka::Platform<Acc>;
using Queue = test::DefaultQueue<Device>;
using WorkDiv = WorkDivMembers<Dim, Idx>;

KernelExecutionBenchmarkFixture(WorkDiv workDiv) : m_workDiv(std::move(workDiv))
{
}

template<typename TExtent>
KernelExecutionBenchmarkFixture(TExtent const& extent)
: KernelExecutionBenchmarkFixture(getValidWorkDiv<Acc>(
getDevByIdx<Acc>(0u),
extent,
Vec<Dim, Idx>::ones(),
false,
GridBlockExtentSubDivRestrictions::Unrestricted))
{
}

template<typename TKernelFnObj, typename... TArgs>
auto operator()(
TKernelFnObj const& kernelFnObj,
std::string const& benchmarkName,
float& result,
TArgs&&... args) -> bool
{
// Allocate result buffers
auto bufAccResult = allocBuf<float, Idx>(m_device, static_cast<Idx>(1u));
auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));

int numRuns = 0;
result = 0.0f;

// The following block is executed unknown times during estimation phase, then once per benchmark sample
BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
{
numRuns++;
memset(m_queue, bufAccResult, 0);
wait(m_queue);

// Only the following part is measured as the benchmark part
meter.measure(
[&]
{
exec<Acc>(
m_queue,
m_workDiv,
kernelFnObj,
getPtrNative(bufAccResult),
std::forward<TArgs>(args)...); // run the measured kernel
wait(m_queue); // wait for the kernel to actually run
});

// Copy the result value to the host
memcpy(m_queue, bufHostResult, bufAccResult);
wait(m_queue);

auto const resultLocal = *getPtrNative(bufHostResult);
result += resultLocal;
return resultLocal; // make sure the benchmark call is not optimized away
};
result /= static_cast<float>(numRuns);

return true;
// TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the
// returns limited to bools?
// return result;
}

protected:
PlatformCpu m_platformHost{};
DevCpu m_devHost{getDevByIdx(m_platformHost, 0)};
Platform m_platform{};
Device m_device{getDevByIdx(m_platform, 0)};
Queue m_queue{m_device};
WorkDiv m_workDiv;
};
} // namespace alpaka::test
11 changes: 8 additions & 3 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ add_subdirectory(common)

list(APPEND _alpaka_TEST_OPTIONS --colour-mode default)

add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
if(BUILD_TESTING)
add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
endif()
if(alpaka_BUILD_BENCHMARK)
add_subdirectory(benchmark)
endif()
2 changes: 2 additions & 0 deletions test/analysis/headerCheck/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ if(alpaka_CHECK_HEADERS)
PRIVATE common)

set_target_properties(headerCheckTest PROPERTIES FOLDER "test/analysis")
# Catch2 benchmark macros must be defined, otherwise the benchmarking headers will not pass the check.
target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})

Expand Down
13 changes: 13 additions & 0 deletions test/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# Minimum CMake version required to process the benchmark tree.
cmake_minimum_required(VERSION 3.18)

# Add the benchmark subdirectories; "rand/" is the only one so far.
add_subdirectory("rand/")
33 changes: 33 additions & 0 deletions test/benchmark/rand/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

set(_TARGET_NAME "randBenchmark")

# Gather the benchmark sources below src/ into _FILES_SOURCE.
# NOTE(review): the helper is defined elsewhere in alpaka's CMake code; presumably it also creates the IDE source groups.
append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)

alpaka_add_executable(
    ${_TARGET_NAME}
    ${_FILES_SOURCE})
target_link_libraries(
    ${_TARGET_NAME}
    PRIVATE common)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
# Nothing links against an executable, so the definition only has to apply to the target's own sources.
target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)

# Command line options handed to the benchmark executable when it is run through CTest.
set(_alpaka_BENCHMARK_OPTIONS ${_alpaka_TEST_OPTIONS})
if(alpaka_CI)
    # For non-benchmarking CI test runs: run the benchmark only once to see if it works at all.
    # Real automated benchmark runs will need to collect more samples (the default 100 is fine). The CI will then
    # have to set another variable to indicate if it is only testing, or if it wants to do a full benchmark.
    list(APPEND _alpaka_BENCHMARK_OPTIONS --benchmark-samples 1)
endif()

# Without the CI restriction a full benchmark run collects 100 samples for good benchmark statistics.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_BENCHMARK_OPTIONS})
91 changes: 91 additions & 0 deletions test/benchmark/rand/src/randBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/* Copyright 2022 Jiri Vyskocil
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <alpaka/rand/Traits.hpp>
#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
#include <alpaka/test/acc/TestAccs.hpp>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <thread>

//! Kernel that draws uniformly distributed floats and adds their sum onto a single result value.
class RandBenchmarkKernel
{
public:
    //! Every thread draws the points `i = own linear index, own linear index + grid size, ...` below
    //! \p numPoints, so all threads together draw \p numPoints values.
    //!
    //! \param acc The accelerator the kernel is executed on.
    //! \param result Pointer to the accumulator; each thread atomically adds its partial sum to it.
    //! \param numPoints The total number of random values to draw.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TAcc, typename TIdx>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
    {
        // Position of this thread within the grid and the size of the grid, both counted in threads.
        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

        // Linearized thread index: the first point of this thread. The stride is the total thread count.
        auto const firstPoint = static_cast<TIdx>(alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0]);
        auto const stride = static_cast<TIdx>(gridThreadExtent.prod());

        // Set up the generator engine (fixed seed 42, the linear thread index as third argument) and the
        // distribution.
        auto engine = alpaka::rand::engine::createDefault(acc, 42, firstPoint);
        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);

        float partialSum = 0;
        TIdx point = firstPoint;
        while(point < numPoints)
        {
            partialSum += dist(engine);
            point += stride;
        }

        // TODO: we're measuring the atomicAdd time too, this is not what we want
        alpaka::atomicAdd(acc, result, partialSum);
    }
};

// TODO: This takes an enormous amount of time to finish and is probably useless anyway:
// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
// Running the benchmark on a single default accelerator instead
// Benchmarks the default random number generator for several sequence lengths on one default accelerator.
TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
{
    // using Acc = TestType;
    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;
    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    auto const platform = alpaka::Platform<Acc>{};
    auto const dev = alpaka::getDevByIdx(platform, 0);

    // hardware_concurrency() may return 0 when the value cannot be determined. Clamp it to 1 so that the
    // extents requested below never become zero.
    Idx const numThreads = std::max<Idx>(1u, std::thread::hardware_concurrency()); // TODO: GPU?
    std::cout << "Hardware threads: " << numThreads << std::endl;

#ifdef ALPAKA_CI // Reduced benchmark set for automated test runs.
    unsigned const numPoints = GENERATE(10u, 1'000'000u);
#else
    unsigned const numPoints = GENERATE(10u, 100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);
#endif

    // Request numThreads * numThreads threads in the grid and numThreads elements per thread.
    WorkDiv const workDiv = alpaka::getValidWorkDiv<Acc>(
        dev,
        Vec::all(numThreads * numThreads),
        Vec::all(numThreads),
        false,
        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workDiv);

    RandBenchmarkKernel kernel;

    // Receives the value the kernel produced, averaged by the fixture over all executions of the benchmark.
    float result = 0.0f;

    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
    // TODO: Actually check the result
    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints)
              << " should probably converge to 0.5." << std::flush;
}
2 changes: 1 addition & 1 deletion thirdParty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MPL-2.0
#

if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
if(alpaka_USE_INTERNAL_CATCH2)
message(STATUS "Catch2: Using INTERNAL version 3.3.2")
# Force Catch2's CMake to pick up the variables we set below
Expand Down
Loading