Skip to content

Commit

Permalink
Catch2 Benchmarking
Browse files Browse the repository at this point in the history
  • Loading branch information
sliwowitz committed May 18, 2022
1 parent dfc286a commit d23e2cb
Show file tree
Hide file tree
Showing 10 changed files with 276 additions and 8 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)

option(alpaka_BUILD_EXAMPLES "Build the examples" OFF)

option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF)

option(BUILD_TESTING "Build the testing tree." OFF)

option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)
Expand Down Expand Up @@ -138,7 +140,8 @@ endif()
if(alpaka_BUILD_EXAMPLES)
add_subdirectory("example/")
endif()
if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
enable_testing()
add_subdirectory("test/")
endif()

Expand Down
4 changes: 2 additions & 2 deletions cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ if(alpaka_ACC_SYCL_ENABLE)

# Enable device-side printing to stdout
cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF)
if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE)
endif()

Expand Down Expand Up @@ -855,7 +855,7 @@ if(TARGET alpaka)

# the alpaka library itself
# SYSTEM suppresses warnings produced by alpaka headers when alpaka is used in user applications.
if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
else()
target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
Expand Down
120 changes: 120 additions & 0 deletions include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#pragma once

#include <alpaka/alpaka.hpp>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#endif

#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#endif

#include <alpaka/test/Check.hpp>
#include <alpaka/test/queue/Queue.hpp>

#include <catch2/catch.hpp>

#include <string>
#include <utility>

namespace alpaka::test
{
    //! The fixture for benchmarking the execution of a kernel on a given accelerator.
    //!
    //! The fixture owns a host device, an accelerator device, a queue on the accelerator and the work
    //! division used for every kernel launch. Calling it runs the kernel inside a Catch2 benchmark.
    template<typename TAcc>
    class KernelExecutionBenchmarkFixture
    {
    public:
        using Acc = TAcc;
        using Dim = alpaka::Dim<Acc>;
        using Idx = alpaka::Idx<Acc>;
        using DevAcc = Dev<Acc>;
        using PltfAcc = Pltf<DevAcc>;
        using QueueAcc = test::DefaultQueue<DevAcc>;
        using WorkDiv = WorkDivMembers<Dim, Idx>;

        //! Creates the fixture for an explicitly given work division.
        //! Device index 0 is used for both the host platform and the accelerator platform.
        KernelExecutionBenchmarkFixture(WorkDiv workDiv)
            : m_devHost(getDevByIdx<PltfCpu>(0u))
            , m_devAcc(getDevByIdx<PltfAcc>(0u))
            , m_queue(m_devAcc)
            , m_workDiv(std::move(workDiv))
        {
        }

        //! Creates the fixture for an extent. The work division is obtained from getValidWorkDiv for
        //! accelerator device 0, called with Vec::ones() and an unrestricted grid/block subdivision.
        template<typename TExtent>
        KernelExecutionBenchmarkFixture(TExtent const& extent)
            : KernelExecutionBenchmarkFixture(getValidWorkDiv<Acc>(
                getDevByIdx<PltfAcc>(0u),
                extent,
                Vec<Dim, Idx>::ones(),
                false,
                GridBlockExtentSubDivRestrictions::Unrestricted))
        {
        }

        //! Runs the kernel as a Catch2 benchmark.
        //!
        //! \param kernelFnObj The kernel function object. It is launched with a pointer to a single float
        //!     in accelerator memory as its first argument, followed by args.
        //! \param benchmarkName The name under which Catch2 reports the benchmark.
        //! \param result Output. Receives the mean of the values read back from the accelerator after
        //!     each run of the benchmark block (estimation runs included). It is 0 if the block never ran.
        //! \param args Additional kernel arguments. They are handed to every launch as lvalues.
        //! \return Always true.
        template<typename TKernelFnObj, typename... TArgs>
        auto operator()(
            TKernelFnObj const& kernelFnObj,
            std::string const& benchmarkName,
            float& result,
            TArgs&&... args) -> bool
        {
            // Allocate one-element result buffers on the accelerator and on the host.
            auto bufAccResult = allocBuf<float, Idx>(m_devAcc, static_cast<Idx>(1u));
            auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));

            int numRuns = 0;
            result = 0.0f;

            // The following block is executed an unknown number of times during the estimation phase and
            // then once per benchmark sample.
            // NOTE(review): a temporary copy of the name is passed, presumably because Catch2 takes the
            // benchmark name as an rvalue string - confirm before simplifying.
            BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
            {
                numRuns++;
                // Reset the accelerator-side result so that every run starts from zero.
                memset(m_queue, bufAccResult, 0);
                wait(m_queue);

                // Only the following part is measured as the benchmark part.
                meter.measure(
                    [&]
                    {
                        // The arguments are deliberately NOT std::forward-ed: this lambda is invoked many
                        // times and forwarding an rvalue would leave it moved-from after the first launch.
                        exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), args...);
                        wait(m_queue); // wait for the kernel to actually run
                    });

                // Copy the result value to the host.
                memcpy(m_queue, bufHostResult, bufAccResult);
                wait(m_queue);

                auto const resultLocal = *getPtrNative(bufHostResult);
                result += resultLocal;
                return resultLocal; // make sure the benchmark call is not optimized away
            };

            // Average over all runs; skip the division if the benchmark block was never executed.
            if(numRuns > 0)
            {
                result /= static_cast<float>(numRuns);
            }

            return true;
            // TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are
            // the returns limited to bools?
        }

    protected:
        DevCpu m_devHost; //!< Host device used for the host-side result buffer.
        DevAcc m_devAcc; //!< Accelerator device the kernel runs on.
        QueueAcc m_queue; //!< Queue on m_devAcc used for all launches and copies.
        WorkDiv m_workDiv; //!< Work division used for every kernel launch.
    };
} // namespace alpaka::test
2 changes: 1 addition & 1 deletion script/run_generate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ mkdir -p build/
cd build/

"${ALPAKA_CI_CMAKE_EXECUTABLE}" --log-level=VERBOSE -G "${ALPAKA_CI_CMAKE_GENERATOR}" ${ALPAKA_CI_CMAKE_GENERATOR_PLATFORM}\
-Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON \
-Dalpaka_BUILD_EXAMPLES=ON -Dalpaka_BUILD_BENCHMARK=ON -DBUILD_TESTING=ON \
"$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF -DBoost_ARCHITECTURE="-x64" \
"$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_C_COMPILER)" "$(env2cmake CMAKE_CXX_COMPILER)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" "$(env2cmake CMAKE_CXX_EXTENSIONS)"\
"$(env2cmake alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)" \
Expand Down
13 changes: 9 additions & 4 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
# Copyright 2015-2022 Benjamin Worpitz, Axel Huebl, Jan Stephan, Jiri Vyskocil
#
# This file is part of alpaka.
#
Expand All @@ -21,6 +21,11 @@ add_subdirectory(common)

list(APPEND _alpaka_TEST_OPTIONS --use-colour yes)

add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
if(BUILD_TESTING)
add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
endif()
if(alpaka_BUILD_BENCHMARK)
add_subdirectory(benchmark)
endif()
2 changes: 2 additions & 0 deletions test/analysis/headerCheck/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ if(alpaka_CHECK_HEADERS)
PRIVATE common)

set_target_properties(headerCheckTest PROPERTIES FOLDER "test/analysis")
# Catch2 benchmark macros must be defined, otherwise the benchmarking headers will not pass the check.
target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})

Expand Down
13 changes: 13 additions & 0 deletions test/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# Minimum CMake version declared for the benchmark directory.
cmake_minimum_required(VERSION 3.18)

# Add the build rules found in the rand/ subdirectory.
add_subdirectory("rand/")
33 changes: 33 additions & 0 deletions test/benchmark/rand/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

set(_TARGET_NAME "randBenchmark")

# Collect all *.cpp files below src/ and mirror them into the "src/" source group.
append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)

alpaka_add_executable(
    ${_TARGET_NAME}
    ${_FILES_SOURCE})
target_link_libraries(
    ${_TARGET_NAME}
    PRIVATE common)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
# The Catch2 benchmark macros are only available when this definition is set.
# PRIVATE is sufficient: the target is an executable and has no consumers.
target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)

# Extra command line options for the benchmark executable.
set(_BENCHMARK_OPTIONS "")
if(alpaka_CI)
    # Non-benchmarking CI test runs: take a single sample, only to see whether the benchmark works at all.
    # Real automated benchmark runs will need to collect more samples (the default 100 is fine). The CI will
    # then have to set another variable to indicate whether it is only testing or wants a full benchmark.
    list(APPEND _BENCHMARK_OPTIONS --benchmark-samples 1)
endif()

# Without extra options a full benchmark run collects 100 samples for good benchmark statistics.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS} ${_BENCHMARK_OPTIONS})
88 changes: 88 additions & 0 deletions test/benchmark/rand/src/randBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/* Copyright 2022 Jiri Vyskocil
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <alpaka/rand/Traits.hpp>
#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
#include <alpaka/test/acc/TestAccs.hpp>

#include <catch2/catch.hpp>

#include <cstddef>
#include <iostream>
#include <string>
#include <thread>

//! Kernel that draws uniformly distributed floats and atomically adds each thread's partial sum to *result.
class RandBenchmarkKernel
{
public:
    //! \param acc The accelerator the kernel runs on.
    //! \param result Pointer to the float that accumulates the partial sums of all threads.
    //! \param numPoints Upper bound of the index range that the threads iterate over together.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TAcc, typename TIdx>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
    {
        // Index of this thread within the grid and the extent of the grid in threads.
        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

        // Linearized thread index: the first loop index of this thread and its random subsequence number.
        auto const firstPoint = static_cast<TIdx>(alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0]);
        // Every thread advances by the total number of threads in the grid.
        auto const stride = static_cast<TIdx>(gridThreadExtent.prod());

        // Set up the generator engine (seed 42) and the distribution.
        auto engine = alpaka::rand::engine::createDefault(acc, 42, firstPoint);
        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);

        float partialSum = 0;
        for(TIdx point = firstPoint; point < numPoints; point += stride)
        {
            partialSum += dist(engine);
        }

        // TODO: we're measuring the atomicAdd time too, this is not what we want
        alpaka::atomicAdd(acc, result, partialSum);
    }
};

// TODO: This takes an enormous time to finish and is probably useless anyway:
// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
// Running the benchmark on a single default accelerator instead
TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
{
    // using Acc = TestType;
    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;
    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);

    // std::thread::hardware_concurrency() returns 0 when the value is not computable. Fall back to a
    // single thread in that case so that the extents requested below never become zero.
    auto const hardwareThreads = std::thread::hardware_concurrency();
    const Idx numThreads = hardwareThreads > 0u ? hardwareThreads : 1u; // TODO: GPU?
    std::cout << "Hardware threads: " << numThreads << std::endl;

    // Catch2 re-runs the test case once for every generated problem size.
#ifdef ALPAKA_CI // Reduced benchmark set for automated test runs.
    const unsigned numPoints = GENERATE(10u, 1'000'000u);
#else
    const unsigned numPoints = GENERATE(10u, 100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);
#endif

    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
        devAcc,
        Vec::all(numThreads * numThreads),
        Vec::all(numThreads),
        false,
        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};

    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workdiv);

    RandBenchmarkKernel kernel;

    // Filled by the fixture with the kernel result averaged over all benchmark runs.
    float result = 0.0f;

    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
    // TODO: Actually check the result
    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints)
              << " should probably converge to 0.5." << std::flush;
}
4 changes: 4 additions & 0 deletions test/catch_main/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ set_target_properties(CatchMain PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS ON
)

if(alpaka_BUILD_BENCHMARK)
target_compile_definitions(CatchMain PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)
endif()

target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
# Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2
Expand Down

0 comments on commit d23e2cb

Please sign in to comment.