atomic: Extend support for custom execution policies
stotko committed Nov 19, 2024
1 parent f51a985 commit ae8acbb
Showing 15 changed files with 569 additions and 5 deletions.
1 change: 1 addition & 0 deletions benchmarks/stdgpu/main.cpp
@@ -65,6 +65,7 @@ main(int argc, char* argv[])
stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host),
stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) -
stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host));
printf("+---------------------------------------------------------+\n");

return EXIT_SUCCESS;
}
48 changes: 48 additions & 0 deletions src/stdgpu/atomic.cuh
@@ -210,6 +210,18 @@ public:
STDGPU_HOST_DEVICE T
load(const memory_order order = memory_order_seq_cst) const;

/**
* \brief Atomically loads and returns the current value of the atomic object
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] order The memory order
* \return The current value of this object
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
T
load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;

/**
* \brief Atomically loads and returns the current value of the atomic object
* \return The current value of this object
@@ -225,6 +237,18 @@ public:
STDGPU_HOST_DEVICE void
store(const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Atomically replaces the current value with the desired one
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] desired The value to store to the atomic object
* \param[in] order The memory order
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Atomically replaces the current value with the desired one
* \param[in] desired The value to store to the atomic object
@@ -496,6 +520,18 @@ public:
STDGPU_HOST_DEVICE T
load(const memory_order order = memory_order_seq_cst) const;

/**
* \brief Atomically loads and returns the current value of the atomic object
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] order The memory order
* \return The current value of this object
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
T
load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;

/**
* \brief Loads and returns the current value of the atomic object
* \return The current value of this object
@@ -512,6 +548,18 @@ public:
STDGPU_HOST_DEVICE void
store(const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Atomically replaces the current value with the desired one
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] desired The value to store to the atomic object
* \param[in] order The memory order
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Replaces the current value with desired
* \param[in] desired The value to store to the atomic object
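For orientation, here is a minimal caller-side sketch of the new atomic overloads (not part of this commit). It assumes the CUDA backend and a stream-bound Thrust policy created with thrust::cuda::par.on(stream); the function name update_counter is hypothetical.

// Hedged sketch: host-side load/store through a custom execution policy.
#include <stdgpu/atomic.cuh>
#include <thrust/execution_policy.h>

void update_counter(stdgpu::atomic<int> counter, cudaStream_t stream)
{
    auto policy = thrust::cuda::par.on(stream); // custom policy bound to a user-managed stream

    counter.store(policy, 0);         // store issued via the given policy
    int value = counter.load(policy); // load issued via the given policy
    (void)value;
}

Before this commit, these host-side accesses always ran on the default policy; the overloads let a caller route them through a policy of their choice.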
3 changes: 2 additions & 1 deletion src/stdgpu/cuda/CMakeLists.txt
@@ -22,7 +22,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
TYPE HEADERS
BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
FILES impl/atomic_detail.cuh
impl/error.h)
impl/error.h
impl/memory_detail.h)
endif()

target_compile_features(stdgpu PUBLIC cuda_std_17)
79 changes: 79 additions & 0 deletions src/stdgpu/cuda/impl/memory_detail.h
@@ -0,0 +1,79 @@
/*
* Copyright 2024 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef STDGPU_CUDA_MEMORY_DETAIL_H
#define STDGPU_CUDA_MEMORY_DETAIL_H

#include <thrust/detail/execution_policy.h>
#include <thrust/system/cuda/detail/util.h>

#include <stdgpu/cuda/impl/error.h>

namespace stdgpu::cuda
{

template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_impl(ExecutionPolicy&& policy,
void* destination,
const void* source,
index64_t bytes,
cudaMemcpyKind kind,
bool needs_synchronization)
{
cudaStream_t stream = thrust::cuda_cub::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));

STDGPU_CUDA_SAFE_CALL(cudaMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
if (needs_synchronization)
{
STDGPU_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
}
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyDeviceToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyDeviceToHost, true);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyHostToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyHostToHost, true);
}

} // namespace stdgpu::cuda

#endif // STDGPU_CUDA_MEMORY_DETAIL_H
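memcpy_impl above pulls the CUDA stream out of the execution policy and issues cudaMemcpyAsync on it, synchronizing that stream only when needs_synchronization is set (the device-to-host and host-to-host paths). A minimal sketch of that round trip, assuming a stream-bound Thrust policy; Thrust's detail namespaces may change between versions, so this is illustration only and not part of the commit.

// Hedged sketch: a policy built with thrust::cuda::par.on(my_stream) carries my_stream,
// which is recovered the same way memcpy_impl does it.
#include <thrust/detail/execution_policy.h>
#include <thrust/execution_policy.h>
#include <thrust/system/cuda/detail/util.h>

#include <cassert>

void check_stream_roundtrip(cudaStream_t my_stream)
{
    auto policy = thrust::cuda::par.on(my_stream);

    cudaStream_t extracted =
        thrust::cuda_cub::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));

    assert(extracted == my_stream); // asynchronous copies issued through this policy run on my_stream
}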
52 changes: 52 additions & 0 deletions src/stdgpu/cuda/memory.h
@@ -17,6 +17,8 @@
#define STDGPU_CUDA_MEMORY_H

#include <stdgpu/cstddef.h>
#include <stdgpu/execution.h>
#include <stdgpu/type_traits.h>

namespace stdgpu::cuda
{
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
void
memcpy_host_to_host(void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

} // namespace stdgpu::cuda

#include <stdgpu/cuda/impl/memory_detail.h>

#endif // STDGPU_CUDA_MEMORY_H
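A hedged caller sketch for these declarations (not part of this commit): the memcpy functions are platform-level helpers that stdgpu normally invokes internally, and the policy type is an assumption. The device-to-host overload synchronizes the policy's stream before returning, so the host buffer is ready immediately; the function name download is hypothetical.

// Hedged sketch: device-to-host copy driven by a stream-bound policy.
#include <stdgpu/cuda/memory.h>
#include <thrust/execution_policy.h>

void download(const int* d_data, int* h_data, stdgpu::index64_t bytes, cudaStream_t stream)
{
    auto policy = thrust::cuda::par.on(stream);

    // Synchronizes the stream internally (needs_synchronization is true for this direction).
    stdgpu::cuda::memcpy_device_to_host(policy, h_data, d_data, bytes);
}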
3 changes: 2 additions & 1 deletion src/stdgpu/hip/CMakeLists.txt
@@ -21,7 +21,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
TYPE HEADERS
BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
FILES impl/atomic_detail.h
impl/error.h)
impl/error.h
impl/memory_detail.h)
endif()

target_compile_features(stdgpu PUBLIC hip_std_17)
80 changes: 80 additions & 0 deletions src/stdgpu/hip/impl/memory_detail.h
@@ -0,0 +1,80 @@
/*
* Copyright 2024 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef STDGPU_HIP_MEMORY_DETAIL_H
#define STDGPU_HIP_MEMORY_DETAIL_H

#include <thrust/detail/execution_policy.h>
#include <thrust/system/hip/detail/util.h>

#include <stdgpu/hip/impl/error.h>

namespace stdgpu::hip
{

template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_impl(ExecutionPolicy&& policy,
void* destination,
const void* source,
index64_t bytes,
hipMemcpyKind kind,
bool needs_synchronization)
{
hipStream_t stream =
thrust::hip_rocprim::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));

STDGPU_HIP_SAFE_CALL(hipMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
if (needs_synchronization)
{
STDGPU_HIP_SAFE_CALL(hipStreamSynchronize(stream));
}
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyDeviceToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyDeviceToHost, true);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyHostToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyHostToHost, true);
}

} // namespace stdgpu::hip

#endif // STDGPU_HIP_MEMORY_DETAIL_H
52 changes: 52 additions & 0 deletions src/stdgpu/hip/memory.h
@@ -17,6 +17,8 @@
#define STDGPU_HIP_MEMORY_H

#include <stdgpu/cstddef.h>
#include <stdgpu/execution.h>
#include <stdgpu/type_traits.h>

namespace stdgpu::hip
{
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
void
memcpy_host_to_host(void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

} // namespace stdgpu::hip

#include <stdgpu/hip/impl/memory_detail.h>

#endif // STDGPU_HIP_MEMORY_H