Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

atomic: Extend support for custom execution policies #447

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/stdgpu/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ main(int argc, char* argv[])
stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host),
stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) -
stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host));
printf("+---------------------------------------------------------+\n");

return EXIT_SUCCESS;
}
48 changes: 48 additions & 0 deletions src/stdgpu/atomic.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,18 @@ public:
STDGPU_HOST_DEVICE T
load(const memory_order order = memory_order_seq_cst) const;

/**
* \brief Atomically loads and returns the current value of the atomic object
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] order The memory order
* \return The current value of this object
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
T
load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;

/**
* \brief Atomically loads and returns the current value of the atomic object
* \return The current value of this object
Expand All @@ -225,6 +237,18 @@ public:
STDGPU_HOST_DEVICE void
store(const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Atomically replaces the current value with desired one
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] desired The value to store to the atomic object
* \param[in] order The memory order
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Atomically replaces the current value with desired one
* \param[in] desired The value to store to the atomic object
Expand Down Expand Up @@ -496,6 +520,18 @@ public:
STDGPU_HOST_DEVICE T
load(const memory_order order = memory_order_seq_cst) const;

/**
* \brief Atomically loads and returns the current value of the atomic object
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] order The memory order
* \return The current value of this object
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
T
load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;

/**
* \brief Loads and returns the current value of the atomic object
* \return The current value of this object
Expand All @@ -512,6 +548,18 @@ public:
STDGPU_HOST_DEVICE void
store(const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Atomically replaces the current value with desired one
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] desired The value to store to the atomic object
* \param[in] order The memory order
*/
template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);

/**
* \brief Replaces the current value with desired
* \param[in] desired The value to store to the atomic object
Expand Down
3 changes: 2 additions & 1 deletion src/stdgpu/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
TYPE HEADERS
BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
FILES impl/atomic_detail.cuh
impl/error.h)
impl/error.h
impl/memory_detail.h)
endif()

target_compile_features(stdgpu PUBLIC cuda_std_17)
Expand Down
79 changes: 79 additions & 0 deletions src/stdgpu/cuda/impl/memory_detail.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright 2024 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef STDGPU_CUDA_MEMORY_DETAIL_H
#define STDGPU_CUDA_MEMORY_DETAIL_H

#include <thrust/detail/execution_policy.h>
#include <thrust/system/cuda/detail/util.h>

#include <stdgpu/cuda/impl/error.h>

namespace stdgpu::cuda
{

template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_impl(ExecutionPolicy&& policy,
void* destination,
const void* source,
index64_t bytes,
cudaMemcpyKind kind,
bool needs_sychronization)
{
cudaStream_t stream = thrust::cuda_cub::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));

STDGPU_CUDA_SAFE_CALL(cudaMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
if (needs_sychronization)
{
STDGPU_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
}
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyDeviceToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyDeviceToHost, true);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyHostToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyHostToHost, true);
}

} // namespace stdgpu::cuda

#endif // STDGPU_CUDA_MEMORY_DETAIL_H
52 changes: 52 additions & 0 deletions src/stdgpu/cuda/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#define STDGPU_CUDA_MEMORY_H

#include <stdgpu/cstddef.h>
#include <stdgpu/execution.h>
#include <stdgpu/type_traits.h>

namespace stdgpu::cuda
{
Expand Down Expand Up @@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
void
memcpy_host_to_host(void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

} // namespace stdgpu::cuda

#include <stdgpu/cuda/impl/memory_detail.h>

#endif // STDGPU_CUDA_MEMORY_H
3 changes: 2 additions & 1 deletion src/stdgpu/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
TYPE HEADERS
BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
FILES impl/atomic_detail.h
impl/error.h)
impl/error.h
impl/memory_detail.h)
endif()

target_compile_features(stdgpu PUBLIC hip_std_17)
Expand Down
80 changes: 80 additions & 0 deletions src/stdgpu/hip/impl/memory_detail.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright 2024 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef STDGPU_HIP_MEMORY_DETAIL_H
#define STDGPU_HIP_MEMORY_DETAIL_H

#include <thrust/detail/execution_policy.h>
#include <thrust/system/hip/detail/util.h>

#include <stdgpu/hip/impl/error.h>

namespace stdgpu::hip
{

template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_impl(ExecutionPolicy&& policy,
void* destination,
const void* source,
index64_t bytes,
hipMemcpyKind kind,
bool needs_sychronization)
{
cudaStream_t stream =
thrust::hip_rocprim::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));

STDGPU_HIP_SAFE_CALL(hipMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
if (needs_sychronization)
{
STDGPU_HIP_SAFE_CALL(hipStreamSynchronize(stream));
}
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyDeviceToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyDeviceToHost, true);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyHostToDevice, false);
}

template <typename ExecutionPolicy,
STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
{
memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyHostToHost, true);
}

} // namespace stdgpu::hip

#endif // STDGPU_HIP_MEMORY_DETAIL_H
52 changes: 52 additions & 0 deletions src/stdgpu/hip/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#define STDGPU_HIP_MEMORY_H

#include <stdgpu/cstddef.h>
#include <stdgpu/execution.h>
#include <stdgpu/type_traits.h>

namespace stdgpu::hip
{
Expand Down Expand Up @@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
void
memcpy_host_to_host(void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from device to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to device
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

/**
* \brief Performs platform-specific memory copy from host to host
* \tparam ExecutionPolicy The type of the execution policy
* \param[in] policy The execution policy
* \param[in] destination The destination array
* \param[in] source The source array
* \param[in] bytes The size of the allocated array
*/
template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
void
memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);

} // namespace stdgpu::hip

#include <stdgpu/hip/impl/memory_detail.h>

#endif // STDGPU_HIP_MEMORY_H
Loading