From ae8acbb68647cdc00fe58049a824c3dbb339d582 Mon Sep 17 00:00:00 2001
From: Patrick Stotko <stotko@cs.uni-bonn.de>
Date: Tue, 19 Nov 2024 13:17:16 +0100
Subject: [PATCH] atomic: Extend support for custom execution policies

---
 benchmarks/stdgpu/main.cpp             |  1 +
 src/stdgpu/atomic.cuh                  | 48 ++++++++++++++++
 src/stdgpu/cuda/CMakeLists.txt         |  3 +-
 src/stdgpu/cuda/impl/memory_detail.h   | 79 +++++++++++++++++++++++++
 src/stdgpu/cuda/memory.h               | 52 +++++++++++++++++
 src/stdgpu/hip/CMakeLists.txt          |  3 +-
 src/stdgpu/hip/impl/memory_detail.h    | 80 ++++++++++++++++++++++++++
 src/stdgpu/hip/memory.h                | 52 +++++++++++++++++
 src/stdgpu/impl/atomic_detail.cuh      | 67 ++++++++++++++++++++-
 src/stdgpu/impl/memory_detail.h        | 46 +++++++++++++++
 src/stdgpu/openmp/CMakeLists.txt       |  3 +-
 src/stdgpu/openmp/impl/memory_detail.h | 61 ++++++++++++++++++++
 src/stdgpu/openmp/memory.h             | 52 +++++++++++++++++
 tests/stdgpu/atomic.inc                | 26 +++++++++
 tests/stdgpu/main.cpp                  |  1 +
 15 files changed, 569 insertions(+), 5 deletions(-)
 create mode 100644 src/stdgpu/cuda/impl/memory_detail.h
 create mode 100644 src/stdgpu/hip/impl/memory_detail.h
 create mode 100644 src/stdgpu/openmp/impl/memory_detail.h
diff --git a/benchmarks/stdgpu/main.cpp b/benchmarks/stdgpu/main.cpp
index 7d913072f..9d81e50a2 100644
--- a/benchmarks/stdgpu/main.cpp
+++ b/benchmarks/stdgpu/main.cpp
@@ -65,6 +65,7 @@ main(int argc, char* argv[])
            stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host),
            stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) -
                    stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host));
+    printf("+---------------------------------------------------------+\n");
 
     return EXIT_SUCCESS;
 }
diff --git a/src/stdgpu/atomic.cuh b/src/stdgpu/atomic.cuh
index 36c5f4131..ed594dd50 100644
--- a/src/stdgpu/atomic.cuh
+++ b/src/stdgpu/atomic.cuh
@@ -210,6 +210,18 @@ public:
     STDGPU_HOST_DEVICE T
     load(const memory_order order = memory_order_seq_cst) const;
 
+    /**
+     * \brief Atomically loads and returns the current value of the atomic object
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] order The memory order
+     * \return The current value of this object
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    T
+    load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;
+
     /**
      * \brief Atomically loads and returns the current value of the atomic object
      * \return The current value of this object
@@ -225,6 +237,18 @@ public:
     STDGPU_HOST_DEVICE void
     store(const T desired, const memory_order order = memory_order_seq_cst);
 
+    /**
+     * \brief Atomically replaces the current value with desired one
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] desired The value to store to the atomic object
+     * \param[in] order The memory order
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    void
+    store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);
+
     /**
      * \brief Atomically replaces the current value with desired one
      * \param[in] desired The value to store to the atomic object
@@ -496,6 +520,18 @@ public:
     STDGPU_HOST_DEVICE T
     load(const memory_order order = memory_order_seq_cst) const;
 
+    /**
+     * \brief Atomically loads and returns the current value of the atomic object
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] order The memory order
+     * \return The current value of this object
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    T
+    load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;
+
     /**
      * \brief Loads and returns the current value of the atomic object
      * \return The current value of this object
@@ -512,6 +548,18 @@ public:
     STDGPU_HOST_DEVICE void
     store(const T desired, const memory_order order = memory_order_seq_cst);
 
+    /**
+     * \brief Atomically replaces the current value with desired one
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] desired The value to store to the atomic object
+     * \param[in] order The memory order
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    void
+    store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);
+
     /**
      * \brief Replaces the current value with desired
      * \param[in] desired The value to store to the atomic object
diff --git a/src/stdgpu/cuda/CMakeLists.txt b/src/stdgpu/cuda/CMakeLists.txt
index 781d420b0..27889cf58 100644
--- a/src/stdgpu/cuda/CMakeLists.txt
+++ b/src/stdgpu/cuda/CMakeLists.txt
@@ -22,7 +22,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
                                  TYPE HEADERS
                                  BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
                                  FILES impl/atomic_detail.cuh
-                                       impl/error.h)
+                                       impl/error.h
+                                       impl/memory_detail.h)
 endif()
 
 target_compile_features(stdgpu PUBLIC cuda_std_17)
diff --git a/src/stdgpu/cuda/impl/memory_detail.h b/src/stdgpu/cuda/impl/memory_detail.h
new file mode 100644
index 000000000..8b9b6035f
--- /dev/null
+++ b/src/stdgpu/cuda/impl/memory_detail.h
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2024 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef STDGPU_CUDA_MEMORY_DETAIL_H
+#define STDGPU_CUDA_MEMORY_DETAIL_H
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <stdgpu/cuda/impl/error.h>
+
+namespace stdgpu::cuda
+{
+
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_impl(ExecutionPolicy&& policy,
+            void* destination,
+            const void* source,
+            index64_t bytes,
+            cudaMemcpyKind kind,
+            bool needs_sychronization)
+{
+    cudaStream_t stream = thrust::cuda_cub::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));
+
+    STDGPU_CUDA_SAFE_CALL(cudaMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
+    if (needs_sychronization)
+    {
+        STDGPU_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+    }
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyDeviceToDevice, false);
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyDeviceToHost, true);
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyHostToDevice, false);
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, cudaMemcpyHostToHost, true);
+}
+
+} // namespace stdgpu::cuda
+
+#endif // STDGPU_CUDA_MEMORY_DETAIL_H
diff --git a/src/stdgpu/cuda/memory.h b/src/stdgpu/cuda/memory.h
index 7c6af7a17..71805cccb 100644
--- a/src/stdgpu/cuda/memory.h
+++ b/src/stdgpu/cuda/memory.h
@@ -17,6 +17,8 @@
 #define STDGPU_CUDA_MEMORY_H
 
 #include <stdgpu/cstddef.h>
+#include <stdgpu/execution.h>
+#include <stdgpu/type_traits.h>
 
 namespace stdgpu::cuda
 {
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
 void
 memcpy_host_to_host(void* destination, const void* source, index64_t bytes);
 
+/**
+ * \brief Performs platform-specific memory copy from device to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from device to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
 } // namespace stdgpu::cuda
 
+#include <stdgpu/cuda/impl/memory_detail.h>
+
 #endif // STDGPU_CUDA_MEMORY_H
diff --git a/src/stdgpu/hip/CMakeLists.txt b/src/stdgpu/hip/CMakeLists.txt
index 4d8f9c76c..c84bc883c 100644
--- a/src/stdgpu/hip/CMakeLists.txt
+++ b/src/stdgpu/hip/CMakeLists.txt
@@ -21,7 +21,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
                                  TYPE HEADERS
                                  BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
                                  FILES impl/atomic_detail.h
-                                       impl/error.h)
+                                       impl/error.h
+                                       impl/memory_detail.h)
 endif()
 
 target_compile_features(stdgpu PUBLIC hip_std_17)
diff --git a/src/stdgpu/hip/impl/memory_detail.h b/src/stdgpu/hip/impl/memory_detail.h
new file mode 100644
index 000000000..fb945bc6d
--- /dev/null
+++ b/src/stdgpu/hip/impl/memory_detail.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2024 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef STDGPU_HIP_MEMORY_DETAIL_H
+#define STDGPU_HIP_MEMORY_DETAIL_H
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/hip/detail/util.h>
+
+#include <stdgpu/hip/impl/error.h>
+
+namespace stdgpu::hip
+{
+
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_impl(ExecutionPolicy&& policy,
+            void* destination,
+            const void* source,
+            index64_t bytes,
+            hipMemcpyKind kind,
+            bool needs_sychronization)
+{
+    cudaStream_t stream =
+            thrust::hip_rocprim::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));
+
+    STDGPU_HIP_SAFE_CALL(hipMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
+    if (needs_sychronization)
+    {
+        STDGPU_HIP_SAFE_CALL(hipStreamSynchronize(stream));
+    }
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyDeviceToDevice, false);
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyDeviceToHost, true);
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyHostToDevice, false);
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    memcpy_impl(std::forward<ExecutionPolicy>(policy), destination, source, bytes, hipMemcpyHostToHost, true);
+}
+
+} // namespace stdgpu::hip
+
+#endif // STDGPU_HIP_MEMORY_DETAIL_H
diff --git a/src/stdgpu/hip/memory.h b/src/stdgpu/hip/memory.h
index 9c215c49e..840a5c5a9 100644
--- a/src/stdgpu/hip/memory.h
+++ b/src/stdgpu/hip/memory.h
@@ -17,6 +17,8 @@
 #define STDGPU_HIP_MEMORY_H
 
 #include <stdgpu/cstddef.h>
+#include <stdgpu/execution.h>
+#include <stdgpu/type_traits.h>
 
 namespace stdgpu::hip
 {
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
 void
 memcpy_host_to_host(void* destination, const void* source, index64_t bytes);
 
+/**
+ * \brief Performs platform-specific memory copy from device to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from device to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
 } // namespace stdgpu::hip
 
+#include <stdgpu/hip/impl/memory_detail.h>
+
 #endif // STDGPU_HIP_MEMORY_H
diff --git a/src/stdgpu/impl/atomic_detail.cuh b/src/stdgpu/impl/atomic_detail.cuh
index 09f7f8d8c..d64a8cbb6 100644
--- a/src/stdgpu/impl/atomic_detail.cuh
+++ b/src/stdgpu/impl/atomic_detail.cuh
@@ -205,12 +205,30 @@ atomic<T, Allocator>::load(const memory_order order) const
     return _value_ref.load(order);
 }
 
+template <typename T, typename Allocator>
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+inline T
+atomic<T, Allocator>::load(ExecutionPolicy&& policy, const memory_order order) const
+{
+    return _value_ref.load(std::forward<ExecutionPolicy>(policy), order);
+}
+
 template <typename T, typename Allocator>
 inline STDGPU_HOST_DEVICE atomic<T, Allocator>::operator T() const
 {
     return _value_ref.operator T();
 }
 
+template <typename T, typename Allocator>
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+inline void
+atomic<T, Allocator>::store(ExecutionPolicy&& policy, const T desired, const memory_order order)
+{
+    _value_ref.store(std::forward<ExecutionPolicy>(policy), desired, order);
+}
+
 template <typename T, typename Allocator>
 inline STDGPU_HOST_DEVICE void
 atomic<T, Allocator>::store(const T desired, const memory_order order)
@@ -430,12 +448,36 @@ atomic_ref<T>::load([[maybe_unused]] const memory_order order) const
 
     detail::atomic_consistency_thread_fence(order);
 #else
-    copyDevice2HostArray<T>(_value, 1, &local_value, MemoryCopy::NO_CHECK);
+    local_value = load(execution::device, order);
 #endif
 
     return local_value;
 }
 
+template <typename T>
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+inline T
+atomic_ref<T>::load(ExecutionPolicy&& policy, [[maybe_unused]] const memory_order order) const
+{
+    if (_value == nullptr)
+    {
+        return 0;
+    }
+
+    T local_value;
+    stdgpu::detail::memcpy(std::forward<ExecutionPolicy>(policy),
+                           // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
+                           static_cast<void*>(const_cast<std::remove_cv_t<T>*>(&local_value)),
+                           // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
+                           static_cast<void*>(const_cast<std::remove_cv_t<T>*>(_value)),
+                           1 * static_cast<stdgpu::index64_t>(sizeof(T)), // NOLINT(bugprone-sizeof-expression)
+                           stdgpu::dynamic_memory_type::host,
+                           stdgpu::dynamic_memory_type::device);
+
+    return local_value;
+}
+
 template <typename T>
 inline STDGPU_HOST_DEVICE atomic_ref<T>::operator T() const
 {
@@ -458,10 +500,31 @@ atomic_ref<T>::store(const T desired, [[maybe_unused]] const memory_order order)
 
     detail::atomic_store_thread_fence(order);
 #else
-    copyHost2DeviceArray<T>(&desired, 1, _value, MemoryCopy::NO_CHECK);
+    store(execution::device, desired, order);
 #endif
 }
 
+template <typename T>
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+inline void
+atomic_ref<T>::store(ExecutionPolicy&& policy, const T desired, [[maybe_unused]] const memory_order order)
+{
+    if (_value == nullptr)
+    {
+        return;
+    }
+
+    stdgpu::detail::memcpy(std::forward<ExecutionPolicy>(policy),
+                           // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
+                           static_cast<void*>(const_cast<std::remove_cv_t<T>*>(_value)),
+                           // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
+                           static_cast<void*>(const_cast<std::remove_cv_t<T>*>(&desired)),
+                           1 * static_cast<stdgpu::index64_t>(sizeof(T)), // NOLINT(bugprone-sizeof-expression)
+                           stdgpu::dynamic_memory_type::device,
+                           stdgpu::dynamic_memory_type::host);
+}
+
 // NOLINTNEXTLINE(misc-unconventional-assign-operator,cppcoreguidelines-c-copy-assignment-signature)
 template <typename T>
 // NOLINTNEXTLINE(misc-unconventional-assign-operator,cppcoreguidelines-c-copy-assignment-signature)
diff --git a/src/stdgpu/impl/memory_detail.h b/src/stdgpu/impl/memory_detail.h
index f98f4b7db..91fc1e47b 100644
--- a/src/stdgpu/impl/memory_detail.h
+++ b/src/stdgpu/impl/memory_detail.h
@@ -28,6 +28,8 @@
 #include <stdgpu/type_traits.h>
 #include <stdgpu/utility.h>
 
+#include STDGPU_DETAIL_BACKEND_HEADER(memory.h)
+
 namespace stdgpu::detail
 {
 
@@ -52,6 +54,50 @@ memcpy(void* destination,
        dynamic_memory_type source_type,
        const bool external_memory);
 
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy(ExecutionPolicy&& policy,
+       void* destination,
+       const void* source,
+       index64_t bytes,
+       dynamic_memory_type destination_type,
+       dynamic_memory_type source_type)
+{
+    if (source_type == dynamic_memory_type::device && destination_type == dynamic_memory_type::device)
+    {
+        stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_device_to_device(std::forward<ExecutionPolicy>(policy),
+                                                                  destination,
+                                                                  source,
+                                                                  bytes);
+    }
+    else if (source_type == dynamic_memory_type::device && destination_type == dynamic_memory_type::host)
+    {
+        stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_device_to_host(std::forward<ExecutionPolicy>(policy),
+                                                                destination,
+                                                                source,
+                                                                bytes);
+    }
+    else if (source_type == dynamic_memory_type::host && destination_type == dynamic_memory_type::device)
+    {
+        stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_host_to_device(std::forward<ExecutionPolicy>(policy),
+                                                                destination,
+                                                                source,
+                                                                bytes);
+    }
+    else if (source_type == dynamic_memory_type::host && destination_type == dynamic_memory_type::host)
+    {
+        stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_host_to_host(std::forward<ExecutionPolicy>(policy),
+                                                              destination,
+                                                              source,
+                                                              bytes);
+    }
+    else
+    {
+        printf("stdgpu::detail::memcpy : Unsupported dynamic source or destination memory type\n");
+        return;
+    }
+}
+
 template <typename Iterator, typename T>
 class uninitialized_fill_functor
 {
diff --git a/src/stdgpu/openmp/CMakeLists.txt b/src/stdgpu/openmp/CMakeLists.txt
index 8d18d1a4e..7002eede7 100644
--- a/src/stdgpu/openmp/CMakeLists.txt
+++ b/src/stdgpu/openmp/CMakeLists.txt
@@ -21,7 +21,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
     target_sources(stdgpu PUBLIC FILE_SET stdgpu_backend_header_implementations
                                  TYPE HEADERS
                                  BASE_DIRS ${STDGPU_INCLUDE_LOCAL_DIR}
-                                 FILES impl/atomic_detail.h)
+                                 FILES impl/atomic_detail.h
+                                 impl/memory_detail.h)
 endif()
 
 target_compile_definitions(stdgpu PUBLIC THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP)
diff --git a/src/stdgpu/openmp/impl/memory_detail.h b/src/stdgpu/openmp/impl/memory_detail.h
new file mode 100644
index 000000000..bceb1b735
--- /dev/null
+++ b/src/stdgpu/openmp/impl/memory_detail.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright 2024 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef STDGPU_OPENMP_MEMORY_DETAIL_H
+#define STDGPU_OPENMP_MEMORY_DETAIL_H
+
+#include <cstring>
+
+namespace stdgpu::openmp
+{
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device([[maybe_unused]] ExecutionPolicy&& policy,
+                        void* destination,
+                        const void* source,
+                        index64_t bytes)
+{
+    std::memcpy(destination, source, static_cast<std::size_t>(bytes));
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host([[maybe_unused]] ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    std::memcpy(destination, source, static_cast<std::size_t>(bytes));
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device([[maybe_unused]] ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    std::memcpy(destination, source, static_cast<std::size_t>(bytes));
+}
+
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host([[maybe_unused]] ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    std::memcpy(destination, source, static_cast<std::size_t>(bytes));
+}
+
+} // namespace stdgpu::openmp
+
+#endif // STDGPU_OPENMP_MEMORY_DETAIL_H
diff --git a/src/stdgpu/openmp/memory.h b/src/stdgpu/openmp/memory.h
index 130ec9d7b..11a2149fd 100644
--- a/src/stdgpu/openmp/memory.h
+++ b/src/stdgpu/openmp/memory.h
@@ -17,6 +17,8 @@
 #define STDGPU_OPENMP_MEMORY_H
 
 #include <stdgpu/cstddef.h>
+#include <stdgpu/execution.h>
+#include <stdgpu/type_traits.h>
 
 namespace stdgpu::openmp
 {
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
 void
 memcpy_host_to_host(void* destination, const void* source, index64_t bytes);
 
+/**
+ * \brief Performs platform-specific memory copy from device to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from device to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The size of the allocated array
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
 } // namespace stdgpu::openmp
 
+#include <stdgpu/openmp/impl/memory_detail.h>
+
 #endif // STDGPU_OPENMP_MEMORY_H
diff --git a/tests/stdgpu/atomic.inc b/tests/stdgpu/atomic.inc
index 7faddf724..9eccc6676 100644
--- a/tests/stdgpu/atomic.inc
+++ b/tests/stdgpu/atomic.inc
@@ -174,7 +174,12 @@ empty_container()
 
     const T new_value = static_cast<T>(42);
     empty_container.store(new_value);
+
+    EXPECT_EQ(empty_container.load(), T());
+
     empty_container = new_value;
+
+    EXPECT_EQ(empty_container.load(), T());
 }
 
 TEST_F(stdgpu_atomic, empty_container_int)
@@ -2449,5 +2454,26 @@ TEST_F(stdgpu_atomic, custom_execution_policy)
 
     stdgpu::atomic<int> value = stdgpu::atomic<int>::createDeviceObject(policy);
 
+    EXPECT_EQ(value.load(policy), int());
+
+    const int new_value = 42;
+    value.store(policy, new_value);
+
+    EXPECT_EQ(value.load(policy), new_value);
+
     stdgpu::atomic<int>::destroyDeviceObject(policy, value);
 }
+
+TEST_F(stdgpu_atomic, custom_execution_policy_empty_container)
+{
+    test_utils::custom_device_policy policy;
+
+    stdgpu::atomic<int> empty_container;
+
+    EXPECT_EQ(empty_container.load(policy), int());
+
+    const int new_value = 42;
+    empty_container.store(policy, new_value);
+
+    EXPECT_EQ(empty_container.load(policy), int());
+}
diff --git a/tests/stdgpu/main.cpp b/tests/stdgpu/main.cpp
index 04e363ec0..b5fa0c1c1 100644
--- a/tests/stdgpu/main.cpp
+++ b/tests/stdgpu/main.cpp
@@ -66,6 +66,7 @@ main(int argc, char* argv[])
            stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host),
            stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) -
                    stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host));
+    printf("+---------------------------------------------------------+\n");
 
     return result;
 }