stotko
diff --git a/‎benchmarks/stdgpu/main.cpp
+1 b/‎benchmarks/stdgpu/main.cpp
+1
diff --git a/‎src/stdgpu/atomic.cuh
+48 b/‎src/stdgpu/atomic.cuh
+48
diff --git a/‎src/stdgpu/cuda/impl/memory_detail.h
+81 b/‎src/stdgpu/cuda/impl/memory_detail.h
+81
diff --git a/‎src/stdgpu/cuda/memory.h
+52 b/‎src/stdgpu/cuda/memory.h
+52
diff --git a/‎src/stdgpu/hip/impl/memory_detail.h
+82 b/‎src/stdgpu/hip/impl/memory_detail.h
+82
diff --git a/‎src/stdgpu/hip/memory.h
+52 b/‎src/stdgpu/hip/memory.h
+52
@@ -65,6 +65,7 @@ main(int argc, char* argv[])
            stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host),
            stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) -
                    stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host));
+    printf("+---------------------------------------------------------+\n");
 
     return EXIT_SUCCESS;
 }
@@ -210,6 +210,18 @@ public:
     STDGPU_HOST_DEVICE T
     load(const memory_order order = memory_order_seq_cst) const;
 
+    /**
+     * \brief Atomically loads and returns the current value of the atomic object
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] order The memory order, memory_order_seq_cst if omitted
+     * \return The current value of this object
+     * \note NOTE(review): unlike the overload without a policy, this overload is not declared STDGPU_HOST_DEVICE -
+     *       presumably it may only be called from host code; confirm against the implementation
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    T
+    load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;
+
     /**
      * \brief Atomically loads and returns the current value of the atomic object
      * \return The current value of this object
@@ -225,6 +237,18 @@ public:
     STDGPU_HOST_DEVICE void
     store(const T desired, const memory_order order = memory_order_seq_cst);
 
+    /**
+     * \brief Atomically replaces the current value with the desired one
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] desired The value to store to the atomic object
+     * \param[in] order The memory order, memory_order_seq_cst if omitted
+     * \note NOTE(review): unlike the overload without a policy, this overload is not declared STDGPU_HOST_DEVICE -
+     *       presumably it may only be called from host code; confirm against the implementation
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    void
+    store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);
+
     /**
      * \brief Atomically replaces the current value with desired one
      * \param[in] desired The value to store to the atomic object
@@ -496,6 +520,18 @@ public:
     STDGPU_HOST_DEVICE T
     load(const memory_order order = memory_order_seq_cst) const;
 
+    /**
+     * \brief Atomically loads and returns the current value of the atomic object
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] order The memory order, memory_order_seq_cst if omitted
+     * \return The current value of this object
+     * \note NOTE(review): unlike the overload without a policy, this overload is not declared STDGPU_HOST_DEVICE -
+     *       presumably it may only be called from host code; confirm against the implementation
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    T
+    load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const;
+
     /**
      * \brief Loads and returns the current value of the atomic object
      * \return The current value of this object
@@ -512,6 +548,18 @@ public:
     STDGPU_HOST_DEVICE void
     store(const T desired, const memory_order order = memory_order_seq_cst);
 
+    /**
+     * \brief Atomically replaces the current value with the desired one
+     * \tparam ExecutionPolicy The type of the execution policy
+     * \param[in] policy The execution policy
+     * \param[in] desired The value to store to the atomic object
+     * \param[in] order The memory order, memory_order_seq_cst if omitted
+     * \note NOTE(review): unlike the overload without a policy, this overload is not declared STDGPU_HOST_DEVICE -
+     *       presumably it may only be called from host code; confirm against the implementation
+     */
+    template <typename ExecutionPolicy,
+              STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+    void
+    store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst);
+
     /**
      * \brief Replaces the current value with desired
      * \param[in] desired The value to store to the atomic object
 
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2024 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef STDGPU_CUDA_MEMORY_DETAIL_H
+#define STDGPU_CUDA_MEMORY_DETAIL_H
+
+#include <stdgpu/cuda/memory.h>
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <stdgpu/cuda/impl/error.h>
+
+namespace stdgpu::cuda
+{
+
+/**
+ * \brief Enqueues a memory copy into the stream of the execution policy and optionally waits for its completion
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy whose stream is used for the copy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \param[in] kind The direction of the copy
+ * \param[in] needs_synchronization Whether the stream is synchronized after the copy has been enqueued
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_impl(ExecutionPolicy&& policy,
+            void* destination,
+            const void* source,
+            index64_t bytes,
+            cudaMemcpyKind kind,
+            bool needs_synchronization)
+{
+    // Obtain the stream attached to the policy through thrust's helper functions
+    cudaStream_t stream = thrust::cuda_cub::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));
+
+    STDGPU_CUDA_SAFE_CALL(cudaMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
+    if (needs_synchronization)
+    {
+        // Block the calling thread until all work in the stream, including the copy above, has finished
+        STDGPU_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+    }
+}
+
+// Definition of the policy-based device-to-device copy declared in stdgpu/cuda/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The copy is only enqueued; the stream of the policy is not synchronized here
+    constexpr bool wait_for_completion = false;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                cudaMemcpyDeviceToDevice,
+                wait_for_completion);
+}
+
+// Definition of the policy-based device-to-host copy declared in stdgpu/cuda/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The stream of the policy is synchronized after the copy has been enqueued
+    constexpr bool wait_for_completion = true;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                cudaMemcpyDeviceToHost,
+                wait_for_completion);
+}
+
+// Definition of the policy-based host-to-device copy declared in stdgpu/cuda/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The copy is only enqueued; the stream of the policy is not synchronized here
+    constexpr bool wait_for_completion = false;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                cudaMemcpyHostToDevice,
+                wait_for_completion);
+}
+
+// Definition of the policy-based host-to-host copy declared in stdgpu/cuda/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The stream of the policy is synchronized after the copy has been enqueued
+    constexpr bool wait_for_completion = true;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                cudaMemcpyHostToHost,
+                wait_for_completion);
+}
+
+} // namespace stdgpu::cuda
+
+#endif // STDGPU_CUDA_MEMORY_DETAIL_H
@@ -17,6 +17,8 @@
 #define STDGPU_CUDA_MEMORY_H
 
 #include <stdgpu/cstddef.h>
+#include <stdgpu/execution.h>
+#include <stdgpu/type_traits.h>
 
 namespace stdgpu::cuda
 {
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
 void
 memcpy_host_to_host(void* destination, const void* source, index64_t bytes);
 
+/**
+ * \brief Performs platform-specific memory copy from device to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy; this function does not synchronize that stream
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from device to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy, which is synchronized before this function returns
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy; this function does not synchronize that stream
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy, which is synchronized before this function returns
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
 } // namespace stdgpu::cuda
 
+#include <stdgpu/cuda/impl/memory_detail.h>
+
 #endif // STDGPU_CUDA_MEMORY_H
@@ -0,0 +1,82 @@
+/*
+ *  Copyright 2024 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef STDGPU_HIP_MEMORY_DETAIL_H
+#define STDGPU_HIP_MEMORY_DETAIL_H
+
+#include <stdgpu/hip/memory.h>
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/hip/detail/util.h>
+
+#include <stdgpu/hip/impl/error.h>
+
+namespace stdgpu::hip
+{
+
+/**
+ * \brief Enqueues a memory copy into the stream of the execution policy and optionally waits for its completion
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy whose stream is used for the copy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \param[in] kind The direction of the copy
+ * \param[in] needs_synchronization Whether the stream is synchronized after the copy has been enqueued
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_impl(ExecutionPolicy&& policy,
+            void* destination,
+            const void* source,
+            index64_t bytes,
+            hipMemcpyKind kind,
+            bool needs_synchronization)
+{
+    // Obtain the stream attached to the policy through thrust's helper functions.
+    // The HIP backend must use the HIP stream type here rather than the CUDA one.
+    hipStream_t stream =
+            thrust::hip_rocprim::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy)));
+
+    STDGPU_HIP_SAFE_CALL(hipMemcpyAsync(destination, source, static_cast<std::size_t>(bytes), kind, stream));
+    if (needs_synchronization)
+    {
+        // Block the calling thread until all work in the stream, including the copy above, has finished
+        STDGPU_HIP_SAFE_CALL(hipStreamSynchronize(stream));
+    }
+}
+
+// Definition of the policy-based device-to-device copy declared in stdgpu/hip/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The copy is only enqueued; the stream of the policy is not synchronized here
+    constexpr bool wait_for_completion = false;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                hipMemcpyDeviceToDevice,
+                wait_for_completion);
+}
+
+// Definition of the policy-based device-to-host copy declared in stdgpu/hip/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The stream of the policy is synchronized after the copy has been enqueued
+    constexpr bool wait_for_completion = true;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                hipMemcpyDeviceToHost,
+                wait_for_completion);
+}
+
+// Definition of the policy-based host-to-device copy declared in stdgpu/hip/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The copy is only enqueued; the stream of the policy is not synchronized here
+    constexpr bool wait_for_completion = false;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                hipMemcpyHostToDevice,
+                wait_for_completion);
+}
+
+// Definition of the policy-based host-to-host copy declared in stdgpu/hip/memory.h
+template <typename ExecutionPolicy,
+          STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes)
+{
+    // The stream of the policy is synchronized after the copy has been enqueued
+    constexpr bool wait_for_completion = true;
+
+    memcpy_impl(std::forward<ExecutionPolicy>(policy),
+                destination,
+                source,
+                bytes,
+                hipMemcpyHostToHost,
+                wait_for_completion);
+}
+
+} // namespace stdgpu::hip
+
+#endif // STDGPU_HIP_MEMORY_DETAIL_H
@@ -17,6 +17,8 @@
 #define STDGPU_HIP_MEMORY_H
 
 #include <stdgpu/cstddef.h>
+#include <stdgpu/execution.h>
+#include <stdgpu/type_traits.h>
 
 namespace stdgpu::hip
 {
@@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes);
 void
 memcpy_host_to_host(void* destination, const void* source, index64_t bytes);
 
+/**
+ * \brief Performs platform-specific memory copy from device to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy; this function does not synchronize that stream
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from device to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy, which is synchronized before this function returns
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to device
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy; this function does not synchronize that stream
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
+/**
+ * \brief Performs platform-specific memory copy from host to host
+ * \tparam ExecutionPolicy The type of the execution policy
+ * \param[in] policy The execution policy
+ * \param[in] destination The destination array
+ * \param[in] source The source array
+ * \param[in] bytes The number of bytes to copy
+ * \note The copy is enqueued into the stream of the policy, which is synchronized before this function returns
+ */
+template <typename ExecutionPolicy, STDGPU_DETAIL_OVERLOAD_IF(is_execution_policy_v<remove_cvref_t<ExecutionPolicy>>)>
+void
+memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes);
+
 } // namespace stdgpu::hip
 
+#include <stdgpu/hip/impl/memory_detail.h>
+
 #endif // STDGPU_HIP_MEMORY_H
Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,7 @@ main(int argc, char* argv[])`
`65`	`65`	`stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host),`
`66`	`66`	`stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) -`
`67`	`67`	`stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host));`
	`68`	`+ printf("+---------------------------------------------------------+\n");`
`68`	`69`
`69`	`70`	`return EXIT_SUCCESS;`
`70`	`71`	`}`