diff --git a/benchmarks/stdgpu/main.cpp b/benchmarks/stdgpu/main.cpp index 7d913072..9d81e50a 100644 --- a/benchmarks/stdgpu/main.cpp +++ b/benchmarks/stdgpu/main.cpp @@ -65,6 +65,7 @@ main(int argc, char* argv[]) stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host), stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) - stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host)); + printf("+---------------------------------------------------------+\n"); return EXIT_SUCCESS; } diff --git a/src/stdgpu/atomic.cuh b/src/stdgpu/atomic.cuh index 36c5f413..ed594dd5 100644 --- a/src/stdgpu/atomic.cuh +++ b/src/stdgpu/atomic.cuh @@ -210,6 +210,18 @@ public: STDGPU_HOST_DEVICE T load(const memory_order order = memory_order_seq_cst) const; + /** + * \brief Atomically loads and returns the current value of the atomic object + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] order The memory order + * \return The current value of this object + */ + template >)> + T + load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const; + /** * \brief Atomically loads and returns the current value of the atomic object * \return The current value of this object @@ -225,6 +237,18 @@ public: STDGPU_HOST_DEVICE void store(const T desired, const memory_order order = memory_order_seq_cst); + /** + * \brief Atomically replaces the current value with desired one + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] desired The value to store to the atomic object + * \param[in] order The memory order + */ + template >)> + void + store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst); + /** * \brief Atomically replaces the current value with desired one * \param[in] desired The value to store to the atomic object @@ -496,6 +520,18 @@ public: STDGPU_HOST_DEVICE T load(const memory_order order = memory_order_seq_cst) const; + /** + * \brief Atomically loads and returns the current value of the atomic object + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] order The memory order + * \return The current value of this object + */ + template >)> + T + load(ExecutionPolicy&& policy, const memory_order order = memory_order_seq_cst) const; + /** * \brief Loads and returns the current value of the atomic object * \return The current value of this object @@ -512,6 +548,18 @@ public: STDGPU_HOST_DEVICE void store(const T desired, const memory_order order = memory_order_seq_cst); + /** + * \brief Atomically replaces the current value with desired one + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] desired The value to store to the atomic object + * \param[in] order The memory order + */ + template >)> + void + store(ExecutionPolicy&& policy, const T desired, const memory_order order = memory_order_seq_cst); + /** * \brief Replaces the current value with desired * \param[in] desired The value to store to the atomic object diff --git a/src/stdgpu/cuda/impl/memory_detail.h b/src/stdgpu/cuda/impl/memory_detail.h new file mode 100644 index 00000000..209ae671 --- /dev/null +++ b/src/stdgpu/cuda/impl/memory_detail.h @@ -0,0 +1,81 @@ +/* + * Copyright 2024 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STDGPU_CUDA_MEMORY_DETAIL_H +#define STDGPU_CUDA_MEMORY_DETAIL_H + +#include + +#include +#include + +#include + +namespace stdgpu::cuda +{ + +template >)> +void +memcpy_impl(ExecutionPolicy&& policy, + void* destination, + const void* source, + index64_t bytes, + cudaMemcpyKind kind, + bool needs_sychronization) +{ + cudaStream_t stream = thrust::cuda_cub::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy))); + + STDGPU_CUDA_SAFE_CALL(cudaMemcpyAsync(destination, source, static_cast(bytes), kind, stream)); + if (needs_sychronization) + { + STDGPU_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } +} + +template >)> +void +memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, cudaMemcpyDeviceToDevice, false); +} + +template >)> +void +memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, cudaMemcpyDeviceToHost, true); +} + +template >)> +void +memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, cudaMemcpyHostToDevice, false); +} + +template >)> +void +memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, cudaMemcpyHostToHost, true); +} + +} // namespace stdgpu::cuda + +#endif // STDGPU_CUDA_MEMORY_DETAIL_H diff --git a/src/stdgpu/cuda/memory.h b/src/stdgpu/cuda/memory.h index 7c6af7a1..71805ccc 100644 --- a/src/stdgpu/cuda/memory.h +++ b/src/stdgpu/cuda/memory.h @@ -17,6 +17,8 @@ #define STDGPU_CUDA_MEMORY_H #include +#include +#include namespace stdgpu::cuda { @@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes); void memcpy_host_to_host(void* destination, const void* source, index64_t bytes); +/** + * \brief Performs platform-specific memory copy from device to device + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from device to host + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from host to device + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from host to host + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + } // namespace stdgpu::cuda +#include + #endif // STDGPU_CUDA_MEMORY_H diff --git a/src/stdgpu/hip/impl/memory_detail.h b/src/stdgpu/hip/impl/memory_detail.h new file mode 100644 index 00000000..7a644f82 --- /dev/null +++ b/src/stdgpu/hip/impl/memory_detail.h @@ -0,0 +1,82 @@ +/* + * Copyright 2024 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STDGPU_HIP_MEMORY_DETAIL_H +#define STDGPU_HIP_MEMORY_DETAIL_H + +#include + +#include +#include + +#include + +namespace stdgpu::hip +{ + +template >)> +void +memcpy_impl(ExecutionPolicy&& policy, + void* destination, + const void* source, + index64_t bytes, + hipMemcpyKind kind, + bool needs_sychronization) +{ + cudaStream_t stream = + thrust::hip_rocprim::stream(thrust::detail::derived_cast(thrust::detail::strip_const(policy))); + + STDGPU_HIP_SAFE_CALL(hipMemcpyAsync(destination, source, static_cast(bytes), kind, stream)); + if (needs_sychronization) + { + STDGPU_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } +} + +template >)> +void +memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, hipMemcpyDeviceToDevice, false); +} + +template >)> +void +memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, hipMemcpyDeviceToHost, true); +} + +template >)> +void +memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, hipMemcpyHostToDevice, false); +} + +template >)> +void +memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + memcpy_impl(std::forward(policy), destination, source, bytes, hipMemcpyHostToHost, true); +} + +} // namespace stdgpu::hip + +#endif // STDGPU_HIP_MEMORY_DETAIL_H diff --git a/src/stdgpu/hip/memory.h b/src/stdgpu/hip/memory.h index 9c215c49..840a5c5a 100644 --- a/src/stdgpu/hip/memory.h +++ b/src/stdgpu/hip/memory.h @@ -17,6 +17,8 @@ #define STDGPU_HIP_MEMORY_H #include +#include +#include namespace stdgpu::hip { @@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes); void memcpy_host_to_host(void* destination, const void* source, index64_t bytes); +/** + * \brief Performs platform-specific memory copy from device to device + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from device to host + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from host to device + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from host to host + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + } // namespace stdgpu::hip +#include + #endif // STDGPU_HIP_MEMORY_H diff --git a/src/stdgpu/impl/atomic_detail.cuh b/src/stdgpu/impl/atomic_detail.cuh index 09f7f8d8..d64a8cbb 100644 --- a/src/stdgpu/impl/atomic_detail.cuh +++ b/src/stdgpu/impl/atomic_detail.cuh @@ -205,12 +205,30 @@ atomic::load(const memory_order order) const return _value_ref.load(order); } +template +template >)> +inline T +atomic::load(ExecutionPolicy&& policy, const memory_order order) const +{ + return _value_ref.load(std::forward(policy), order); +} + template inline STDGPU_HOST_DEVICE atomic::operator T() const { return _value_ref.operator T(); } +template +template >)> +inline void +atomic::store(ExecutionPolicy&& policy, const T desired, const memory_order order) +{ + _value_ref.store(std::forward(policy), desired, order); +} + template inline STDGPU_HOST_DEVICE void atomic::store(const T desired, const memory_order order) @@ -430,12 +448,36 @@ atomic_ref::load([[maybe_unused]] const memory_order order) const detail::atomic_consistency_thread_fence(order); #else - copyDevice2HostArray(_value, 1, &local_value, MemoryCopy::NO_CHECK); + local_value = load(execution::device, order); #endif return local_value; } +template +template >)> +inline T +atomic_ref::load(ExecutionPolicy&& policy, [[maybe_unused]] const memory_order order) const +{ + if (_value == nullptr) + { + return 0; + } + + T local_value; + stdgpu::detail::memcpy(std::forward(policy), + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + static_cast(const_cast*>(&local_value)), + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + static_cast(const_cast*>(_value)), + 1 * static_cast(sizeof(T)), // NOLINT(bugprone-sizeof-expression) + stdgpu::dynamic_memory_type::host, + stdgpu::dynamic_memory_type::device); + + return local_value; +} + template inline STDGPU_HOST_DEVICE atomic_ref::operator T() const { @@ -458,10 +500,31 @@ atomic_ref::store(const T desired, [[maybe_unused]] const memory_order order) detail::atomic_store_thread_fence(order); #else - copyHost2DeviceArray(&desired, 1, _value, MemoryCopy::NO_CHECK); + store(execution::device, desired, order); #endif } +template +template >)> +inline void +atomic_ref::store(ExecutionPolicy&& policy, const T desired, [[maybe_unused]] const memory_order order) +{ + if (_value == nullptr) + { + return; + } + + stdgpu::detail::memcpy(std::forward(policy), + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + static_cast(const_cast*>(_value)), + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + static_cast(const_cast*>(&desired)), + 1 * static_cast(sizeof(T)), // NOLINT(bugprone-sizeof-expression) + stdgpu::dynamic_memory_type::device, + stdgpu::dynamic_memory_type::host); +} + // NOLINTNEXTLINE(misc-unconventional-assign-operator,cppcoreguidelines-c-copy-assignment-signature) template // NOLINTNEXTLINE(misc-unconventional-assign-operator,cppcoreguidelines-c-copy-assignment-signature) diff --git a/src/stdgpu/impl/memory_detail.h b/src/stdgpu/impl/memory_detail.h index f98f4b7d..91fc1e47 100644 --- a/src/stdgpu/impl/memory_detail.h +++ b/src/stdgpu/impl/memory_detail.h @@ -28,6 +28,8 @@ #include #include +#include STDGPU_DETAIL_BACKEND_HEADER(memory.h) + namespace stdgpu::detail { @@ -52,6 +54,50 @@ memcpy(void* destination, dynamic_memory_type source_type, const bool external_memory); +template >)> +void +memcpy(ExecutionPolicy&& policy, + void* destination, + const void* source, + index64_t bytes, + dynamic_memory_type destination_type, + dynamic_memory_type source_type) +{ + if (source_type == dynamic_memory_type::device && destination_type == dynamic_memory_type::device) + { + stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_device_to_device(std::forward(policy), + destination, + source, + bytes); + } + else if (source_type == dynamic_memory_type::device && destination_type == dynamic_memory_type::host) + { + stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_device_to_host(std::forward(policy), + destination, + source, + bytes); + } + else if (source_type == dynamic_memory_type::host && destination_type == dynamic_memory_type::device) + { + stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_host_to_device(std::forward(policy), + destination, + source, + bytes); + } + else if (source_type == dynamic_memory_type::host && destination_type == dynamic_memory_type::host) + { + stdgpu::STDGPU_BACKEND_NAMESPACE::memcpy_host_to_host(std::forward(policy), + destination, + source, + bytes); + } + else + { + printf("stdgpu::detail::memcpy : Unsupported dynamic source or destination memory type\n"); + return; + } +} + template class uninitialized_fill_functor { diff --git a/src/stdgpu/openmp/impl/memory_detail.h b/src/stdgpu/openmp/impl/memory_detail.h new file mode 100644 index 00000000..ea734132 --- /dev/null +++ b/src/stdgpu/openmp/impl/memory_detail.h @@ -0,0 +1,63 @@ +/* + * Copyright 2024 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STDGPU_OPENMP_MEMORY_DETAIL_H +#define STDGPU_OPENMP_MEMORY_DETAIL_H + +#include + +#include + +namespace stdgpu::openmp +{ + +template >)> +void +memcpy_device_to_device([[maybe_unused]] ExecutionPolicy&& policy, + void* destination, + const void* source, + index64_t bytes) +{ + std::memcpy(destination, source, static_cast(bytes)); +} + +template >)> +void +memcpy_device_to_host([[maybe_unused]] ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + std::memcpy(destination, source, static_cast(bytes)); +} + +template >)> +void +memcpy_host_to_device([[maybe_unused]] ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + std::memcpy(destination, source, static_cast(bytes)); +} + +template >)> +void +memcpy_host_to_host([[maybe_unused]] ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes) +{ + std::memcpy(destination, source, static_cast(bytes)); +} + +} // namespace stdgpu::openmp + +#endif // STDGPU_OPENMP_MEMORY_DETAIL_H diff --git a/src/stdgpu/openmp/memory.h b/src/stdgpu/openmp/memory.h index 130ec9d7..11a2149f 100644 --- a/src/stdgpu/openmp/memory.h +++ b/src/stdgpu/openmp/memory.h @@ -17,6 +17,8 @@ #define STDGPU_OPENMP_MEMORY_H #include +#include +#include namespace stdgpu::openmp { @@ -90,6 +92,56 @@ memcpy_host_to_device(void* destination, const void* source, index64_t bytes); void memcpy_host_to_host(void* destination, const void* source, index64_t bytes); +/** + * \brief Performs platform-specific memory copy from device to device + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_device_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from device to host + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_device_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from host to device + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_host_to_device(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + +/** + * \brief Performs platform-specific memory copy from host to host + * \tparam ExecutionPolicy The type of the execution policy + * \param[in] policy The execution policy + * \param[in] destination The destination array + * \param[in] source The source array + * \param[in] bytes The size of the allocated array + */ +template >)> +void +memcpy_host_to_host(ExecutionPolicy&& policy, void* destination, const void* source, index64_t bytes); + } // namespace stdgpu::openmp +#include + #endif // STDGPU_OPENMP_MEMORY_H diff --git a/tests/stdgpu/atomic.inc b/tests/stdgpu/atomic.inc index 7faddf72..a5d7719c 100644 --- a/tests/stdgpu/atomic.inc +++ b/tests/stdgpu/atomic.inc @@ -2449,5 +2449,13 @@ TEST_F(stdgpu_atomic, custom_execution_policy) stdgpu::atomic value = stdgpu::atomic::createDeviceObject(policy); + const int new_value = 42; + + EXPECT_EQ(value.load(policy), int()); + + value.store(policy, new_value); + + EXPECT_EQ(value.load(policy), new_value); + stdgpu::atomic::destroyDeviceObject(policy, value); } diff --git a/tests/stdgpu/main.cpp b/tests/stdgpu/main.cpp index 04e363ec..b5fa0c1c 100644 --- a/tests/stdgpu/main.cpp +++ b/tests/stdgpu/main.cpp @@ -66,6 +66,7 @@ main(int argc, char* argv[]) stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host), stdgpu::get_allocation_count(stdgpu::dynamic_memory_type::host) - stdgpu::get_deallocation_count(stdgpu::dynamic_memory_type::host)); + printf("+---------------------------------------------------------+\n"); return result; }