From bc29d444679a2caab8917c0d6a043a672ecf701c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 21 Nov 2024 12:32:39 +0100 Subject: [PATCH] Add asynchronous execution WIP WIP WIP WIP WIP --- .wordlist.txt | 3 +- .../hip_runtime_api/asynchronous/async.drawio | 106 +++ .../hip_runtime_api/asynchronous/async.svg | 4 + .../hip_runtime_api/asynchronous/event.drawio | 130 ++++ .../hip_runtime_api/asynchronous/event.svg | 4 + .../asynchronous/sequential.drawio | 94 +++ .../asynchronous/sequential.svg | 4 + docs/how-to/hip_runtime_api.rst | 1 + docs/how-to/hip_runtime_api/asynchronous.rst | 618 ++++++++++++++++++ docs/sphinx/_toc.yml.in | 1 + 10 files changed, 964 insertions(+), 1 deletion(-) create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/async.drawio create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/async.svg create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/event.drawio create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/event.svg create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg create mode 100644 docs/how-to/hip_runtime_api/asynchronous.rst diff --git a/.wordlist.txt b/.wordlist.txt index b3b8686678..a7955394f8 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -7,7 +7,7 @@ APUs AQL AXPY asm -Asynchrony +asynchrony backtrace Bitcode bitcode @@ -118,6 +118,7 @@ overindexing oversubscription overutilized parallelizable +parallelized pixelated pragmas preallocated diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/async.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/async.drawio new file mode 100644 index 0000000000..809d8bfb84 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/async.drawio @@ -0,0 +1,106 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/async.svg b/docs/data/how-to/hip_runtime_api/asynchronous/async.svg new file mode 100644 index 0000000000..4125fde2bc --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/async.svg @@ -0,0 +1,4 @@ + + + +
time
time
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
D2H
data2
D2H...
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/event.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/event.drawio new file mode 100644 index 0000000000..f2bd3215d3 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/event.drawio @@ -0,0 +1,130 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/event.svg b/docs/data/how-to/hip_runtime_api/asynchronous/event.svg new file mode 100644 index 0000000000..f196cc0a42 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/event.svg @@ -0,0 +1,4 @@ + + + +
time
time
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
event1
event1
event2
event2
D2H
data2
D2H...
create events
create events
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio new file mode 100644 index 0000000000..95beaa7b09 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg b/docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg new file mode 100644 index 0000000000..50bf510aae --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg @@ -0,0 +1,4 @@ + + + +
time
time
default stream
default stream
H2D
data1
H2D...
H2D
data2
H2D...
kernel
data1
kernel...
kernel
data2
kernel...
D2H
data1
D2H...
D2H
data2
D2H...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/how-to/hip_runtime_api.rst b/docs/how-to/hip_runtime_api.rst index 223f1b296e..f20d4178e6 100644 --- a/docs/how-to/hip_runtime_api.rst +++ b/docs/how-to/hip_runtime_api.rst @@ -40,6 +40,7 @@ Here are the various HIP Runtime API high level functions: * :doc:`./hip_runtime_api/initialization` * :doc:`./hip_runtime_api/memory_management` * :doc:`./hip_runtime_api/error_handling` +* :doc:`./hip_runtime_api/asynchronous` * :doc:`./hip_runtime_api/cooperative_groups` * :doc:`./hip_runtime_api/hipgraph` * :doc:`./hip_runtime_api/call_stack` diff --git a/docs/how-to/hip_runtime_api/asynchronous.rst b/docs/how-to/hip_runtime_api/asynchronous.rst new file mode 100644 index 0000000000..62791361de --- /dev/null +++ b/docs/how-to/hip_runtime_api/asynchronous.rst @@ -0,0 +1,618 @@ +.. meta:: + :description: This topic describes asynchronous concurrent execution in HIP + :keywords: AMD, ROCm, HIP, asynchronous concurrent execution, asynchronous, async, concurrent, concurrency + +.. _asynchronous_how-to: + +******************************************************************************* +Asynchronous concurrent execution +******************************************************************************* + +Asynchronous concurrent execution empowers developers to achieve efficient +parallelism and resource utilization. By understanding and implementing key +concepts and best practices, significant performance improvements are within +reach. Techniques such as overlapping computation and data transfer, managing +concurrent kernel execution, and utilizing graphs offer a robust framework for +optimizing GPU performance. As GPU technology evolves, the principles of +asynchronous execution remain critical for achieving high throughput and low +latency. Developers are encouraged to explore and experiment with these +techniques to fully harness their potential. + +Streams and concurrent execution +=============================================================================== + +Streams play a crucial role in managing the execution order of kernels and +memory operations on the GPU. By utilizing streams, developers can ensure +efficient execution of tasks, leading to improved performance and resource +utilization. + +Managing streams +------------------------------------------------------------------------------- + +A stream is a sequence of commands that execute in order on the GPU, but +commands in different streams can run concurrently. Streams enable the overlap +of computation and data transfer, ensuring continuous GPU activity. To create a +stream, the function :cpp:func:`hipStreamCreate` is used, returning a handle to +the newly created stream. Assigning different operations to different streams +allows multiple tasks to run simultaneously, improving overall performance. +Proper management of streams is crucial for effective asynchrony. Using streams +wisely can significantly reduce idle times and enhance the efficiency of GPU +applications. + +Concurrent execution between host and device +------------------------------------------------------------------------------- + +Concurrent execution between the host (CPU) and device (GPU) allows the CPU to +perform other tasks while the GPU is executing kernels. Kernels can be launched +asynchronously using ``hipLaunchKernelDefault`` with a stream, enabling the CPU +to continue executing other code while the GPU processes the kernel. Similarly, +memory operations like :cpp:func:`hipMemcpyAsync` can be performed +asynchronously, allowing data transfers between the host and device without +blocking the CPU. This concurrent execution model is vital for achieving high +performance, as it ensures that both the CPU and GPU are utilized efficiently. +By distributing workloads between the host and device, developers can +significantly reduce execution time and improve application responsiveness. + +Concurrent kernel execution +------------------------------------------------------------------------------- + +Concurrent execution of multiple kernels on the GPU allows different kernels to +run simultaneously, leveraging the parallel processing capabilities of the GPU. +Utilizing multiple streams enables developers to launch kernels concurrently, +maximizing GPU resource usage. Managing dependencies between kernels is crucial +for ensuring correct execution order. This can be achieved using +:cpp:func:`hipStreamWaitEvent`, which allows a kernel to wait for a specific +event before starting execution. Proper management of concurrent kernel +execution can lead to significant performance gains, particularly in +applications with independent tasks that can be parallelized. By maximizing the +utilization of GPU cores, developers can achieve higher throughput and +efficiency. + +Overlap of data transfer and kernel execution +=============================================================================== + +One of the primary benefits of asynchronous operations is the ability to +overlap data transfer with kernel execution, leading to better resource +utilization and improved performance. + +Querying device capabilities +------------------------------------------------------------------------------- + +Some AMD HIP-enabled devices can perform asynchronous memory copy operations to +or from the GPU concurrently with kernel execution. Applications can query this +capability by checking the ``asyncEngineCount`` device property. Devices with +an ``asyncEngineCount`` greater than zero support concurrent data transfers. +Additionally, if host memory is involved in the copy, it should be page-locked +to ensure optimal performance. + +Asynchronous memory operations +------------------------------------------------------------------------------- + +Asynchronous memory operations allow data to be transferred between the host +and device while kernels are being executed on the GPU. Using operations like +:cpp:func:`hipMemcpyAsync`, developers can initiate data transfers without +waiting for the previous operation to complete. This overlap of computation and +data transfer ensures that the GPU is not idle while waiting for data. Examples +include launching kernels in one stream while performing data transfers in +another. By carefully orchestrating these operations, developers can achieve +significant performance improvements. This technique is especially useful in +applications with large data sets that need to be processed quickly. + +Concurrent data transfers +------------------------------------------------------------------------------- + +Concurrent data transfers are supported between the host and device, within +device memory, and among multiple devices. Using :cpp:func:`hipMemcpyAsync`, +data can be transferred asynchronously, allowing for efficient data movement +without blocking other operations. :cpp:func:`hipMemcpyPeerAsync` enables data +transfers between different GPUs, facilitating multi-GPU communication. +Concurrent data transfers are important for applications that require frequent +and large data movements. By overlapping data transfers with computation, +developers can minimize idle times and enhance performance. Proper management +of data transfers can lead to efficient utilization of the memory bandwidth and +reduce bottlenecks. This is particularly important for applications that need +to handle large volumes of data efficiently. + +Concurrent data transfers with intra-device copies +------------------------------------------------------------------------------- + +It is also possible to perform intra-device copies simultaneously with kernel +execution on devices that support the ``concurrentKernels`` device property +and/or with copies to or from the device (for devices that support the +``asyncEngineCount`` property). Intra-device copies can be initiated using +standard memory copy functions with destination and source addresses residing on +the same device. + +Synchronization, event management and synchronous calls +=============================================================================== + +Synchronization and event management are crucial for coordinating tasks and +ensuring correct execution order, while synchronous calls are sometimes +necessary for maintaining data consistency. + +Synchronous calls +------------------------------------------------------------------------------- + +Despite the benefits of asynchronous operations, there are scenarios where +synchronous calls are necessary. Synchronous calls ensure task completion +before moving to the next operation, crucial for data consistency and correct +execution order. For example, :cpp:func:`hipMemcpy` for data transfers waits +for completion before returning control to the host. Similarly, synchronous +kernel launches are used when immediate completion is required. When a +synchronous function is called, control is not returned to the host thread +before the device has completed the requested task. The behavior of the host +thread—whether to yield, block, or spin—can be specified using +``hipSetDeviceFlags`` with specific flags. Understanding when to use +synchronous calls is crucial for managing execution flow and avoiding data +races. + +Events for synchronization +------------------------------------------------------------------------------- + +Events are critical for synchronization and tracking asynchronous operation +progress. By creating an event with :cpp:func:`hipEventCreate` and recording it +with ``hipEventRecord``, developers can synchronize operations across streams, +ensuring correct task execution order. :cpp:func:`hipEventSynchronize` allows +waiting for an event to complete before proceeding with the next operation. +Events are vital for managing dependencies and maintaining data consistency. +Proper event usage avoids data races and ensures efficient task execution. +Leveraging events effectively can optimize performance and enhance the overall +efficiency of GPU applications. + + +Programmatic dependent launch and synchronization +------------------------------------------------------------------------------- + +While CUDA supports programmatic dependent launches allowing a secondary kernel +to start before the primary kernel finishes, HIP achieves similar functionality +using streams and events. By employing :cpp:func:`hipStreamWaitEvent`, it is +possible to manage the execution order without explicit hardware support. This +mechanism allows a secondary kernel to launch as soon as the necessary +conditions are met, even if the primary kernel is still running. Such an +approach optimizes resource utilization and improves performance by efficiently +overlapping operations, especially in complex applications with interdependent +tasks. + +Example +------------------------------------------------------------------------------- + +.. tab-set:: + + .. tab-item:: asynchronous + + .. figure:: ../../data/how-to/hip_runtime_api/asynchronous/async.svg + :alt: Asynchronous concurrency + :align: center + + .. code-block:: cpp + + #include + #include + + // GPU Kernel + __global__ void kernel(int *data, int value) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + data[idx] = value; + } + + int main() + { + constexpr int N = 1024; + + int *h_data1, *h_data2, *d_data1, *d_data2; + h_data1 = new int[N]; + h_data2 = new int[N]; + + // Initialize host data + for(int i = 0; i < N; ++i) + { + h_data1[i] = i; + h_data2[i] = i * 2; + } + + // Set device flags to control the host thread's behavior (e.g., yielding) + hipSetDeviceFlags(hipDeviceScheduleYield); // This makes the host thread yield + + // Allocate device memory + hipMalloc(&d_data1, N * sizeof(*d_data1)); + hipMalloc(&d_data2, N * sizeof(*d_data2)); + + // Create streams + hipStream_t stream1, stream2; + hipStreamCreate(&stream1); + hipStreamCreate(&stream2); + + // Stream 1: Host to Device 1 + hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); + + // Stream 1: Kernel 1 + hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream1, d_data1, 1); + + // Stream 1: Device to Host 1 + hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); + + // Stream 2: Host to Device 2 + hipMemcpyAsync(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice, stream2); + + // Stream 2: Kernel 2 + hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream2, d_data2, 2); + + // Stream 2: Device to Host 2 + hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream2); + + // Wait for all operations in both streams to complete + hipStreamSynchronize(stream1); + hipStreamSynchronize(stream2); + + // Cleanup + hipStreamDestroy(stream1); + hipStreamDestroy(stream2); + hipFree(d_data1); + hipFree(d_data2); + delete[] h_data1; + delete[] h_data2; + + std::cout << "Asynchronous execution completed successfully." << std::endl; + + return 0; + } + + .. tab-item:: hipStreamWaitEvent + + .. figure:: ../../data/how-to/hip_runtime_api/asynchronous/event.svg + :alt: Asynchronous concurrency with events + :align: center + + .. code-block:: cpp + + #include + #include + + // GPU Kernel + __global__ void kernel(int *data, int value) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + data[idx] = value; + } + + int main() + { + constexpr int N = 1024; + + int *h_data1, *h_data2, *d_data1, *d_data2; + h_data1 = new int[N]; + h_data2 = new int[N]; + + // Initialize host data + for(int i = 0; i < N; ++i) + { + h_data1[i] = i; + h_data2[i] = i * 2; + } + + // Set device flags to control the host thread's behavior (e.g., yielding) + hipSetDeviceFlags(hipDeviceScheduleYield); // This makes the host thread yield + + // Allocate device memory + hipMalloc(&d_data1, N * sizeof(*d_data1)); + hipMalloc(&d_data2, N * sizeof(*d_data2)); + + // Create streams + hipStream_t stream1, stream2; + hipStreamCreate(&stream1); + hipStreamCreate(&stream2); + + // Create events + hipEvent_t event1, event2; + hipEventCreate(&event1); + hipEventCreate(&event2); + + // Stream 1: Host to Device 1 + hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); + + // Stream 1: Kernel 1 + hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream1, d_data1, 1); + + // Record event after the GPU kernel in stream1 + hipEventRecord(event1, stream1); + + // Stream 1: Device to Host 1 (after event) + hipStreamWaitEvent(stream2, event1, 0); + hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream2); + + // Stream 2: Host to Device 2 + hipMemcpyAsync(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice, stream2); + + // Stream 2: Kernel 2 + hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream2, d_data2, 2); + + // Record event after the GPU kernel in stream2 + hipEventRecord(event2, stream2); + + // Stream 2: Device to Host 2 (after event) + hipStreamWaitEvent(stream1, event2, 0); + hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream1); + + // Wait for all operations in both streams to complete + hipStreamSynchronize(stream1); + hipStreamSynchronize(stream2); + + // Cleanup + hipEventDestroy(event1); + hipEventDestroy(event2); + hipStreamDestroy(stream1); + hipStreamDestroy(stream2); + hipFree(d_data1); + hipFree(d_data2); + delete[] h_data1; + delete[] h_data2; + + std::cout << "Asynchronous execution with events completed successfully." << std::endl; + + return 0; + } + + .. tab-item:: sequential + + .. figure:: ../../data/how-to/hip_runtime_api/asynchronous/sequential.svg + :alt: Asynchronous concurrency with events + :align: center + + + .. code-block:: cpp + + #include + #include + + // GPU Kernel + __global__ void kernel(int *data, int value) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + data[idx] = value; + } + + int main() + { + constexpr int N = 1024; + + int *h_data1, *h_data2, *d_data1, *d_data2; + h_data1 = new int[N]; + h_data2 = new int[N]; + + // Initialize host data + for(int i = 0; i < N; ++i) + { + h_data1[i] = i; + h_data2[i] = i * 2; + } + + // Set device flags to control the host thread's behavior (e.g., yielding) + hipSetDeviceFlags(hipDeviceScheduleYield); // This makes the host thread yield + + // Allocate device memory + hipMalloc(&d_data1, N * sizeof(*d_data1)); + hipMalloc(&d_data2, N * sizeof(*d_data2)); + + // Host to Device copies + hipMemcpy(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice); + hipMemcpy(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice); + + // Launch the GPU kernels + hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, 0, d_data1, 1); + hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, 0, d_data2, 2); + + // Device to Host copies + hipMemcpy(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost); + hipMemcpy(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost); + + // Wait for all operations to complete + hipDeviceSynchronize(); + + // Cleanup + hipFree(d_data1); + hipFree(d_data2); + delete[] h_data1; + delete[] h_data2; + + std::cout << "Sequential execution completed successfully." << std::endl; + + return 0; + } + +HIP Graphs +=============================================================================== + +HIP Graphs provide a way to represent complex workflows as a series of +interconnected tasks. By creating and managing graphs, developers can optimize +dependent task execution. Graphs reduce the overhead associated with launching +individual kernels and memory operations, providing a high-level abstraction +for managing dependencies and synchronizing tasks. Examples include +representing a sequence of kernels and memory operations as a single graph. +Using graphs enhances performance and simplifies complex workflow management. +This technique is particularly useful for applications with intricate +dependencies and multiple execution stages. + +For more details, see the :ref:`how_to_HIP_graph` documentation. + +Example +------------------------------------------------------------------------------- + +This example demonstrates the use of HIP Graphs to manage asynchronous +concurrent execution of two kernels. It creates a graph with nodes for the +kernel executions and memory copies, which are then instantiated and launched +in two separate streams. This setup ensures efficient and concurrent execution, +leveraging the high-level abstraction of HIP Graphs to simplify the workflow +and improve performance. + +.. code-block:: cpp + + #include + #include + + __global__ void kernel(int *data, int value) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + data[idx] = value; + } + + int main() + { + constexpr int N = 1024; + + int *d_data1, *d_data2; + int h_data1[N], h_data2[N]; + + hipGraph_t graph; + hipGraphExec_t graphExec; + hipStream_t stream1, stream2; + hipGraphNode_t kernelNode1, kernelNode2, memcpyNode1, memcpyNode2; + hipKernelNodeParams kernelNodeParams1 = {0}; + hipKernelNodeParams kernelNodeParams2 = {0}; + hipMemcpy3DParms memcpyParams1 = {0}; + hipMemcpy3DParms memcpyParams2 = {0}; + + // Allocate device memory + hipMalloc(&d_data1, N * sizeof(*d_data1)); + hipMalloc(&d_data2, N * sizeof(*d_data2)); + + // Create streams + hipStreamCreate(&stream1); + hipStreamCreate(&stream2); + + // Create an empty graph + hipGraphCreate(&graph, 0); + + // Define kernel1 node parameters + void *kernelArgs1[] = {&d_data1, &N}; + kernelNodeParams1.func = reinterpret_cast(kernel); + kernelNodeParams1.gridDim = dim3(N / 256); + kernelNodeParams1.blockDim = dim3(256); + kernelNodeParams1.sharedMemBytes = 0; + kernelNodeParams1.kernelParams = kernelArgs1; + kernelNodeParams1.extra = nullptr; + + // Define kernel2 node parameters + void *kernelArgs2[] = {&d_data2, &N}; + kernelNodeParams2.func = reinterpret_cast(kernel); + kernelNodeParams2.gridDim = dim3(N / 256); + kernelNodeParams2.blockDim = dim3(256); + kernelNodeParams2.sharedMemBytes = 0; + kernelNodeParams2.kernelParams = kernelArgs2; + kernelNodeParams2.extra = nullptr; + + // Add kernel nodes to graph + hipGraphAddKernelNode(&kernelNode1, graph, nullptr, 0, &kernelNodeParams1); + hipGraphAddKernelNode(&kernelNode2, graph, nullptr, 0, &kernelNodeParams2); + + // Define memcpy node parameters for stream1 + memcpyParams1.srcArray = nullptr; + memcpyParams1.srcPos = make_hipPos(0, 0, 0); + memcpyParams1.dstArray = nullptr; + memcpyParams1.dstPos = make_hipPos(0, 0, 0); + memcpyParams1.extent = make_hipExtent(N * sizeof(*d_data1), 1, 1); + memcpyParams1.kind = hipMemcpyDeviceToHost; + memcpyParams1.srcPtr = make_hipPitchedPtr(d_data1, N * sizeof(*d_data1), N, 1); + memcpyParams1.dstPtr = make_hipPitchedPtr(h_data1, N * sizeof(*d_data1), N, 1); + + // Define memcpy node parameters for stream2 + memcpyParams2.srcArray = nullptr; + memcpyParams2.srcPos = make_hipPos(0, 0, 0); + memcpyParams2.dstArray = nullptr; + memcpyParams2.dstPos = make_hipPos(0, 0, 0); + memcpyParams2.extent = make_hipExtent(N * sizeof(*d_data2), 1, 1); + memcpyParams2.kind = hipMemcpyDeviceToHost; + memcpyParams2.srcPtr = make_hipPitchedPtr(d_data2, N * sizeof(*d_data2), N, 1); + memcpyParams2.dstPtr = make_hipPitchedPtr(h_data2, N * sizeof(*d_data2), N, 1); + + // Add memcpy nodes to graph + hipGraphAddMemcpyNode(&memcpyNode1, graph, &kernelNode1, 1, &memcpyParams1); + hipGraphAddMemcpyNode(&memcpyNode2, graph, &kernelNode2, 1, &memcpyParams2); + + // Instantiate the graph + hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0); + + // Launch the graph asynchronously in different streams + hipGraphLaunch(graphExec, stream1); + hipGraphLaunch(graphExec, stream2); + + // Wait for all operations in both streams to complete + hipStreamSynchronize(stream1); + hipStreamSynchronize(stream2); + + // Cleanup + hipGraphExecDestroy(graphExec); + hipGraphDestroy(graph); + hipStreamDestroy(stream1); + hipStreamDestroy(stream2); + hipFree(d_data1); + hipFree(d_data2); + + std::cout << "Graph executed with asynchronous concurrent execution." << std::endl; + + return 0; + } + +Best practices and performance optimization +=============================================================================== + +Achieving optimal performance in GPU-accelerated applications involves adhering +to best practices and continuously tuning the code to ensure efficient resource +utilization. + +Implementing best practices +------------------------------------------------------------------------------- + +Following best practices for managing asynchronous operations is crucial for +achieving optimal performance. Here are some key strategies to consider: + +- minimize synchronization overhead: Synchronize only when necessary to avoid + stalling the GPU and hindering parallelism. + +- leverage asynchronous operations: Use asynchronous memory transfers and + kernel launches to overlap computation and data transfer, maximizing resource + utilization. + +- balance workloads: Distribute tasks efficiently between the host and device + to ensure both are fully utilized. This can significantly enhance application + responsiveness and performance. + +- utilize multiple streams: Create and manage multiple streams to run commands + concurrently, reducing idle times and improving overall efficiency. + +By implementing these strategies, developers can significantly enhance +application responsiveness and overall performance. These best practices are +essential for effective asynchronous operation management and for fully +leveraging the capabilities of modern GPUs. + +Balancing and profiling +------------------------------------------------------------------------------- + +Profiling tools help identify bottlenecks by providing detailed insights into +the execution of GPU-accelerated applications. These tools allow developers to +visualize how computational tasks and memory operations are distributed across +different hardware resources. By analyzing these visualizations, developers can +pinpoint areas where the application may be spending excessive time, such as +during synchronization points or data transfers. + +Key profiling metrics include: + +- kernel execution time: Measuring the time spent executing each kernel helps + identify which kernels are taking longer than expected and may need + optimization. + +- memory transfer time: Assessing the duration of data transfers between the + host and device can highlight inefficiencies or bottlenecks in memory + operations. + +- stream utilization: Evaluating how streams are utilized can reveal whether + resources are being used effectively or if some streams are underutilized. + +- concurrency: Analyzing the overlap of computation and data transfers helps + identify opportunities to improve concurrency and reduce idle times. + +Using profiling tools, developers gain a comprehensive understanding of their +application's performance characteristics, making informed decisions about +where to focus optimization efforts. Regular profiling and adjustments ensure +that applications run at their best, maintaining high efficiency and +performance. \ No newline at end of file diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 5d69918b99..1c0adb05df 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -50,6 +50,7 @@ subtrees: - file: how-to/hip_runtime_api/memory_management/stream_ordered_allocator - file: how-to/hip_runtime_api/error_handling - file: how-to/hip_runtime_api/cooperative_groups + - file: how-to/hip_runtime_api/asynchronous - file: how-to/hip_runtime_api/hipgraph - file: how-to/hip_runtime_api/call_stack - file: how-to/hip_runtime_api/multi_device