diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio index b2b30b858e..fee6cab1ed 100644 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio @@ -1,301 +1,301 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg index 0b62bbda0e..54b48bcd9a 100644 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg @@ -1,2 +1,4 @@ 
-timetimedefault streamdefault streamH2Ddata1H2D...H2Ddata2H2D...kerneldata1kernel...kerneldata2kernel...D2Hdata1D2H...D2Hdata2D2H...H2Ddata2H2D...kerneldata2kernel...stream2stream2D2Hdata2D2H...H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata1D2H...default streamdefault streamSeqeuntial calls:Seqeuntial calls:Asynchronous calls:Asynchronous calls:Asynchronous calls with hipEvent:Asynchronous calls with hipEvent: -H2Ddata2H2D...kerneldata2kernel...stream2stream2H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata1D2H...default streamdefault streamevent1event1event2event2D2Hdata2D2H...create eventscreate eventsText is not SVG - cannot display \ No newline at end of file + + + +timetimedefault streamdefault streamH2Ddata1H2D...H2Ddata2H2D...kerneldata1kernel...kerneldata2kernel...D2Hdata1D2H...D2Hdata2D2H...H2Ddata2H2D...kerneldata2kernel...stream2stream2D2Hdata2D2H...H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata1D2H...default streamdefault streamSeqeuntial calls:Seqeuntial calls:Asynchronous calls:Asynchronous calls:Asynchronous calls with hipEvent:Asynchronous calls with hipEvent: H2Ddata2H2D...kerneldata2kernel...stream2stream2H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata2D2H...default streamdefault streamevent1event1event2event2D2Hdata1D2H...create eventscreate eventsText is not SVG - cannot display \ No newline at end of file diff --git a/docs/how-to/hip_runtime_api/asynchronous.rst b/docs/how-to/hip_runtime_api/asynchronous.rst index f55b0adca0..8596bf3d1c 100644 --- a/docs/how-to/hip_runtime_api/asynchronous.rst +++ b/docs/how-to/hip_runtime_api/asynchronous.rst @@ -94,9 +94,9 @@ When running kernels in parallel, the execution time can increase due to contention for shared resources. This is because multiple kernels may attempt to access the same GPU resources simultaneously, leading to delays. -Asynchronous kernel execution is beneficial only under specific conditions. 
-It is most effective when the kernels do not fully utilize the GPU's resources. -In such cases, overlapping kernel execution can improve overall throughput and +Asynchronous kernel execution is beneficial only under specific conditions. It +is most effective when the kernels do not fully utilize the GPU's resources. In +such cases, overlapping kernel execution can improve overall throughput and efficiency by keeping the GPU busy without exceeding its capacity. Overlap of data transfer and kernel execution @@ -161,8 +161,8 @@ Synchronization, event management and synchronous calls =============================================================================== Synchronization and event management are important for coordinating tasks and -ensuring correct execution order, and synchronous calls are -necessary for maintaining data consistency. +ensuring correct execution order, and synchronous calls are necessary for +maintaining data consistency. Synchronous calls ------------------------------------------------------------------------------- @@ -199,7 +199,7 @@ Example ------------------------------------------------------------------------------- The examples shows the difference between sequential, asynchronous calls and -asynchronous calls with hipEvents. +asynchronous calls with ``hipEvents``. .. 
figure:: ../../data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg :alt: Compare the different calls @@ -396,28 +396,30 @@ The example codes // Stream 1: Host to Device 1 hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); + // Stream 2: Host to Device 2 + hipMemcpyAsync(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice, stream2); + // Stream 1: Kernel 1 - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream1, d_data1, 1); + hipLaunchKernelGGL(kernel, dim3(N / 256), dim3(256), 0, stream1, d_data1, 1); - // Record event after the GPU kernel in stream1 + // Record event after the GPU kernel in Stream 1 hipEventRecord(event1, stream1); - // Stream 1: Device to Host 1 (after event) + // Stream 2: Wait for Event 1 before starting Kernel 2 hipStreamWaitEvent(stream2, event1, 0); - hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream2); - - // Stream 2: Host to Device 2 - hipMemcpyAsync(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice, stream2); // Stream 2: Kernel 2 - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream2, d_data2, 2); + hipLaunchKernelGGL(kernel, dim3(N / 256), dim3(256), 0, stream2, d_data2, 2); - // Record event after the GPU kernel in stream2 + // Record event after Kernel 2 in Stream 2 hipEventRecord(event2, stream2); - // Stream 2: Device to Host 2 (after event) + // Stream 1: Wait for Event 2 before Device to Host copy hipStreamWaitEvent(stream1, event2, 0); - hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream1); + hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); + + // Stream 2: Device to Host 2 (after Kernel 2) + hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream2); // Wait for all operations in both streams to complete hipStreamSynchronize(stream1);