diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio index fee6cab1ed..8907d025f2 100644 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio @@ -1,301 +1,289 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg index 54b48bcd9a..43d146db2d 100644 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg @@ -1,4 +1,2 @@ - - - -timetimedefault streamdefault streamH2Ddata1H2D...H2Ddata2H2D...kerneldata1kernel...kerneldata2kernel...D2Hdata1D2H...D2Hdata2D2H...H2Ddata2H2D...kerneldata2kernel...stream2stream2D2Hdata2D2H...H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata1D2H...default streamdefault streamSeqeuntial calls:Seqeuntial calls:Asynchronous calls:Asynchronous calls:Asynchronous calls with hipEvent:Asynchronous calls with hipEvent: H2Ddata2H2D...kerneldata2kernel...stream2stream2H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata2D2H...default streamdefault streamevent1event1event2event2D2Hdata1D2H...create eventscreate eventsText is not SVG - cannot display \ No newline at end of file +timetimedefault streamdefault streamH2Ddata1H2D...H2Ddata2H2D...kerneldata1kernel...kerneldata2kernel...D2Hdata1D2H...D2Hdata2D2H...H2Ddata2H2D...kerneldata2kernel...stream2stream2D2Hdata2D2H...H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata1D2H...default streamdefault streamSeqeuntial calls:Seqeuntial calls:Asynchronous calls:Asynchronous calls:Asynchronous calls with hipEvent:Asynchronous calls with hipEvent: +H2Ddata2H2D...kerneldata2kernel...stream2stream2H2Ddata1H2D...kerneldata1kernel...stream1stream1D2Hdata2D2H...default streamdefault streameventeventD2Hdata1D2H...create eventscreate eventsText is not SVG - cannot display \ No newline at end of file diff --git a/docs/how-to/hip_runtime_api/asynchronous.rst b/docs/how-to/hip_runtime_api/asynchronous.rst index 8596bf3d1c..18359ed32a 100644 --- a/docs/how-to/hip_runtime_api/asynchronous.rst +++ b/docs/how-to/hip_runtime_api/asynchronous.rst @@ -48,10 +48,6 @@ to the newly created stream: - :cpp:func:`hipStreamCreateWithPriority``: Allows creating a stream with a specified priority, enabling prioritization of certain tasks. -Assigning different operations to different streams allows multiple tasks to -run simultaneously, improving overall -:ref:`performance`. - The :cpp:func:`hipStreamSynchronize` function is used to block the calling host thread until all previously submitted tasks in a specified HIP stream have completed. It ensures that all operations in the given stream, such as kernel @@ -209,7 +205,7 @@ The example codes .. tab-set:: - .. tab-item:: sequential + .. tab-item:: Sequential .. code-block:: cpp @@ -271,7 +267,7 @@ The example codes return 0; } - .. tab-item:: asynchronous + .. tab-item:: Asynchronous .. code-block:: cpp @@ -389,9 +385,8 @@ The example codes hipStreamCreate(&stream2); // Create events - hipEvent_t event1, event2; - hipEventCreate(&event1); - hipEventCreate(&event2); + hipEvent_t event; + hipEventCreate(&event); // Stream 1: Host to Device 1 hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); @@ -402,22 +397,18 @@ The example codes // Stream 1: Kernel 1 hipLaunchKernelGGL(kernel, dim3(N / 256), dim3(256), 0, stream1, d_data1, 1); + // Stream 1: Device to Host 2 (after Kernel 1) + hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); + // Record event after the GPU kernel in Stream 1 - hipEventRecord(event1, stream1); + hipEventRecord(event, stream1); // Stream 2: Wait for Event 1 before starting Kernel 2 - hipStreamWaitEvent(stream2, event1, 0); + hipStreamWaitEvent(stream2, event, 0); // Stream 2: Kernel 2 hipLaunchKernelGGL(kernel, dim3(N / 256), dim3(256), 0, stream2, d_data2, 2); - // Record event after Kernel 2 in Stream 2 - hipEventRecord(event2, stream2); - - // Stream 1: Wait for Event 2 before Device to Host copy - hipStreamWaitEvent(stream1, event2, 0); - hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); - // Stream 2: Device to Host 2 (after Kernel 2) hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream2); @@ -426,8 +417,7 @@ The example codes hipStreamSynchronize(stream2); // Cleanup - hipEventDestroy(event1); - hipEventDestroy(event2); + hipEventDestroy(event); hipStreamDestroy(stream1); hipStreamDestroy(stream2); hipFree(d_data1); diff --git a/docs/how-to/performance_guidelines.rst b/docs/how-to/performance_guidelines.rst index 9b62e4854e..26d74975fd 100644 --- a/docs/how-to/performance_guidelines.rst +++ b/docs/how-to/performance_guidelines.rst @@ -34,12 +34,14 @@ reveal and efficiently provide as much parallelism as possible. The parallelism can be performed at the application level, device level, and multiprocessor level. +.. _application_parallel_execution: + Application level -------------------------------------------------------------------------------- To enable parallel execution of the application across the host and devices, use -asynchronous calls and streams. Assign workloads based on efficiency: serial to -the host or parallel to the devices. +:ref:`asynchronous calls and streams `. Assign workloads +based on efficiency: serial to the host or parallel to the devices. For parallel workloads, when threads belonging to the same block need to synchronize to share data, use :cpp:func:`__syncthreads()` (see: