diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio index fee6cab1ed..8907d025f2 100644 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio @@ -1,301 +1,289 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg index 54b48bcd9a..43d146db2d 100644 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg @@ -1,4 +1,2 @@ - - - -
time
time
default stream
default stream
H2D
data1
H2D...
H2D
data2
H2D...
kernel
data1
kernel...
kernel
data2
kernel...
D2H
data1
D2H...
D2H
data2
D2H...
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
D2H
data2
D2H...
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
Seqeuntial calls:
Seqeuntial calls:
Asynchronous calls:
Asynchronous calls:
Asynchronous calls with hipEvent:
Asynchronous calls with hipEvent:
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data2
D2H...
default stream
default stream
event1
event1
event2
event2
D2H
data1
D2H...
create events
create events
Text is not SVG - cannot display
\ No newline at end of file +
time
time
default stream
default stream
H2D
data1
H2D...
H2D
data2
H2D...
kernel
data1
kernel...
kernel
data2
kernel...
D2H
data1
D2H...
D2H
data2
D2H...
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
D2H
data2
D2H...
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
Seqeuntial calls:
Seqeuntial calls:
Asynchronous calls:
Asynchronous calls:
Asynchronous calls with hipEvent:
Asynchronous calls with hipEvent: +
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data2
D2H...
default stream
default stream
event
event
D2H
data1
D2H...
create events
create events
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/how-to/hip_runtime_api/asynchronous.rst b/docs/how-to/hip_runtime_api/asynchronous.rst index 8596bf3d1c..18359ed32a 100644 --- a/docs/how-to/hip_runtime_api/asynchronous.rst +++ b/docs/how-to/hip_runtime_api/asynchronous.rst @@ -48,10 +48,6 @@ to the newly created stream: - :cpp:func:`hipStreamCreateWithPriority``: Allows creating a stream with a specified priority, enabling prioritization of certain tasks. -Assigning different operations to different streams allows multiple tasks to -run simultaneously, improving overall -:ref:`performance`. - The :cpp:func:`hipStreamSynchronize` function is used to block the calling host thread until all previously submitted tasks in a specified HIP stream have completed. It ensures that all operations in the given stream, such as kernel @@ -209,7 +205,7 @@ The example codes .. tab-set:: - .. tab-item:: sequential + .. tab-item:: Sequential .. code-block:: cpp @@ -271,7 +267,7 @@ The example codes return 0; } - .. tab-item:: asynchronous + .. tab-item:: Asynchronous .. code-block:: cpp @@ -389,9 +385,8 @@ The example codes hipStreamCreate(&stream2); // Create events - hipEvent_t event1, event2; - hipEventCreate(&event1); - hipEventCreate(&event2); + hipEvent_t event; + hipEventCreate(&event); // Stream 1: Host to Device 1 hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); @@ -402,22 +397,18 @@ The example codes // Stream 1: Kernel 1 hipLaunchKernelGGL(kernel, dim3(N / 256), dim3(256), 0, stream1, d_data1, 1); + // Stream 1: Device to Host 2 (after Kernel 1) + hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); + // Record event after the GPU kernel in Stream 1 - hipEventRecord(event1, stream1); + hipEventRecord(event, stream1); // Stream 2: Wait for Event 1 before starting Kernel 2 - hipStreamWaitEvent(stream2, event1, 0); + hipStreamWaitEvent(stream2, event, 0); // Stream 2: Kernel 2 hipLaunchKernelGGL(kernel, dim3(N / 256), dim3(256), 0, stream2, d_data2, 2); - // Record event after Kernel 2 in Stream 2 - hipEventRecord(event2, stream2); - - // Stream 1: Wait for Event 2 before Device to Host copy - hipStreamWaitEvent(stream1, event2, 0); - hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); - // Stream 2: Device to Host 2 (after Kernel 2) hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream2); @@ -426,8 +417,7 @@ The example codes hipStreamSynchronize(stream2); // Cleanup - hipEventDestroy(event1); - hipEventDestroy(event2); + hipEventDestroy(event); hipStreamDestroy(stream1); hipStreamDestroy(stream2); hipFree(d_data1); diff --git a/docs/how-to/performance_guidelines.rst b/docs/how-to/performance_guidelines.rst index 9b62e4854e..26d74975fd 100644 --- a/docs/how-to/performance_guidelines.rst +++ b/docs/how-to/performance_guidelines.rst @@ -34,12 +34,14 @@ reveal and efficiently provide as much parallelism as possible. The parallelism can be performed at the application level, device level, and multiprocessor level. +.. _application_parallel_execution: + Application level -------------------------------------------------------------------------------- To enable parallel execution of the application across the host and devices, use -asynchronous calls and streams. Assign workloads based on efficiency: serial to -the host or parallel to the devices. +:ref:`asynchronous calls and streams `. Assign workloads +based on efficiency: serial to the host or parallel to the devices. For parallel workloads, when threads belonging to the same block need to synchronize to share data, use :cpp:func:`__syncthreads()` (see: