From 6a139c6fd78c23496abcd9614b89dd72ab532ea3 Mon Sep 17 00:00:00 2001 From: Istvan Kiss Date: Fri, 6 Dec 2024 19:18:07 +0100 Subject: [PATCH] Rewrite some sections + internal review --- .../hip_runtime_api/asynchronous/async.drawio | 106 --- .../hip_runtime_api/asynchronous/async.svg | 4 - .../hip_runtime_api/asynchronous/event.drawio | 130 ---- .../hip_runtime_api/asynchronous/event.svg | 4 - .../asynchronous/sequential.drawio | 94 --- .../asynchronous/sequential.svg | 4 - .../sequential_async_event.drawio | 298 ++++++++ .../asynchronous/sequential_async_event.svg | 2 + docs/how-to/hip_runtime_api/asynchronous.rst | 719 +++++++----------- docs/how-to/performance_guidelines.rst | 8 +- docs/sphinx/_toc.yml.in | 4 +- 11 files changed, 601 insertions(+), 772 deletions(-) delete mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/async.drawio delete mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/async.svg delete mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/event.drawio delete mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/event.svg delete mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio delete mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio create mode 100644 docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/async.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/async.drawio deleted file mode 100644 index 809d8bfb84..0000000000 --- a/docs/data/how-to/hip_runtime_api/asynchronous/async.drawio +++ /dev/null @@ -1,106 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/async.svg b/docs/data/how-to/hip_runtime_api/asynchronous/async.svg deleted file mode 100644 index 4125fde2bc..0000000000 --- a/docs/data/how-to/hip_runtime_api/asynchronous/async.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
time
time
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
D2H
data2
D2H...
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/event.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/event.drawio deleted file mode 100644 index f2bd3215d3..0000000000 --- a/docs/data/how-to/hip_runtime_api/asynchronous/event.drawio +++ /dev/null @@ -1,130 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/event.svg b/docs/data/how-to/hip_runtime_api/asynchronous/event.svg deleted file mode 100644 index f196cc0a42..0000000000 --- a/docs/data/how-to/hip_runtime_api/asynchronous/event.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
time
time
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
event1
event1
event2
event2
D2H
data2
D2H...
create events
create events
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio deleted file mode 100644 index 95beaa7b09..0000000000 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential.drawio +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg b/docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg deleted file mode 100644 index 50bf510aae..0000000000 --- a/docs/data/how-to/hip_runtime_api/asynchronous/sequential.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
time
time
default stream
default stream
H2D
data1
H2D...
H2D
data2
H2D...
kernel
data1
kernel...
kernel
data2
kernel...
D2H
data1
D2H...
D2H
data2
D2H...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio new file mode 100644 index 0000000000..3d548182d7 --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.drawio @@ -0,0 +1,298 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg new file mode 100644 index 0000000000..43d146db2d --- /dev/null +++ b/docs/data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg @@ -0,0 +1,2 @@ +
time
time
default stream
default stream
H2D
data1
H2D...
H2D
data2
H2D...
kernel
data1
kernel...
kernel
data2
kernel...
D2H
data1
D2H...
D2H
data2
D2H...
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
D2H
data2
D2H...
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data1
D2H...
default stream
default stream
Seqeuntial calls:
Seqeuntial calls:
Asynchronous calls:
Asynchronous calls:
Asynchronous calls with hipEvent:
Asynchronous calls with hipEvent: +
H2D
data2
H2D...
kernel
data2
kernel...
stream2
stream2
H2D
data1
H2D...
kernel
data1
kernel...
stream1
stream1
D2H
data2
D2H...
default stream
default stream
event
event
D2H
data1
D2H...
create events
create events
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/how-to/hip_runtime_api/asynchronous.rst b/docs/how-to/hip_runtime_api/asynchronous.rst index 62791361de..014b3d19cc 100644 --- a/docs/how-to/hip_runtime_api/asynchronous.rst +++ b/docs/how-to/hip_runtime_api/asynchronous.rst @@ -8,36 +8,56 @@ Asynchronous concurrent execution ******************************************************************************* -Asynchronous concurrent execution empowers developers to achieve efficient -parallelism and resource utilization. By understanding and implementing key -concepts and best practices, significant performance improvements are within -reach. Techniques such as overlapping computation and data transfer, managing -concurrent kernel execution, and utilizing graphs offer a robust framework for -optimizing GPU performance. As GPU technology evolves, the principles of -asynchronous execution remain critical for achieving high throughput and low -latency. Developers are encouraged to explore and experiment with these -techniques to fully harness their potential. +Asynchronous concurrent execution important for efficient parallelism and +resource utilization, with techniques such as overlapping computation and data +transfer, managing concurrent kernel execution with streams on single or +multiple devices or using HIP graphs. Streams and concurrent execution =============================================================================== -Streams play a crucial role in managing the execution order of kernels and -memory operations on the GPU. By utilizing streams, developers can ensure -efficient execution of tasks, leading to improved performance and resource -utilization. +All asynchronous APIs, such as kernel execution, data movement and potentially +data allocation/freeing all happen in the context of device streams. + +Streams are FIFO buffers of commands to execute in order on a given device. +Commands which enqueue tasks on a stream all return promptly and the command is +executed asynchronously. Multiple streams may point to the same device and +those streams may be fed from multiple concurrent host-side threads. Execution +on multiple streams may be concurrent but isn't required to be. Managing streams ------------------------------------------------------------------------------- -A stream is a sequence of commands that execute in order on the GPU, but -commands in different streams can run concurrently. Streams enable the overlap -of computation and data transfer, ensuring continuous GPU activity. To create a -stream, the function :cpp:func:`hipStreamCreate` is used, returning a handle to -the newly created stream. Assigning different operations to different streams -allows multiple tasks to run simultaneously, improving overall performance. -Proper management of streams is crucial for effective asynchrony. Using streams -wisely can significantly reduce idle times and enhance the efficiency of GPU -applications. +Streams enable the overlap of computation and data transfer, ensuring +continuous GPU activity. + +To create a stream, the following functions are used, each returning a handle +to the newly created stream: + +- :cpp:func:`hipStreamCreate`: Creates a stream with default settings. +- :cpp:func:`hipStreamCreateWithFlags`: Allows creating a stream, with specific + flags, listed below, enabling more control over stream behavior: + + - ``hipStreamDefault``: creates a default stream suitable for most + operations. It ensures that the stream is not non-blocking. + - ``hipStreamNonBlocking``: creates a non-blocking stream, allowing + concurrent execution of operations. It ensures that tasks can run + simultaneously without waiting for each other to complete, thus improving + overall performance. + +- :cpp:func:`hipStreamCreateWithPriority``: Allows creating a stream with a + specified priority, enabling prioritization of certain tasks. + +The :cpp:func:`hipStreamSynchronize` function is used to block the calling host +thread until all previously submitted tasks in a specified HIP stream have +completed. It ensures that all operations in the given stream, such as kernel +executions or memory transfers, are finished before the host thread proceeds. + +.. note:: + + If the :cpp:func:`hipStreamSynchronize` function input stream is 0 (or the + default stream), it waits for all operations in the default stream to + complete. Concurrent execution between host and device ------------------------------------------------------------------------------- @@ -48,25 +68,32 @@ asynchronously using ``hipLaunchKernelDefault`` with a stream, enabling the CPU to continue executing other code while the GPU processes the kernel. Similarly, memory operations like :cpp:func:`hipMemcpyAsync` can be performed asynchronously, allowing data transfers between the host and device without -blocking the CPU. This concurrent execution model is vital for achieving high -performance, as it ensures that both the CPU and GPU are utilized efficiently. -By distributing workloads between the host and device, developers can -significantly reduce execution time and improve application responsiveness. +blocking the CPU. Concurrent kernel execution ------------------------------------------------------------------------------- Concurrent execution of multiple kernels on the GPU allows different kernels to -run simultaneously, leveraging the parallel processing capabilities of the GPU. -Utilizing multiple streams enables developers to launch kernels concurrently, -maximizing GPU resource usage. Managing dependencies between kernels is crucial -for ensuring correct execution order. This can be achieved using -:cpp:func:`hipStreamWaitEvent`, which allows a kernel to wait for a specific -event before starting execution. Proper management of concurrent kernel -execution can lead to significant performance gains, particularly in -applications with independent tasks that can be parallelized. By maximizing the -utilization of GPU cores, developers can achieve higher throughput and -efficiency. +run simultaneously to maximize GPU resource usage. Managing dependencies +between kernels is crucial for ensuring correct execution order. This can be +achieved using :cpp:func:`hipStreamWaitEvent`, which allows a kernel to wait +for a specific event before starting execution. + +Independent kernels can only run concurrently, if there are enough registers +and share memories for the kernels. To reach concurrent kernel executions, the +developer may have to reduce the block size of the kernels. The kernel runtimes +can be misleading at concurrent kernel runs, that's why during optimization +it's better to check the trace files, to see if a kernel is blocking another +kernel, while they are running parallel. + +When running kernels in parallel, the execution time can increase due to +contention for shared resources. This is because multiple kernels may attempt +to access the same GPU resources simultaneously, leading to delays. + +Asynchronous kernel execution is beneficial only under specific conditions It +is most effective when the kernels do not fully utilize the GPU's resources. In +such cases, overlapping kernel execution can improve overall throughput and +efficiency by keeping the GPU busy without exceeding its capacity. Overlap of data transfer and kernel execution =============================================================================== @@ -75,6 +102,11 @@ One of the primary benefits of asynchronous operations is the ability to overlap data transfer with kernel execution, leading to better resource utilization and improved performance. +Asynchronous execution is particularly advantageous in iterative processes. For +instance, if an iteration calculation is initiated, it can be efficient to +prepare the input data simultaneously, provided that this preparation does not +depend on the kernel's execution. + Querying device capabilities ------------------------------------------------------------------------------- @@ -94,9 +126,8 @@ and device while kernels are being executed on the GPU. Using operations like waiting for the previous operation to complete. This overlap of computation and data transfer ensures that the GPU is not idle while waiting for data. Examples include launching kernels in one stream while performing data transfers in -another. By carefully orchestrating these operations, developers can achieve -significant performance improvements. This technique is especially useful in -applications with large data sets that need to be processed quickly. +another. This technique is especially useful in applications with large data +sets that need to be processed quickly. Concurrent data transfers ------------------------------------------------------------------------------- @@ -108,10 +139,9 @@ without blocking other operations. :cpp:func:`hipMemcpyPeerAsync` enables data transfers between different GPUs, facilitating multi-GPU communication. Concurrent data transfers are important for applications that require frequent and large data movements. By overlapping data transfers with computation, -developers can minimize idle times and enhance performance. Proper management -of data transfers can lead to efficient utilization of the memory bandwidth and -reduce bottlenecks. This is particularly important for applications that need -to handle large volumes of data efficiently. +developers can minimize idle times and enhance performance. This is +particularly important for applications that need to handle large volumes of +data efficiently. Concurrent data transfers with intra-device copies ------------------------------------------------------------------------------- @@ -120,45 +150,37 @@ It is also possible to perform intra-device copies simultaneously with kernel execution on devices that support the ``concurrentKernels`` device property and/or with copies to or from the device (for devices that support the ``asyncEngineCount`` property). Intra-device copies can be initiated using -standard memory copy functions with destination and source addresses residing on -the same device. +standard memory copy functions with destination and source addresses residing +on the same device. Synchronization, event management and synchronous calls =============================================================================== -Synchronization and event management are crucial for coordinating tasks and -ensuring correct execution order, while synchronous calls are sometimes -necessary for maintaining data consistency. +Synchronization and event management are important for coordinating tasks and +ensuring correct execution order, and synchronous calls are necessary for +maintaining data consistency. Synchronous calls ------------------------------------------------------------------------------- -Despite the benefits of asynchronous operations, there are scenarios where -synchronous calls are necessary. Synchronous calls ensure task completion -before moving to the next operation, crucial for data consistency and correct -execution order. For example, :cpp:func:`hipMemcpy` for data transfers waits -for completion before returning control to the host. Similarly, synchronous -kernel launches are used when immediate completion is required. When a -synchronous function is called, control is not returned to the host thread -before the device has completed the requested task. The behavior of the host -thread—whether to yield, block, or spin—can be specified using -``hipSetDeviceFlags`` with specific flags. Understanding when to use -synchronous calls is crucial for managing execution flow and avoiding data -races. +Synchronous calls ensure task completion before moving to the next operation. +For example, :cpp:func:`hipMemcpy` for data transfers waits for completion +before returning control to the host. Similarly, synchronous kernel launches +are used when immediate completion is required. When a synchronous function is +called, control is not returned to the host thread before the device has +completed the requested task. The behavior of the host thread—whether to yield, +block, or spin—can be specified using :cpp:func:`hipSetDeviceFlags` with +specific flags. Understanding when to use synchronous calls is important for +managing execution flow and avoiding data races. Events for synchronization ------------------------------------------------------------------------------- -Events are critical for synchronization and tracking asynchronous operation -progress. By creating an event with :cpp:func:`hipEventCreate` and recording it -with ``hipEventRecord``, developers can synchronize operations across streams, -ensuring correct task execution order. :cpp:func:`hipEventSynchronize` allows -waiting for an event to complete before proceeding with the next operation. -Events are vital for managing dependencies and maintaining data consistency. -Proper event usage avoids data races and ensures efficient task execution. -Leveraging events effectively can optimize performance and enhance the overall -efficiency of GPU applications. - +By creating an event with :cpp:func:`hipEventCreate` and recording it with +:cpp:func:`hipEventRecord`, developers can synchronize operations across +streams, ensuring correct task execution order. :cpp:func:`hipEventSynchronize` +allows waiting for an event to complete before proceeding with the next +operation. Programmatic dependent launch and synchronization ------------------------------------------------------------------------------- @@ -168,254 +190,283 @@ to start before the primary kernel finishes, HIP achieves similar functionality using streams and events. By employing :cpp:func:`hipStreamWaitEvent`, it is possible to manage the execution order without explicit hardware support. This mechanism allows a secondary kernel to launch as soon as the necessary -conditions are met, even if the primary kernel is still running. Such an -approach optimizes resource utilization and improves performance by efficiently -overlapping operations, especially in complex applications with interdependent -tasks. +conditions are met, even if the primary kernel is still running. Example ------------------------------------------------------------------------------- -.. tab-set:: +The examples shows the difference between sequential, asynchronous calls and +asynchronous calls with ``hipEvents``. - .. tab-item:: asynchronous +.. figure:: ../../data/how-to/hip_runtime_api/asynchronous/sequential_async_event.svg + :alt: Compare the different calls + :align: center - .. figure:: ../../data/how-to/hip_runtime_api/asynchronous/async.svg - :alt: Asynchronous concurrency - :align: center +The example codes + +.. tab-set:: + + .. tab-item:: Sequential .. code-block:: cpp #include + #include #include - // GPU Kernel - __global__ void kernel(int *data, int value) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - data[idx] = value; + #define HIP_CHECK(expression) \ + { \ + const hipError_t status = expression; \ + if(status != hipSuccess){ \ + std::cerr << "HIP error " \ + << status << ": " \ + << hipGetErrorString(status) \ + << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ } + // GPU Kernels + __global__ void kernelA(double* arrayA, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] *= 2.0;} + }; + __global__ void kernelB(double* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayB[x] += 3.0;} + }; + int main() { - constexpr int N = 1024; + constexpr int numOfBlocks = 256; + constexpr int threadsPerBlock = 4096; + constexpr int numberOfIterations = 50; + size_t arraySize = 1U << 20; - int *h_data1, *h_data2, *d_data1, *d_data2; - h_data1 = new int[N]; - h_data2 = new int[N]; - - // Initialize host data - for(int i = 0; i < N; ++i) - { - h_data1[i] = i; - h_data2[i] = i * 2; - } - - // Set device flags to control the host thread's behavior (e.g., yielding) - hipSetDeviceFlags(hipDeviceScheduleYield); // This makes the host thread yield + double *d_dataA; + double *d_dataB; + + std::vector vectorA(arraySize, 4.0); + std::vector vectorB(arraySize, 2.0); // Allocate device memory - hipMalloc(&d_data1, N * sizeof(*d_data1)); - hipMalloc(&d_data2, N * sizeof(*d_data2)); - - // Create streams - hipStream_t stream1, stream2; - hipStreamCreate(&stream1); - hipStreamCreate(&stream2); + HIP_CHECK(hipMalloc(&d_dataA, arraySize * sizeof(*d_dataA))); + HIP_CHECK(hipMalloc(&d_dataB, arraySize * sizeof(*d_dataB))); - // Stream 1: Host to Device 1 - hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); - - // Stream 1: Kernel 1 - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream1, d_data1, 1); - - // Stream 1: Device to Host 1 - hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream1); - - // Stream 2: Host to Device 2 - hipMemcpyAsync(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice, stream2); + for(unsigned int iteration = 0; iteration < numberOfIterations; iteration++) + { + // Host to Device copies + HIP_CHECK(hipMemcpy(d_dataA, vectorA.data(), arraySize * sizeof(*d_dataA), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_dataB, vectorB.data(), arraySize * sizeof(*d_dataB), hipMemcpyHostToDevice)); - // Stream 2: Kernel 2 - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream2, d_data2, 2); + // Launch the GPU kernels + hipLaunchKernelGGL(kernelA, dim3(numOfBlocks), dim3(threadsPerBlock), 0, 0, d_dataA, arraySize); + hipLaunchKernelGGL(kernelB, dim3(numOfBlocks), dim3(threadsPerBlock), 0, 0, d_dataB, arraySize); - // Stream 2: Device to Host 2 - hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream2); + // Device to Host copies + HIP_CHECK(hipMemcpy(vectorA.data(), d_dataA, arraySize * sizeof(*vectorA.data()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(vectorB.data(), d_dataB, arraySize * sizeof(*vectorB.data()), hipMemcpyDeviceToHost)); + } - // Wait for all operations in both streams to complete - hipStreamSynchronize(stream1); - hipStreamSynchronize(stream2); + // Wait for all operations to complete + HIP_CHECK(hipDeviceSynchronize()); // Cleanup - hipStreamDestroy(stream1); - hipStreamDestroy(stream2); - hipFree(d_data1); - hipFree(d_data2); - delete[] h_data1; - delete[] h_data2; + hipFree(d_dataA); + hipFree(d_dataB); - std::cout << "Asynchronous execution completed successfully." << std::endl; + std::cout << "Sequential execution completed successfully." << std::endl; return 0; } - .. tab-item:: hipStreamWaitEvent - - .. figure:: ../../data/how-to/hip_runtime_api/asynchronous/event.svg - :alt: Asynchronous concurrency with events - :align: center + .. tab-item:: Asynchronous .. code-block:: cpp #include + #include #include - // GPU Kernel - __global__ void kernel(int *data, int value) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - data[idx] = value; + #define HIP_CHECK(expression) \ + { \ + const hipError_t status = expression; \ + if(status != hipSuccess){ \ + std::cerr << "HIP error " \ + << status << ": " \ + << hipGetErrorString(status) \ + << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ } + // GPU Kernels + __global__ void kernelA(double* arrayA, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] *= 2.0;} + }; + __global__ void kernelB(double* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayB[x] += 3.0;} + }; + int main() { - constexpr int N = 1024; - - int *h_data1, *h_data2, *d_data1, *d_data2; - h_data1 = new int[N]; - h_data2 = new int[N]; - - // Initialize host data - for(int i = 0; i < N; ++i) - { - h_data1[i] = i; - h_data2[i] = i * 2; - } + constexpr int numOfBlocks = 256; + constexpr int threadsPerBlock = 4096; + size_t arraySize = 1U << 20; - // Set device flags to control the host thread's behavior (e.g., yielding) - hipSetDeviceFlags(hipDeviceScheduleYield); // This makes the host thread yield + double *d_dataA; + double *d_dataB; + + std::vector vectorA(arraySize, 4.0); + std::vector vectorB(arraySize, 2.0); // Allocate device memory - hipMalloc(&d_data1, N * sizeof(*d_data1)); - hipMalloc(&d_data2, N * sizeof(*d_data2)); + HIP_CHECK(hipMalloc(&d_dataA, arraySize * sizeof(*d_dataA))); + HIP_CHECK(hipMalloc(&d_dataB, arraySize * sizeof(*d_dataB))); // Create streams - hipStream_t stream1, stream2; - hipStreamCreate(&stream1); - hipStreamCreate(&stream2); + hipStream_t streamA, streamB; + HIP_CHECK(hipStreamCreate(&streamA)); + HIP_CHECK(hipStreamCreate(&streamB)); - // Create events - hipEvent_t event1, event2; - hipEventCreate(&event1); - hipEventCreate(&event2); - - // Stream 1: Host to Device 1 - hipMemcpyAsync(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice, stream1); - - // Stream 1: Kernel 1 - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream1, d_data1, 1); - - // Record event after the GPU kernel in stream1 - hipEventRecord(event1, stream1); + for(unsigned int iteration = 0; iteration < numberOfIterations; iteration++) + { + // Stream 1: Host to Device 1 + HIP_CHECK(hipMemcpyAsync(d_dataA, vectorA.data(), arraySize * sizeof(*d_dataA), hipMemcpyHostToDevice, streamA)); - // Stream 1: Device to Host 1 (after event) - hipStreamWaitEvent(stream2, event1, 0); - hipMemcpyAsync(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost, stream2); + // Stream 2: Host to Device 2 + HIP_CHECK(hipMemcpyAsync(d_dataB, vectorB.data(), arraySize * sizeof(*d_dataB), hipMemcpyHostToDevice, streamB)); - // Stream 2: Host to Device 2 - hipMemcpyAsync(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice, stream2); + // Stream 1: Kernel 1 + hipLaunchKernelGGL(kernelA, dim3(1024), dim3(1024), 0, streamA, d_dataA, arraySize); - // Stream 2: Kernel 2 - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, stream2, d_data2, 2); + // Stream 2: Kernel 2 + hipLaunchKernelGGL(kernelB, dim3(1024), dim3(1024), 0, streamB, d_dataB, arraySize); - // Record event after the GPU kernel in stream2 - hipEventRecord(event2, stream2); + // Stream 1: Device to Host 2 (after Kernel 1) + HIP_CHECK(hipMemcpyAsync(vectorA.data(), d_dataA, arraySize * sizeof(*vectorA.data()), hipMemcpyDeviceToHost, streamA)); - // Stream 2: Device to Host 2 (after event) - hipStreamWaitEvent(stream1, event2, 0); - hipMemcpyAsync(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost, stream1); + // Stream 2: Device to Host 2 (after Kernel 2) + HIP_CHECK(hipMemcpyAsync(vectorB.data(), d_dataB, arraySize * sizeof(*vectorB.data()), hipMemcpyDeviceToHost, streamB)); + } // Wait for all operations in both streams to complete - hipStreamSynchronize(stream1); - hipStreamSynchronize(stream2); + HIP_CHECK(hipStreamSynchronize(streamA)); + HIP_CHECK(hipStreamSynchronize(streamB)); // Cleanup - hipEventDestroy(event1); - hipEventDestroy(event2); - hipStreamDestroy(stream1); - hipStreamDestroy(stream2); - hipFree(d_data1); - hipFree(d_data2); - delete[] h_data1; - delete[] h_data2; + HIP_CHECK(hipStreamDestroy(streamA)); + HIP_CHECK(hipStreamDestroy(streamB)); + HIP_CHECK(hipFree(d_dataA)); + HIP_CHECK(hipFree(d_dataB)); std::cout << "Asynchronous execution with events completed successfully." << std::endl; return 0; } - .. tab-item:: sequential - - .. figure:: ../../data/how-to/hip_runtime_api/asynchronous/sequential.svg - :alt: Asynchronous concurrency with events - :align: center - + .. tab-item:: hipStreamWaitEvent .. code-block:: cpp #include + #include #include - // GPU Kernel - __global__ void kernel(int *data, int value) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - data[idx] = value; + #define HIP_CHECK(expression) \ + { \ + const hipError_t status = expression; \ + if(status != hipSuccess){ \ + std::cerr << "HIP error " \ + << status << ": " \ + << hipGetErrorString(status) \ + << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ } + // GPU Kernels + __global__ void kernelA(double* arrayA, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] *= 2.0;} + }; + __global__ void kernelB(double* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayB[x] += 3.0;} + }; + int main() { - constexpr int N = 1024; + constexpr int numOfBlocks = 256; + constexpr int threadsPerBlock = 4096; + size_t arraySize = 1U << 20; + + double *d_dataA; + double *d_dataB; + + std::vector vectorA(arraySize, 4.0); + std::vector vectorB(arraySize, 2.0); - int *h_data1, *h_data2, *d_data1, *d_data2; - h_data1 = new int[N]; - h_data2 = new int[N]; + // Allocate device memory + HIP_CHECK(hipMalloc(&d_dataA, arraySize * sizeof(*d_dataA))); + HIP_CHECK(hipMalloc(&d_dataB, arraySize * sizeof(*d_dataB))); + + // Create streams + hipStream_t streamA, streamB; + HIP_CHECK(hipStreamCreate(&streamA)); + HIP_CHECK(hipStreamCreate(&streamB)); + + // Create events + hipEvent_t event, eventA, eventB; + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipEventCreate(&eventA)); + HIP_CHECK(hipEventCreate(&eventB)); - // Initialize host data - for(int i = 0; i < N; ++i) + for(unsigned int iteration = 0; iteration < numberOfIterations; iteration++) { - h_data1[i] = i; - h_data2[i] = i * 2; - } + // Stream 1: Host to Device 1 + HIP_CHECK(hipMemcpyAsync(d_dataA, vectorA.data(), arraySize * sizeof(*d_dataA), hipMemcpyHostToDevice, streamA)); - // Set device flags to control the host thread's behavior (e.g., yielding) - hipSetDeviceFlags(hipDeviceScheduleYield); // This makes the host thread yield + // Stream 2: Host to Device 2 + HIP_CHECK(hipMemcpyAsync(d_dataB, vectorB.data(), arraySize * sizeof(*d_dataB), hipMemcpyHostToDevice, streamB)); - // Allocate device memory - hipMalloc(&d_data1, N * sizeof(*d_data1)); - hipMalloc(&d_data2, N * sizeof(*d_data2)); + // Stream 1: Kernel 1 + hipLaunchKernelGGL(kernelA, dim3(1024), dim3(1024), 0, streamA, d_dataA, arraySize); - // Host to Device copies - hipMemcpy(d_data1, h_data1, N * sizeof(*d_data1), hipMemcpyHostToDevice); - hipMemcpy(d_data2, h_data2, N * sizeof(*d_data2), hipMemcpyHostToDevice); + // Record event after the GPU kernel in Stream 1 + HIP_CHECK(hipEventRecord(event, streamA)); - // Launch the GPU kernels - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, 0, d_data1, 1); - hipLaunchKernelGGL(kernel, dim3(N/256), dim3(256), 0, 0, d_data2, 2); + // Stream 2: Wait for Event 1 before starting Kernel 2 + HIP_CHECK(hipStreamWaitEvent(streamB, event, 0)); - // Device to Host copies - hipMemcpy(h_data1, d_data1, N * sizeof(*h_data1), hipMemcpyDeviceToHost); - hipMemcpy(h_data2, d_data2, N * sizeof(*h_data2), hipMemcpyDeviceToHost); + // Stream 2: Kernel 2 + hipLaunchKernelGGL(kernelB, dim3(1024), dim3(1024), 0, streamB, d_dataB, arraySize); - // Wait for all operations to complete - hipDeviceSynchronize(); + // Stream 1: Device to Host 2 (after Kernel 1) + HIP_CHECK(hipMemcpyAsync(vectorA.data(), d_dataA, arraySize * sizeof(*vectorA.data()), hipMemcpyDeviceToHost, streamA)); + + // Stream 2: Device to Host 2 (after Kernel 2) + HIP_CHECK(hipMemcpyAsync(vectorB.data(), d_dataB, arraySize * sizeof(*vectorB.data()), hipMemcpyDeviceToHost, streamB)); + } + + // Wait for all operations in both streams to complete + HIP_CHECK(hipEventRecord(eventA, streamA)); + HIP_CHECK(hipEventRecord(eventB, streamB)); + HIP_CHECK(hipStreamWaitEvent(streamA, eventA, 0)); + HIP_CHECK(hipStreamWaitEvent(streamB, eventB, 0)); // Cleanup - hipFree(d_data1); - hipFree(d_data2); - delete[] h_data1; - delete[] h_data2; + HIP_CHECK(hipEventDestroy(event)); + HIP_CHECK(hipEventDestroy(eventA)); + HIP_CHECK(hipEventDestroy(eventB)); + HIP_CHECK(hipStreamDestroy(streamA)); + HIP_CHECK(hipStreamDestroy(streamB)); + HIP_CHECK(hipFree(d_dataA)); + HIP_CHECK(hipFree(d_dataB)); - std::cout << "Sequential execution completed successfully." << std::endl; + std::cout << "Asynchronous execution with events completed successfully." << std::endl; return 0; } @@ -423,196 +474,12 @@ Example HIP Graphs =============================================================================== -HIP Graphs provide a way to represent complex workflows as a series of -interconnected tasks. By creating and managing graphs, developers can optimize -dependent task execution. Graphs reduce the overhead associated with launching -individual kernels and memory operations, providing a high-level abstraction -for managing dependencies and synchronizing tasks. Examples include -representing a sequence of kernels and memory operations as a single graph. -Using graphs enhances performance and simplifies complex workflow management. -This technique is particularly useful for applications with intricate -dependencies and multiple execution stages. +HIP graphs offer an efficient alternative to the standard method of launching +GPU tasks via streams. Comprising nodes for operations and edges for +dependencies, HIP graphs reduce kernel launch overhead and provide a high-level +abstraction for managing dependencies and synchronization. By representing +sequences of kernels and memory operations as a single graph, they simplify +complex workflows and enhance performance, particularly for applications with +intricate dependencies and multiple execution stages. For more details, see the :ref:`how_to_HIP_graph` documentation. - -Example -------------------------------------------------------------------------------- - -This example demonstrates the use of HIP Graphs to manage asynchronous -concurrent execution of two kernels. It creates a graph with nodes for the -kernel executions and memory copies, which are then instantiated and launched -in two separate streams. This setup ensures efficient and concurrent execution, -leveraging the high-level abstraction of HIP Graphs to simplify the workflow -and improve performance. - -.. code-block:: cpp - - #include - #include - - __global__ void kernel(int *data, int value) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - data[idx] = value; - } - - int main() - { - constexpr int N = 1024; - - int *d_data1, *d_data2; - int h_data1[N], h_data2[N]; - - hipGraph_t graph; - hipGraphExec_t graphExec; - hipStream_t stream1, stream2; - hipGraphNode_t kernelNode1, kernelNode2, memcpyNode1, memcpyNode2; - hipKernelNodeParams kernelNodeParams1 = {0}; - hipKernelNodeParams kernelNodeParams2 = {0}; - hipMemcpy3DParms memcpyParams1 = {0}; - hipMemcpy3DParms memcpyParams2 = {0}; - - // Allocate device memory - hipMalloc(&d_data1, N * sizeof(*d_data1)); - hipMalloc(&d_data2, N * sizeof(*d_data2)); - - // Create streams - hipStreamCreate(&stream1); - hipStreamCreate(&stream2); - - // Create an empty graph - hipGraphCreate(&graph, 0); - - // Define kernel1 node parameters - void *kernelArgs1[] = {&d_data1, &N}; - kernelNodeParams1.func = reinterpret_cast(kernel); - kernelNodeParams1.gridDim = dim3(N / 256); - kernelNodeParams1.blockDim = dim3(256); - kernelNodeParams1.sharedMemBytes = 0; - kernelNodeParams1.kernelParams = kernelArgs1; - kernelNodeParams1.extra = nullptr; - - // Define kernel2 node parameters - void *kernelArgs2[] = {&d_data2, &N}; - kernelNodeParams2.func = reinterpret_cast(kernel); - kernelNodeParams2.gridDim = dim3(N / 256); - kernelNodeParams2.blockDim = dim3(256); - kernelNodeParams2.sharedMemBytes = 0; - kernelNodeParams2.kernelParams = kernelArgs2; - kernelNodeParams2.extra = nullptr; - - // Add kernel nodes to graph - hipGraphAddKernelNode(&kernelNode1, graph, nullptr, 0, &kernelNodeParams1); - hipGraphAddKernelNode(&kernelNode2, graph, nullptr, 0, &kernelNodeParams2); - - // Define memcpy node parameters for stream1 - memcpyParams1.srcArray = nullptr; - memcpyParams1.srcPos = make_hipPos(0, 0, 0); - memcpyParams1.dstArray = nullptr; - memcpyParams1.dstPos = make_hipPos(0, 0, 0); - memcpyParams1.extent = make_hipExtent(N * sizeof(*d_data1), 1, 1); - memcpyParams1.kind = hipMemcpyDeviceToHost; - memcpyParams1.srcPtr = make_hipPitchedPtr(d_data1, N * sizeof(*d_data1), N, 1); - memcpyParams1.dstPtr = make_hipPitchedPtr(h_data1, N * sizeof(*d_data1), N, 1); - - // Define memcpy node parameters for stream2 - memcpyParams2.srcArray = nullptr; - memcpyParams2.srcPos = make_hipPos(0, 0, 0); - memcpyParams2.dstArray = nullptr; - memcpyParams2.dstPos = make_hipPos(0, 0, 0); - memcpyParams2.extent = make_hipExtent(N * sizeof(*d_data2), 1, 1); - memcpyParams2.kind = hipMemcpyDeviceToHost; - memcpyParams2.srcPtr = make_hipPitchedPtr(d_data2, N * sizeof(*d_data2), N, 1); - memcpyParams2.dstPtr = make_hipPitchedPtr(h_data2, N * sizeof(*d_data2), N, 1); - - // Add memcpy nodes to graph - hipGraphAddMemcpyNode(&memcpyNode1, graph, &kernelNode1, 1, &memcpyParams1); - hipGraphAddMemcpyNode(&memcpyNode2, graph, &kernelNode2, 1, &memcpyParams2); - - // Instantiate the graph - hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0); - - // Launch the graph asynchronously in different streams - hipGraphLaunch(graphExec, stream1); - hipGraphLaunch(graphExec, stream2); - - // Wait for all operations in both streams to complete - hipStreamSynchronize(stream1); - hipStreamSynchronize(stream2); - - // Cleanup - hipGraphExecDestroy(graphExec); - hipGraphDestroy(graph); - hipStreamDestroy(stream1); - hipStreamDestroy(stream2); - hipFree(d_data1); - hipFree(d_data2); - - std::cout << "Graph executed with asynchronous concurrent execution." << std::endl; - - return 0; - } - -Best practices and performance optimization -=============================================================================== - -Achieving optimal performance in GPU-accelerated applications involves adhering -to best practices and continuously tuning the code to ensure efficient resource -utilization. - -Implementing best practices -------------------------------------------------------------------------------- - -Following best practices for managing asynchronous operations is crucial for -achieving optimal performance. Here are some key strategies to consider: - -- minimize synchronization overhead: Synchronize only when necessary to avoid - stalling the GPU and hindering parallelism. - -- leverage asynchronous operations: Use asynchronous memory transfers and - kernel launches to overlap computation and data transfer, maximizing resource - utilization. - -- balance workloads: Distribute tasks efficiently between the host and device - to ensure both are fully utilized. This can significantly enhance application - responsiveness and performance. - -- utilize multiple streams: Create and manage multiple streams to run commands - concurrently, reducing idle times and improving overall efficiency. - -By implementing these strategies, developers can significantly enhance -application responsiveness and overall performance. These best practices are -essential for effective asynchronous operation management and for fully -leveraging the capabilities of modern GPUs. - -Balancing and profiling -------------------------------------------------------------------------------- - -Profiling tools help identify bottlenecks by providing detailed insights into -the execution of GPU-accelerated applications. These tools allow developers to -visualize how computational tasks and memory operations are distributed across -different hardware resources. By analyzing these visualizations, developers can -pinpoint areas where the application may be spending excessive time, such as -during synchronization points or data transfers. - -Key profiling metrics include: - -- kernel execution time: Measuring the time spent executing each kernel helps - identify which kernels are taking longer than expected and may need - optimization. - -- memory transfer time: Assessing the duration of data transfers between the - host and device can highlight inefficiencies or bottlenecks in memory - operations. - -- stream utilization: Evaluating how streams are utilized can reveal whether - resources are being used effectively or if some streams are underutilized. - -- concurrency: Analyzing the overlap of computation and data transfers helps - identify opportunities to improve concurrency and reduce idle times. - -Using profiling tools, developers gain a comprehensive understanding of their -application's performance characteristics, making informed decisions about -where to focus optimization efforts. Regular profiling and adjustments ensure -that applications run at their best, maintaining high efficiency and -performance. \ No newline at end of file diff --git a/docs/how-to/performance_guidelines.rst b/docs/how-to/performance_guidelines.rst index bf74b63d16..26d74975fd 100644 --- a/docs/how-to/performance_guidelines.rst +++ b/docs/how-to/performance_guidelines.rst @@ -3,6 +3,8 @@ developers optimize the performance of HIP-capable GPU architectures. :keywords: AMD, ROCm, HIP, CUDA, performance, guidelines +.. _how_to_performance_guidelines: + ******************************************************************************* Performance guidelines ******************************************************************************* @@ -32,12 +34,14 @@ reveal and efficiently provide as much parallelism as possible. The parallelism can be performed at the application level, device level, and multiprocessor level. +.. _application_parallel_execution: + Application level -------------------------------------------------------------------------------- To enable parallel execution of the application across the host and devices, use -asynchronous calls and streams. Assign workloads based on efficiency: serial to -the host or parallel to the devices. +:ref:`asynchronous calls and streams `. Assign workloads +based on efficiency: serial to the host or parallel to the devices. For parallel workloads, when threads belonging to the same block need to synchronize to share data, use :cpp:func:`__syncthreads()` (see: diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 1c0adb05df..6ec4703be3 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -49,10 +49,10 @@ subtrees: - file: how-to/hip_runtime_api/memory_management/virtual_memory - file: how-to/hip_runtime_api/memory_management/stream_ordered_allocator - file: how-to/hip_runtime_api/error_handling - - file: how-to/hip_runtime_api/cooperative_groups + - file: how-to/hip_runtime_api/call_stack - file: how-to/hip_runtime_api/asynchronous - file: how-to/hip_runtime_api/hipgraph - - file: how-to/hip_runtime_api/call_stack + - file: how-to/hip_runtime_api/cooperative_groups - file: how-to/hip_runtime_api/multi_device - file: how-to/hip_runtime_api/opengl_interop - file: how-to/hip_runtime_api/external_interop