From 4df81a1c6d2e86de3411cfd21f868ee1bebb0715 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Mon, 16 Dec 2024 19:44:04 +0000 Subject: [PATCH] accelerator/rocm: add large BAR check check whether the device has large BAR support enabled. If not, set the rocm_copy_D2H and H2D thresholds to 0, i.e. use hipMemcpy(Async) for all data transfers. Data center (Instinct) devices usually have large BAR enabled, for gaming PCs its not always set by the user. The PR also does a little bit of cleanup in the error handling of the lazy_init() routine. Signed-off-by: Edgar Gabriel --- .../rocm/accelerator_rocm_component.c | 88 +++++++++++++++---- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index cf1c6c058b8..8e11f8ea7ee 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -181,6 +181,7 @@ static int accelerator_rocm_component_register(void) int opal_accelerator_rocm_lazy_init() { + hipError_t hip_err; int err = OPAL_SUCCESS; /* Double checked locking to avoid having to @@ -196,41 +197,94 @@ int opal_accelerator_rocm_lazy_init() goto out; } - err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices); - if (hipSuccess != err) { + hip_err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices); + if (hipSuccess != hip_err) { opal_output(0, "Failed to query device count, err=%d %s\n", - err, hipGetErrorString(err)); - err = OPAL_ERROR; + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; goto out; } hipStream_t memcpy_stream; - err = hipStreamCreate(&memcpy_stream); - if (hipSuccess != err) { + hip_err = hipStreamCreate(&memcpy_stream); + if (hipSuccess != hip_err) { opal_output(0, "Could not create hipStream, err=%d %s\n", - err, hipGetErrorString(err)); - err = OPAL_ERROR; // we got hipErrorInvalidValue, pretty bad + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; // we got hipErrorInvalidValue, pretty bad goto out; } + opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t)); + if (NULL == opal_accelerator_rocm_MemcpyStream) { + opal_output(0, "Could not allocate hipStream\n"); + err = OPAL_ERR_OUT_OF_RESOURCE; + goto out; + } *opal_accelerator_rocm_MemcpyStream = memcpy_stream; opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices); + if (NULL == opal_accelerator_rocm_mem_bw) { + opal_output(0, "Could not allocate memory_bw array\n"); + err = OPAL_ERR_OUT_OF_RESOURCE; + goto out; + } + for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) { int mem_clock_rate; // kHz - err = hipDeviceGetAttribute(&mem_clock_rate, - hipDeviceAttributeMemoryClockRate, - i); + hip_err = hipDeviceGetAttribute(&mem_clock_rate, + hipDeviceAttributeMemoryClockRate, + i); + if (hipSuccess != hip_err) { + opal_output(0, "Failed to query device MemoryClockRate, err=%d %s\n", + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; + goto out; + } + int bus_width; // bit - err = hipDeviceGetAttribute(&bus_width, - hipDeviceAttributeMemoryBusWidth, - i); - /* bw = clock_rate * bus width * 2bit multiplier - * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 - */ + hip_err = hipDeviceGetAttribute(&bus_width, + hipDeviceAttributeMemoryBusWidth, + i); + if (hipSuccess != hip_err) { + opal_output(0, "Failed to query device MemoryBusWidth, err=%d %s\n", + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; + goto out; + } + + /* bw = clock_rate * bus width * 2bit multiplier */ float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; opal_accelerator_rocm_mem_bw[i] = bw; } + +#if HIP_VERSION >= 60000000 + int dev_id; + hip_err = hipGetDevice(&dev_id); + if (hipSuccess != hip_err) { + opal_output(0, "error retrieving current device"); + err = OPAL_ERROR; + goto out; + } + + int has_large_bar = 0; + hip_err = hipDeviceGetAttribute (&has_large_bar, hipDeviceAttributeIsLargeBar, + dev_id); + if (hipSuccess != hip_err) { + opal_output(0, "error retrieving current device"); + err = OPAL_ERROR; + goto out; + } + + if (0 == has_large_bar) { + // Without large BAR we have to use hipMemcpy(Async) for all data transfers + opal_output(0, "Large BAR support is not enabled on current device. " + "Enable large BAR support in BIOS (Above 4G Encoding) for " + "better performance\n."); + opal_accelerator_rocm_memcpyH2D_limit = 0; + opal_accelerator_rocm_memcpyD2H_limit = 0; + } +#endif + err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true;