diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index cf1c6c058b8..8e11f8ea7ee 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -181,6 +181,7 @@ static int accelerator_rocm_component_register(void) int opal_accelerator_rocm_lazy_init() { + hipError_t hip_err; int err = OPAL_SUCCESS; /* Double checked locking to avoid having to @@ -196,41 +197,94 @@ int opal_accelerator_rocm_lazy_init() goto out; } - err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices); - if (hipSuccess != err) { + hip_err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices); + if (hipSuccess != hip_err) { opal_output(0, "Failed to query device count, err=%d %s\n", - err, hipGetErrorString(err)); - err = OPAL_ERROR; + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; goto out; } hipStream_t memcpy_stream; - err = hipStreamCreate(&memcpy_stream); - if (hipSuccess != err) { + hip_err = hipStreamCreate(&memcpy_stream); + if (hipSuccess != hip_err) { opal_output(0, "Could not create hipStream, err=%d %s\n", - err, hipGetErrorString(err)); - err = OPAL_ERROR; // we got hipErrorInvalidValue, pretty bad + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; // we got hipErrorInvalidValue, pretty bad goto out; } + opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t)); + if (NULL == opal_accelerator_rocm_MemcpyStream) { + opal_output(0, "Could not allocate hipStream\n"); + err = OPAL_ERR_OUT_OF_RESOURCE; + goto out; + } *opal_accelerator_rocm_MemcpyStream = memcpy_stream; opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices); + if (NULL == opal_accelerator_rocm_mem_bw) { + opal_output(0, "Could not allocate memory_bw array\n"); + err = OPAL_ERR_OUT_OF_RESOURCE; + goto out; + } + for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) { int mem_clock_rate; // kHz - err = hipDeviceGetAttribute(&mem_clock_rate, - hipDeviceAttributeMemoryClockRate, - i); + hip_err = hipDeviceGetAttribute(&mem_clock_rate, + hipDeviceAttributeMemoryClockRate, + i); + if (hipSuccess != hip_err) { + opal_output(0, "Failed to query device MemoryClockRate, err=%d %s\n", + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; + goto out; + } + int bus_width; // bit - err = hipDeviceGetAttribute(&bus_width, - hipDeviceAttributeMemoryBusWidth, - i); - /* bw = clock_rate * bus width * 2bit multiplier - * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 - */ + hip_err = hipDeviceGetAttribute(&bus_width, + hipDeviceAttributeMemoryBusWidth, + i); + if (hipSuccess != hip_err) { + opal_output(0, "Failed to query device MemoryBusWidth, err=%d %s\n", + hip_err, hipGetErrorString(hip_err)); + err = OPAL_ERROR; + goto out; + } + + /* bw = clock_rate * bus width * 2bit multiplier */ float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; opal_accelerator_rocm_mem_bw[i] = bw; } + +#if HIP_VERSION >= 60000000 + int dev_id; + hip_err = hipGetDevice(&dev_id); + if (hipSuccess != hip_err) { + opal_output(0, "error retrieving current device"); + err = OPAL_ERROR; + goto out; + } + + int has_large_bar = 0; + hip_err = hipDeviceGetAttribute (&has_large_bar, hipDeviceAttributeIsLargeBar, + dev_id); + if (hipSuccess != hip_err) { + opal_output(0, "error retrieving current device"); + err = OPAL_ERROR; + goto out; + } + + if (0 == has_large_bar) { + // Without large BAR we have to use hipMemcpy(Async) for all data transfers + opal_output(0, "Large BAR support is not enabled on current device. " + "Enable large BAR support in BIOS (Above 4G Encoding) for " + "better performance\n."); + opal_accelerator_rocm_memcpyH2D_limit = 0; + opal_accelerator_rocm_memcpyD2H_limit = 0; + } +#endif + err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true;