Skip to content

Commit

Permalink
Merge pull request #12982 from edgargabriel/topic/rocm-large-bar-check
Browse files Browse the repository at this point in the history
accelerator/rocm: add large BAR check
  • Loading branch information
edgargabriel authored Dec 16, 2024
2 parents 28c2e47 + 4df81a1 commit 95a8c39
Showing 1 changed file with 71 additions and 17 deletions.
88 changes: 71 additions & 17 deletions opal/mca/accelerator/rocm/accelerator_rocm_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ static int accelerator_rocm_component_register(void)

int opal_accelerator_rocm_lazy_init()
{
hipError_t hip_err;
int err = OPAL_SUCCESS;

/* Double checked locking to avoid having to
Expand All @@ -196,41 +197,94 @@ int opal_accelerator_rocm_lazy_init()
goto out;
}

err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices);
if (hipSuccess != err) {
hip_err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices);
if (hipSuccess != hip_err) {
opal_output(0, "Failed to query device count, err=%d %s\n",
err, hipGetErrorString(err));
err = OPAL_ERROR;
hip_err, hipGetErrorString(hip_err));
err = OPAL_ERROR;
goto out;
}

hipStream_t memcpy_stream;
err = hipStreamCreate(&memcpy_stream);
if (hipSuccess != err) {
hip_err = hipStreamCreate(&memcpy_stream);
if (hipSuccess != hip_err) {
opal_output(0, "Could not create hipStream, err=%d %s\n",
err, hipGetErrorString(err));
err = OPAL_ERROR; // we got hipErrorInvalidValue, pretty bad
hip_err, hipGetErrorString(hip_err));
err = OPAL_ERROR; // we got hipErrorInvalidValue, pretty bad
goto out;
}

opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t));
if (NULL == opal_accelerator_rocm_MemcpyStream) {
opal_output(0, "Could not allocate hipStream\n");
err = OPAL_ERR_OUT_OF_RESOURCE;
goto out;
}
*opal_accelerator_rocm_MemcpyStream = memcpy_stream;

opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices);
if (NULL == opal_accelerator_rocm_mem_bw) {
opal_output(0, "Could not allocate memory_bw array\n");
err = OPAL_ERR_OUT_OF_RESOURCE;
goto out;
}

for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) {
int mem_clock_rate; // kHz
err = hipDeviceGetAttribute(&mem_clock_rate,
hipDeviceAttributeMemoryClockRate,
i);
hip_err = hipDeviceGetAttribute(&mem_clock_rate,
hipDeviceAttributeMemoryClockRate,
i);
if (hipSuccess != hip_err) {
opal_output(0, "Failed to query device MemoryClockRate, err=%d %s\n",
hip_err, hipGetErrorString(hip_err));
err = OPAL_ERROR;
goto out;
}

int bus_width; // bit
err = hipDeviceGetAttribute(&bus_width,
hipDeviceAttributeMemoryBusWidth,
i);
/* bw = clock_rate * bus width * 2bit multiplier
* See https://forums.developer.nvidia.com/t/memory-clock-rate/107940
*/
hip_err = hipDeviceGetAttribute(&bus_width,
hipDeviceAttributeMemoryBusWidth,
i);
if (hipSuccess != hip_err) {
opal_output(0, "Failed to query device MemoryBusWidth, err=%d %s\n",
hip_err, hipGetErrorString(hip_err));
err = OPAL_ERROR;
goto out;
}

/* bw = clock_rate * bus width * 2bit multiplier */
float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8;
opal_accelerator_rocm_mem_bw[i] = bw;
}

#if HIP_VERSION >= 60000000
int dev_id;
hip_err = hipGetDevice(&dev_id);
if (hipSuccess != hip_err) {
opal_output(0, "error retrieving current device");
err = OPAL_ERROR;
goto out;
}

int has_large_bar = 0;
hip_err = hipDeviceGetAttribute (&has_large_bar, hipDeviceAttributeIsLargeBar,
dev_id);
if (hipSuccess != hip_err) {
opal_output(0, "error retrieving current device");
err = OPAL_ERROR;
goto out;
}

if (0 == has_large_bar) {
// Without large BAR we have to use hipMemcpy(Async) for all data transfers
opal_output(0, "Large BAR support is not enabled on current device. "
"Enable large BAR support in BIOS (Above 4G Encoding) for "
"better performance\n.");
opal_accelerator_rocm_memcpyH2D_limit = 0;
opal_accelerator_rocm_memcpyD2H_limit = 0;
}
#endif

err = OPAL_SUCCESS;
opal_atomic_wmb();
accelerator_rocm_init_complete = true;
Expand Down

0 comments on commit 95a8c39

Please sign in to comment.