Skip to content

Commit

Permalink
UCT/CUDA: Update cuda_copy perf estimates for Grace-Hopper
Browse files (browse the repository at this point in the history)
  • Loading branch information
SeyedMir committed Sep 18, 2024
1 parent 27ab197 commit f6964ee
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 13 deletions.
1 change: 1 addition & 0 deletions config/ucx.conf
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,7 @@ UCX_REG_NONBLOCK_MEM_TYPES=host,cuda-managed
UCX_IB_ODP_MEM_TYPES=host,cuda-managed
UCX_IB_MLX5_DEVX_OBJECTS=
UCX_DISTANCE_BW=auto,sys:16500MBs
UCX_CUDA_COPY_BW=h2d:400GBs,d2h:400GBs,d2d:400GBs,other:10000MBs

[Fujitsu ARM]
CPU vendor=Fujitsu ARM
Expand Down
36 changes: 25 additions & 11 deletions src/uct/cuda/cuda_copy/cuda_copy_iface.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,14 +37,24 @@ static ucs_config_field_t uct_cuda_copy_iface_config_table[] = {
"Max number of cuda events. -1 is infinite",
ucs_offsetof(uct_cuda_copy_iface_config_t, max_cuda_events), UCS_CONFIG_TYPE_UINT},

{"BW", "10000MBs",
"Effective memory bandwidth",
ucs_offsetof(uct_cuda_copy_iface_config_t, bandwidth), UCS_CONFIG_TYPE_BW},
/* TODO: 1. Add separate keys for shared and dedicated bandwidth
2. Remove the "other" key (use pref_loc for managed memory) */
{"BW", "h2d:8300MBs,d2h:11660MBs,d2d:320GBs,other:10000MBs",
"Effective memory bandwidth", 0,
UCS_CONFIG_TYPE_KEY_VALUE(UCS_CONFIG_TYPE_BW,
{"h2d", "host to device bandwidth",
ucs_offsetof(uct_cuda_copy_iface_config_t, bw.h2d)},
{"d2h", "device to host bandwidth",
ucs_offsetof(uct_cuda_copy_iface_config_t, bw.d2h)},
{"d2d", "device to device bandwidth",
ucs_offsetof(uct_cuda_copy_iface_config_t, bw.d2d)},
{"other", "any other src-dest memory types bandwidth",
ucs_offsetof(uct_cuda_copy_iface_config_t, bw.other)},
{NULL})},

{NULL}
};


/* Forward declaration for the delete function */
static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t)(uct_iface_t*);

Expand Down Expand Up @@ -134,7 +144,7 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface,

iface_attr->latency = UCT_CUDA_COPY_IFACE_LATENCY;
iface_attr->bandwidth.dedicated = 0;
iface_attr->bandwidth.shared = iface->config.bandwidth;
iface_attr->bandwidth.shared = iface->config.bw.other;
iface_attr->overhead = UCT_CUDA_COPY_IFACE_OVERHEAD;
iface_attr->priority = 0;

Expand Down Expand Up @@ -407,16 +417,17 @@ uct_cuda_copy_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr)
perf_attr->bandwidth.dedicated = 0;
if ((src_mem_type == UCS_MEMORY_TYPE_HOST) &&
(dst_mem_type == UCS_MEMORY_TYPE_CUDA)) {
perf_attr->bandwidth.shared = (zcopy ? 8300.0 : 7900.0) * UCS_MBYTE;
perf_attr->bandwidth.shared = zcopy ? iface->config.bw.h2d :
iface->config.bw.h2d * 0.95;
} else if ((src_mem_type == UCS_MEMORY_TYPE_CUDA) &&
(dst_mem_type == UCS_MEMORY_TYPE_HOST)) {
perf_attr->bandwidth.shared = (zcopy ? 11660.0 : 9320.0) *
UCS_MBYTE;
perf_attr->bandwidth.shared = zcopy ? iface->config.bw.d2h :
iface->config.bw.d2h * 0.95;
} else if ((src_mem_type == UCS_MEMORY_TYPE_CUDA) &&
(dst_mem_type == UCS_MEMORY_TYPE_CUDA)) {
perf_attr->bandwidth.shared = 320.0 * UCS_GBYTE;
perf_attr->bandwidth.shared = iface->config.bw.d2d;
} else {
perf_attr->bandwidth.shared = iface->config.bandwidth;
perf_attr->bandwidth.shared = iface->config.bw.other;
}
}

Expand Down Expand Up @@ -491,7 +502,10 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work
self->id = ucs_generate_uuid((uintptr_t)self);
self->config.max_poll = config->max_poll;
self->config.max_cuda_events = config->max_cuda_events;
self->config.bandwidth = config->bandwidth;
self->config.bw.h2d = config->bw.h2d;
self->config.bw.d2h = config->bw.d2h;
self->config.bw.d2d = config->bw.d2d;
self->config.bw.other = config->bw.other;
UCS_STATIC_BITMAP_RESET_ALL(&self->streams_to_sync);

ucs_mpool_params_reset(&mp_params);
Expand Down
14 changes: 12 additions & 2 deletions src/uct/cuda/cuda_copy/cuda_copy_iface.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -69,7 +69,12 @@ typedef struct uct_cuda_copy_iface {
struct {
unsigned max_poll;
unsigned max_cuda_events;
double bandwidth;
struct {
double h2d;
double d2h;
double d2d;
double other;
} bw;
} config;
/* handler to support arm/wakeup feature */
struct {
Expand All @@ -87,7 +92,12 @@ typedef struct uct_cuda_copy_iface_config {
uct_iface_config_t super;
unsigned max_poll;
unsigned max_cuda_events;
double bandwidth;
struct {
double h2d;
double d2h;
double d2d;
double other;
} bw;
} uct_cuda_copy_iface_config_t;


Expand Down

0 comments on commit f6964ee

Please sign in to comment.