Skip to content

Commit

Permalink
rdma: add MR cache to RDMA protocol
Browse files Browse the repository at this point in the history
This commit makes the RDMA protocol use the MR cache for external memory
registrations. Internal memory registrations (bounce buffers and ctrl messages)
do not use an MR cache, as they never try to register the same memory region
twice.
This commit also adds the OFI_NCCL_MR_CACHE_DISABLE environment variable to
disable the MR cache and always register the memory with the device when
regMR() is called.

Signed-off-by: Amedeo Sapio <[email protected]>
  • Loading branch information
AmedeoSapio committed Jul 9, 2024
1 parent a11b55b commit 5b7ab56
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 71 deletions.
26 changes: 20 additions & 6 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ extern "C" {
#include "nccl_ofi_log.h"
#include "nccl_ofi_topo.h"
#include "nccl_ofi_idpool.h"
#include "nccl_ofi_mr.h"


/*
Expand Down Expand Up @@ -71,6 +72,9 @@ extern "C" {
/* Flush read size (bytes) */
#define NCCL_OFI_FLUSH_SIZE 4

/* Initial number of entries in the MR cache of a device */
#define NCCL_OFI_MR_CACHE_INIT_SIZE 128

/* Indicates if GPUDirect is supported by libfabric provider */
enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED};
extern enum gdr_support_level_t support_gdr;
Expand Down Expand Up @@ -247,12 +251,22 @@ struct nccl_net_ofi_device {
/* this device's index in the plugin's devices array */
int dev_id;

/* name of the device - should include the provider name, but
may be augmented (in the case of mrail). Set during the
transport's initialization, and should be read-only from
that point. */
/*
* name of the device - should include the provider name, but may be
* augmented (in the case of mrail). Set during the transport's
* initialization, and should be read-only from that point.
*/
char *name;

/*
* Protocol-agnostic MR cache for this device. Note that registrations
* are tied to domains in libfabric, but we do not have a
* domain-specific object today, so the cache is stashed in the device
* itself. This should change if we were to break up nccl_net_ofi_device
* into separate device and domain objects.
*/
nccl_ofi_mr_cache_t *mr_cache;

int (*get_properties)(nccl_net_ofi_device_t *base_dev,
nccl_ofi_properties_t *props);

Expand All @@ -261,8 +275,8 @@ struct nccl_net_ofi_device {
* nccl_ofi_device. Create if it does not exist. Store
* in pthread key. Increase reference counter. Must be
* protected by lock stored in device.
*
* During the plugin initialization, this function will be
*
* During the plugin initialization, this function will be
* called once per process using one of the instantiated device structs
* to create and configure the endpoint of the initializing thread.
*/
Expand Down
12 changes: 12 additions & 0 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,18 @@ OFI_NCCL_PARAM_INT(cuda_flush_enable, "CUDA_FLUSH_ENABLE", 0);
*/
OFI_NCCL_PARAM_INT(mr_key_size, "MR_KEY_SIZE", 2);

/*
* Disable the MR cache. The MR cache is used to keep track of registered
* memory regions, so that calling regMr() on the same buffer (address and
* size) will quickly return the MR that was previously registered for that
* buffer, avoiding redundant (and expensive) registrations with the
* underlying device.
* Disabling the MR cache will make all calls to regMr() result in a
* registration with the device, so it may cause a significant performance
* degradation.
*/
OFI_NCCL_PARAM_INT(mr_cache_disable, "MR_CACHE_DISABLE", 0);

/*
* Maximum number of cq entries to read in a single call to
* fi_cq_read.
Expand Down
23 changes: 21 additions & 2 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -630,17 +630,32 @@ int nccl_net_ofi_plugin_fini(nccl_net_ofi_plugin_t *plugin)
/*
 * Initialize the transport-independent fields of a device structure.
 *
 * Records the owning plugin and the device index, stores a private copy
 * of the device name, installs nccl_net_ofi_device_fini() as the release
 * callback and, unless ofi_nccl_mr_cache_disable() returns non-zero,
 * creates the per-device MR cache.
 *
 * Returns 0 on success, or -ENOMEM if the device name or the MR cache
 * cannot be allocated. On failure, device->name and device->mr_cache are
 * both left NULL, so nothing allocated here is leaked.
 */
int nccl_net_ofi_device_init(nccl_net_ofi_device_t *device, nccl_net_ofi_plugin_t *plugin,
			     int device_index, const char *device_name)
{
	int ret = 0;

	device->plugin = plugin;
	device->dev_id = device_index;

	/*
	 * Put the cache pointer in a known state before anything can fail,
	 * so no error path leaves it uninitialized.
	 */
	device->mr_cache = NULL;

	device->name = strdup(device_name);
	if (device->name == NULL) {
		NCCL_OFI_WARN("Unable to allocate device name");
		ret = -ENOMEM;
		goto exit;
	}

	device->release = nccl_net_ofi_device_fini;

	/* The MR cache is optional: it is skipped when disabled by the user */
	if (!ofi_nccl_mr_cache_disable()) {
		device->mr_cache =
			nccl_ofi_mr_cache_init(NCCL_OFI_MR_CACHE_INIT_SIZE,
					       system_page_size);
		if (device->mr_cache == NULL) {
			/* Do not leak the name copy made above */
			free(device->name);
			device->name = NULL;
			ret = -ENOMEM;
			goto exit;
		}
	}

exit:
	return ret;
}


Expand All @@ -650,6 +665,10 @@ int nccl_net_ofi_device_fini(nccl_net_ofi_device_t *device)
return 0;
}

if (device->mr_cache != NULL) {
nccl_ofi_mr_cache_finalize(device->mr_cache);
}

if (device->name != NULL) {
free(device->name);
}
Expand Down
Loading

0 comments on commit 5b7ab56

Please sign in to comment.