UCX installation done with OFED doesn't recognize cuda, cuda_copy etc. #9950

RamHPC · 2024-06-12T00:33:24Z

Describe the bug

I installed ucx-1.16 and everything was working fine; the devices/transports recognized were in line with expectations. I then installed OFED (MLNX_OFED_LINUX-24.04-0.6.6.0-rhel8.9-x86_64), which automatically installed UCX version 1.17. This version doesn't show cuda, cuda_copy and gdr_copy as devices/transports.

Steps to Reproduce

Command line
UCX version used (from github branch XX or release YY) + UCX configure flags (can be checked by ucx_info -v)

$ ucx_info -v
# Library version: 1.17.0
# Library path: /usr/lib64/libucs.so.0
# API headers version: 1.17.0
# Git branch '', revision 7bb2722
# Configured with: --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-optimizations --disable-logging --disable-debug --disable-assertions --enable-mt --disable-params-check --without-go --without-java --enable-cma --with-cuda --with-gdrcopy --with-verbs --with-knem --with-rdmacm --without-rocm --with-xpmem --without-fuse3 --without-ugni --without-mad --without-ze --with-cuda=/usr/local/cuda-12.4

$ ucx_info -b
#define UCX_CONFIG_H              
#define ENABLE_BUILTIN_MEMCPY     1
#define ENABLE_DEBUG_DATA         0
#define ENABLE_MT                 1
#define ENABLE_PARAMS_CHECK       0
#define HAVE_1_ARG_BFD_SECTION_SIZE 0
#define HAVE_ALLOCA               1
#define HAVE_ALLOCA_H             1
#define HAVE_ATTRIBUTE_NOOPTIMIZE 1
#define HAVE_CLEARENV             1
#define HAVE_CPLUS_DEMANGLE       1
#define HAVE_CPU_SET_T            1
#define HAVE_CUDA                 1
#define HAVE_CUDA_H               1
#define HAVE_CUDA_RUNTIME_H       1
#define HAVE_DC_DV                1
#define HAVE_DECL_ASPRINTF        1
#define HAVE_DECL_BASENAME        1
#define HAVE_DECL_BFD_GET_SECTION_FLAGS 1
#define HAVE_DECL_BFD_GET_SECTION_VMA 1
#define HAVE_DECL_BFD_SECTION_FLAGS 0
#define HAVE_DECL_BFD_SECTION_VMA 1
#define HAVE_DECL_CPU_ISSET       1
#define HAVE_DECL_CPU_ZERO        1
#define HAVE_DECL_ETHTOOL_CMD_SPEED 1
#define HAVE_DECL_FMEMOPEN        1
#define HAVE_DECL_F_SETOWN_EX     1
#define HAVE_DECL_GDR_COPY_TO_MAPPING 1
#define HAVE_DECL_GETAUXVAL       1
#define HAVE_DECL_IBV_ACCESS_ON_DEMAND 1
#define HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING 1
#define HAVE_DECL_IBV_ADVISE_MR   1
#define HAVE_DECL_IBV_ALLOC_DM    1
#define HAVE_DECL_IBV_ALLOC_TD    1
#define HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN 1
#define HAVE_DECL_IBV_CREATE_CQ_EX 1
#define HAVE_DECL_IBV_CREATE_QP_EX 1
#define HAVE_DECL_IBV_CREATE_SRQ  1
#define HAVE_DECL_IBV_CREATE_SRQ_EX 1
#define HAVE_DECL_IBV_EVENT_GID_CHANGE 1
#define HAVE_DECL_IBV_EVENT_TYPE_STR 1
#define HAVE_DECL_IBV_GET_ASYNC_EVENT 1
#define HAVE_DECL_IBV_GET_DEVICE_NAME 1
#define HAVE_DECL_IBV_LINK_LAYER_ETHERNET 1
#define HAVE_DECL_IBV_LINK_LAYER_INFINIBAND 1
#define HAVE_DECL_IBV_QPF_GRH_REQUIRED 1
#define HAVE_DECL_IBV_QUERY_DEVICE_EX 1
#define HAVE_DECL_IBV_QUERY_GID   1
#define HAVE_DECL_IBV_REG_DMABUF_MR 1
#define HAVE_DECL_IBV_SET_ECE     1
#define HAVE_DECL_IBV_TRANSPORT_UNSPECIFIED 1
#define HAVE_DECL_IBV_TRANSPORT_USNIC 1
#define HAVE_DECL_IBV_TRANSPORT_USNIC_UDP 1
#define HAVE_DECL_IBV_WC_STATUS_STR 1
#define HAVE_DECL_INOTIFY_ADD_WATCH 1
#define HAVE_DECL_INOTIFY_INIT    1
#define HAVE_DECL_IN_ATTRIB       1
#define HAVE_DECL_IPPROTO_TCP     1
#define HAVE_DECL_MADV_FREE       1
#define HAVE_DECL_MADV_REMOVE     1
#define HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE 1
#define HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE 1
#define HAVE_DECL_MLX5DV_CREATE_QP 1
#define HAVE_DECL_MLX5DV_DCTYPE_DCT 1
#define HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT 1
#define HAVE_DECL_MLX5DV_DEVX_UMEM_REG_EX 1
#define HAVE_DECL_MLX5DV_INIT_OBJ 1
#define HAVE_DECL_MLX5DV_IS_SUPPORTED 1
#define HAVE_DECL_MLX5DV_OBJ_AH   1
#define HAVE_DECL_MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE 1
#define HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_BF 1
#define HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC 1
#define HAVE_DECL_POSIX_MADV_DONTNEED 1
#define HAVE_DECL_PR_SET_PTRACER  1
#define HAVE_DECL_SOL_SOCKET      1
#define HAVE_DECL_SO_KEEPALIVE    1
#define HAVE_DECL_SPEED_UNKNOWN   1
#define HAVE_DECL_STRERROR_R      1
#define HAVE_DECL_SYS_BRK         1
#define HAVE_DECL_SYS_IPC         0
#define HAVE_DECL_SYS_MADVISE     1
#define HAVE_DECL_SYS_MMAP        1
#define HAVE_DECL_SYS_MREMAP      1
#define HAVE_DECL_SYS_MUNMAP      1
#define HAVE_DECL_SYS_SHMAT       1
#define HAVE_DECL_SYS_SHMDT       1
#define HAVE_DECL_TCP_KEEPCNT     1
#define HAVE_DECL_TCP_KEEPIDLE    1
#define HAVE_DECL_TCP_KEEPINTVL   1
#define HAVE_DECL___PPC_GET_TIMEBASE 0
#define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0
#define HAVE_DETAILED_BACKTRACE   1
#define HAVE_DEVX                 1
#define HAVE_DLFCN_H              1
#define HAVE_GDRAPI_H             1
#define HAVE_HW_TIMER             1
#define HAVE_IB                   1
#define HAVE_IBV_DM               1
#define HAVE_IN6_ADDR_S6_ADDR32   1
#define HAVE_INFINIBAND_MLX5DV_H  1
#define HAVE_INOTIFY              1
#define HAVE_INTTYPES_H           1
#define HAVE_IP_IP_DST            1
#define HAVE_LIBGEN_H             1
#define HAVE_LIBRT                1
#define HAVE_LINUX_FUTEX_H        1
#define HAVE_LINUX_IP_H           1
#define HAVE_LINUX_MMAN_H         1
#define HAVE_MALLOC_H             1
#define HAVE_MALLOC_HOOK          1
#define HAVE_MALLOC_TRIM          1
#define HAVE_MEMALIGN             1
#define HAVE_MEMORY_H             1
#define HAVE_MLX5_DV              1
#define HAVE_MLX5_HW_UD           1
#define HAVE_MREMAP               1
#define HAVE_NETINET_IP_H         1
#define HAVE_NET_ETHERNET_H       1
#define HAVE_NVML_H               1
#define HAVE_POSIX_MEMALIGN       1
#define HAVE_PREFETCH             1
#define HAVE_SCHED_GETAFFINITY    1
#define HAVE_SCHED_SETAFFINITY    1
#define HAVE_SIGACTION_SA_RESTORER 1
#define HAVE_SIGEVENT_SIGEV_UN_TID 1
#define HAVE_SIGHANDLER_T         1
#define HAVE_STDINT_H             1
#define HAVE_STDLIB_H             1
#define HAVE_STRERROR_R           1
#define HAVE_STRINGS_H            1
#define HAVE_STRING_H             1
#define HAVE_STRUCT_DL_PHDR_INFO  1
#define HAVE_STRUCT_IBV_DEVICE_ATTR_EX_ODP_CAPS 1
#define HAVE_STRUCT_IBV_DEVICE_ATTR_EX_PCI_ATOMIC_CAPS 1
#define HAVE_STRUCT_IBV_TM_CAPS_FLAGS 1
#define HAVE_STRUCT_MLX5DV_CQ_CQ_UAR 1
#define HAVE_SYS_EPOLL_H          1
#define HAVE_SYS_EVENTFD_H        1
#define HAVE_SYS_STAT_H           1
#define HAVE_SYS_TYPES_H          1
#define HAVE_SYS_UIO_H            1
#define HAVE_TL_DC                1
#define HAVE_TL_RC                1
#define HAVE_TL_UD                1
#define HAVE_UCM_PTMALLOC286      1
#define HAVE_UNISTD_H             1
#define HAVE___CLEAR_CACHE        1
#define HAVE___CURBRK             1
#define HAVE___SIGHANDLER_T       1
#define IBV_HW_TM                 1
#define LT_OBJDIR                 ".libs/"
#define NVALGRIND                 1
#define PACKAGE                   "ucx"
#define PACKAGE_BUGREPORT         ""
#define PACKAGE_NAME              "ucx"
#define PACKAGE_STRING            "ucx 1.17"
#define PACKAGE_TARNAME           "ucx"
#define PACKAGE_URL               ""
#define PACKAGE_VERSION           "1.17"
#define STDC_HEADERS              1
#define STRERROR_R_CHAR_P         1
#define UCM_BISTRO_HOOKS          1
#define UCS_MAX_LOG_LEVEL         UCS_LOG_LEVEL_DEBUG
#define UCT_TCP_EP_KEEPALIVE      1
#define UCT_UD_EP_DEBUG_HOOKS     0
#define UCX_CONFIGURE_FLAGS       "--build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-optimizations --disable-logging --disable-debug --disable-assertions --enable-mt --disable-params-check --without-go --without-java --enable-cma --with-cuda --with-gdrcopy --with-verbs --with-knem --with-rdmacm --without-rocm --with-xpmem --without-fuse3 --without-ugni --without-mad --without-ze --with-cuda=/usr/local/cuda-12.4"
#define UCX_MODULE_SUBDIR         "ucx"
#define VERSION                   "1.17"
#define restrict                  __restrict
#define test_MODULES              ":module"
#define ucm_MODULES               ":cuda"
#define ucs_MODULES               ""
#define uct_MODULES               ":cuda:ib:rdmacm:cma:knem:xpmem"
#define uct_cuda_MODULES          ":gdrcopy"
#define uct_ib_MODULES            ""
#define uct_rocm_MODULES          ""
#define ucx_perftest_MODULES      ":cuda"

$ ucx_info -d
#
# Memory domain: self
#     Component: self
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#           rkey_ptr is supported
#         memory types: host (access,reg_nonblock,reg,cache)
#
#      Transport: self
#         Device: memory
#           Type: loopback
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 19360.00 MB/sec
#              latency: 0 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 8K
#             am_bcopy: <= 8K
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: tcp
#     Component: tcp
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#         memory types: host (access,reg_nonblock,reg,cache)
#
#      Transport: tcp
#         Device: ens21f0
#           Type: network
#  System device: ens21f0 (0)
#
#      capabilities:
#            bandwidth: 113.16/ppn + 0.00 MB/sec
#              latency: 5776 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 0
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: ens21f1
#           Type: network
#  System device: ens21f1 (1)
#
#      capabilities:
#            bandwidth: 113.16/ppn + 0.00 MB/sec
#              latency: 5776 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 0
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: ib0
#           Type: network
#  System device: ib0 (2)
#
#      capabilities:
#            bandwidth: 2200.00/ppn + 0.00 MB/sec
#              latency: 5203 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 0
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: ib1
#           Type: network
#  System device: ib1 (3)
#
#      capabilities:
#            bandwidth: 2200.00/ppn + 0.00 MB/sec
#              latency: 5203 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 0
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: lo
#           Type: network
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 11.91/ppn + 0.00 MB/sec
#              latency: 10960 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 18 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#
# Connection manager: tcp
#      max_conn_priv: 2064 bytes
#
# Memory domain: sysv
#     Component: sysv
#             allocate: unlimited
#           remote key: 12 bytes
#           rkey_ptr is supported
#         memory types: host (access,alloc,cache)
#
#      Transport: sysv
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 15360.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: posix
#     Component: posix
#             allocate: <= 263740988K
#           remote key: 24 bytes
#           rkey_ptr is supported
#         memory types: host (access,alloc,cache)
#
#      Transport: posix
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 15360.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: mlx5_0
#     Component: ib
#             register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#           memory invalidation is supported
#         memory types: host (access,reg,cache)
#
#      Transport: dc_mlx5
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (2)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 660 nsec
#             overhead: 40 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 11 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 11 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 138
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 7 bytes
#       error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
#      Transport: rc_verbs
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (2)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 600 + 1.000 * N nsec
#             overhead: 75 nsec
#            put_short: <= 124
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 5 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 5 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 123
#             am_bcopy: <= 8255
#             am_zcopy: <= 8255, up to 4 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 127
#               domain: device
#           atomic_add: 64 bit
#          atomic_fadd: 64 bit
#         atomic_cswap: 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 3 bytes
#           ep address: 7 bytes
#       error handling: peer failure, ep_check
#
#
#      Transport: rc_mlx5
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (2)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 600 + 1.000 * N nsec
#             overhead: 40 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 14 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 14 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 186
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 3 bytes
#           ep address: 10 bytes
#       error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
#      Transport: ud_verbs
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (2)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 630 nsec
#             overhead: 105 nsec
#             am_short: <= 116
#             am_bcopy: <= 4088
#             am_zcopy: <= 4088, up to 5 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 3992
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure, ep_check
#
#
#      Transport: ud_mlx5
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (2)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 630 nsec
#             overhead: 80 nsec
#             am_short: <= 180
#             am_bcopy: <= 4088
#             am_zcopy: <= 4088, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 132
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure, ep_check
#
#
# Memory domain: mlx5_1
#     Component: ib
#             register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#           memory invalidation is supported
#         memory types: host (access,reg,cache)
#
#      Transport: dc_mlx5
#         Device: mlx5_1:1
#           Type: network
#  System device: mlx5_1 (3)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 660 nsec
#             overhead: 40 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 11 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 11 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 138
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 7 bytes
#       error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
#      Transport: rc_verbs
#         Device: mlx5_1:1
#           Type: network
#  System device: mlx5_1 (3)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 600 + 1.000 * N nsec
#             overhead: 75 nsec
#            put_short: <= 124
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 5 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 5 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 123
#             am_bcopy: <= 8255
#             am_zcopy: <= 8255, up to 4 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 127
#               domain: device
#           atomic_add: 64 bit
#          atomic_fadd: 64 bit
#         atomic_cswap: 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 3 bytes
#           ep address: 7 bytes
#       error handling: peer failure, ep_check
#
#
#      Transport: rc_mlx5
#         Device: mlx5_1:1
#           Type: network
#  System device: mlx5_1 (3)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 600 + 1.000 * N nsec
#             overhead: 40 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 14 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 14 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 186
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 3 bytes
#           ep address: 10 bytes
#       error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
#      Transport: ud_verbs
#         Device: mlx5_1:1
#           Type: network
#  System device: mlx5_1 (3)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 630 nsec
#             overhead: 105 nsec
#             am_short: <= 116
#             am_bcopy: <= 4088
#             am_zcopy: <= 4088, up to 5 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 3992
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure, ep_check
#
#
#      Transport: ud_mlx5
#         Device: mlx5_1:1
#           Type: network
#  System device: mlx5_1 (3)
#
#      capabilities:
#            bandwidth: 23588.47/ppn + 0.00 MB/sec
#              latency: 630 nsec
#             overhead: 80 nsec
#             am_short: <= 180
#             am_bcopy: <= 4088
#             am_zcopy: <= 4088, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 132
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure, ep_check
#
#
# Memory domain: mlx5_0
#     Component: gga
#             register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#           memory invalidation is supported
#         memory types: host (access,reg,cache)
#   < no supported devices found >
#
# Memory domain: mlx5_1
#     Component: gga
#             register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#           memory invalidation is supported
#         memory types: host (access,reg,cache)
#   < no supported devices found >
#
# Connection manager: rdmacm
#      max_conn_priv: 54 bytes
#
# Memory domain: cma
#     Component: cma
#             register: unlimited, cost: 9 nsec
#         memory types: host (access,reg_nonblock,reg,cache)
#
#      Transport: cma
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 11145.00 MB/sec
#              latency: 80 nsec
#             overhead: 2000 nsec
#            put_zcopy: unlimited, up to 16 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: unlimited, up to 16 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 4 bytes
#       error handling: peer failure, ep_check
#
#
# Memory domain: knem
#     Component: knem
#             register: unlimited, cost: 1200 + 0.007 * N nsec
#           remote key: 16 bytes
#         memory types: host (access,reg,cache)
#
#      Transport: knem
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 13862.00 MB/sec
#              latency: 80 nsec
#             overhead: 2000 nsec
#            put_zcopy: unlimited, up to 16 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: unlimited, up to 16 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 0 bytes
#       error handling: none
#
#
# Memory domain: xpmem
#     Component: xpmem
#             register: unlimited, cost: 60 nsec
#           remote key: 24 bytes
#           rkey_ptr is supported
#         memory types: host (access,alloc,reg_nonblock,reg,cache)
#
#      Transport: xpmem
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 15360.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 16 bytes
#       error handling: none
#

Any UCX environment variables used
export UCX_TLS=ib,sm,cuda,cuda_copy,cuda_ipc,gdr_copy

Setup and versions

OS version (e.g Linux distro) + CPU architecture (x86_64/aarch64/ppc64le/...)
- cat /etc/issue or cat /etc/redhat-release + uname -a
  Red Hat Enterprise Linux release 8.9 (Ootpa) + Linux gpu2 4.18.0-513.24.1.el8_9.x86_64 #1 SMP Thu Mar 14 14:20:09 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux
- For Nvidia Bluefield SmartNIC include cat /etc/mlnx-release (the string identifies software and firmware setup)
For RDMA/IB/RoCE related issues:
- Driver version:
  - rpm -q rdma-core or rpm -q libibverbs
  - or: MLNX_OFED version ofed_info -s
    $ ofed_info -s
    MLNX_OFED_LINUX-24.04-0.6.6.0:
- HW information from ibstat or ibv_devinfo -vv command

   - $ ibv_devinfo -vv
hca_id:	mlx5_0
	transport:			InfiniBand (0)
	fw_ver:				20.37.1700
	node_guid:			88e9:a4ff:ff25:a45a
	sys_image_guid:			88e9:a4ff:ff25:a45a
	vendor_id:			0x02c9
	vendor_part_id:			4123
	hw_ver:				0x0
	board_id:			MT_0000000594
	phys_port_cnt:			1
	max_mr_size:			0xffffffffffffffff
	page_size_cap:			0xfffffffffffff000
	max_qp:				131072
	max_qp_wr:			32768
	device_cap_flags:		0x21361c36
					BAD_PKEY_CNTR
					BAD_QKEY_CNTR
					AUTO_PATH_MIG
					CHANGE_PHY_PORT
					PORT_ACTIVE_EVENT
					SYS_IMAGE_GUID
					RC_RNR_NAK_GEN
					MEM_WINDOW
					UD_IP_CSUM
					XRC
					MEM_MGT_EXTENSIONS
					MEM_WINDOW_TYPE_2B
					MANAGED_FLOW_STEERING
	max_sge:			30
	max_sge_rd:			30
	max_cq:				16777216
	max_cqe:			4194303
	max_mr:				16777216
	max_pd:				8388608
	max_qp_rd_atom:			16
	max_ee_rd_atom:			0
	max_res_rd_atom:		2097152
	max_qp_init_rd_atom:		16
	max_ee_init_rd_atom:		0
	atomic_cap:			ATOMIC_HCA (1)
	max_ee:				0
	max_rdd:			0
	max_mw:				16777216
	max_raw_ipv6_qp:		0
	max_raw_ethy_qp:		0
	max_mcast_grp:			2097152
	max_mcast_qp_attach:		240
	max_total_mcast_qp_attach:	503316480
	max_ah:				2147483647
	max_fmr:			0
	max_srq:			8388608
	max_srq_wr:			32767
	max_srq_sge:			31
	max_pkeys:			128
	local_ca_ack_delay:		16
	general_odp_caps:
					ODP_SUPPORT
					ODP_SUPPORT_IMPLICIT
	rc_odp_caps:
					SUPPORT_SEND
					SUPPORT_RECV
					SUPPORT_WRITE
					SUPPORT_READ
					SUPPORT_ATOMIC
					SUPPORT_SRQ
	uc_odp_caps:
					NO SUPPORT
	ud_odp_caps:
					SUPPORT_SEND
	xrc_odp_caps:
					SUPPORT_SEND
					SUPPORT_WRITE
					SUPPORT_READ
					SUPPORT_ATOMIC
					SUPPORT_SRQ
	completion timestamp_mask:			0x7fffffffffffffff
	hca_core_clock:			156250kHZ
	device_cap_flags_ex:		0x3000005021361C36
					PCI_WRITE_END_PADDING
					Unknown flags: 0x3000004000000000
	tso_caps:
		max_tso:			0
	rss_caps:
		max_rwq_indirection_tables:			0
		max_rwq_indirection_table_size:			0
		rx_hash_function:				0x0
		rx_hash_fields_mask:				0x0
	max_wq_type_rq:			0
	packet_pacing_caps:
		qp_rate_limit_min:	0kbps
		qp_rate_limit_max:	0kbps
	max_rndv_hdr_size:		64
	max_num_tags:			127
	max_ops:			32768
	max_sge:			1
	flags:
					IBV_TM_CAP_RC

	cq moderation caps:
		max_cq_count:	65535
		max_cq_period:	4095 us

	maximum available device memory:	131072Bytes

	num_comp_vectors:		63
		port:	1
			state:			PORT_ACTIVE (4)
			max_mtu:		4096 (5)
			active_mtu:		4096 (5)
			sm_lid:			1
			port_lid:		3
			port_lmc:		0x00
			link_layer:		InfiniBand
			max_msg_sz:		0x40000000
			port_cap_flags:		0xa259e848
			port_cap_flags2:	0x0032
			max_vl_num:		4 (3)
			bad_pkey_cntr:		0x0
			qkey_viol_cntr:		0x0
			sm_sl:			0
			pkey_tbl_len:		128
			gid_tbl_len:		8
			subnet_timeout:		18
			init_type_reply:	0
			active_width:		4X (2)
			active_speed:		50.0 Gbps (64)
			phys_state:		LINK_UP (5)
			GID[  0]:		fe80:0000:0000:0000:88e9:a4ff:ff25:a45a

hca_id:	mlx5_1
	transport:			InfiniBand (0)
	fw_ver:				20.37.1700
	node_guid:			88e9:a4ff:ff25:a45b
	sys_image_guid:			88e9:a4ff:ff25:a45a
	vendor_id:			0x02c9
	vendor_part_id:			4123
	hw_ver:				0x0
	board_id:			MT_0000000594
	phys_port_cnt:			1
	max_mr_size:			0xffffffffffffffff
	page_size_cap:			0xfffffffffffff000
	max_qp:				131072
	max_qp_wr:			32768
	device_cap_flags:		0x21361c36
					BAD_PKEY_CNTR
					BAD_QKEY_CNTR
					AUTO_PATH_MIG
					CHANGE_PHY_PORT
					PORT_ACTIVE_EVENT
					SYS_IMAGE_GUID
					RC_RNR_NAK_GEN
					MEM_WINDOW
					UD_IP_CSUM
					XRC
					MEM_MGT_EXTENSIONS
					MEM_WINDOW_TYPE_2B
					MANAGED_FLOW_STEERING
	max_sge:			30
	max_sge_rd:			30
	max_cq:				16777216
	max_cqe:			4194303
	max_mr:				16777216
	max_pd:				8388608
	max_qp_rd_atom:			16
	max_ee_rd_atom:			0
	max_res_rd_atom:		2097152
	max_qp_init_rd_atom:		16
	max_ee_init_rd_atom:		0
	atomic_cap:			ATOMIC_HCA (1)
	max_ee:				0
	max_rdd:			0
	max_mw:				16777216
	max_raw_ipv6_qp:		0
	max_raw_ethy_qp:		0
	max_mcast_grp:			2097152
	max_mcast_qp_attach:		240
	max_total_mcast_qp_attach:	503316480
	max_ah:				2147483647
	max_fmr:			0
	max_srq:			8388608
	max_srq_wr:			32767
	max_srq_sge:			31
	max_pkeys:			128
	local_ca_ack_delay:		16
	general_odp_caps:
					ODP_SUPPORT
					ODP_SUPPORT_IMPLICIT
	rc_odp_caps:
					SUPPORT_SEND
					SUPPORT_RECV
					SUPPORT_WRITE
					SUPPORT_READ
					SUPPORT_ATOMIC
					SUPPORT_SRQ
	uc_odp_caps:
					NO SUPPORT
	ud_odp_caps:
					SUPPORT_SEND
	xrc_odp_caps:
					SUPPORT_SEND
					SUPPORT_WRITE
					SUPPORT_READ
					SUPPORT_ATOMIC
					SUPPORT_SRQ
	completion timestamp_mask:			0x7fffffffffffffff
	hca_core_clock:			156250kHZ
	device_cap_flags_ex:		0x3000005021361C36
					PCI_WRITE_END_PADDING
					Unknown flags: 0x3000004000000000
	tso_caps:
		max_tso:			0
	rss_caps:
		max_rwq_indirection_tables:			0
		max_rwq_indirection_table_size:			0
		rx_hash_function:				0x0
		rx_hash_fields_mask:				0x0
	max_wq_type_rq:			0
	packet_pacing_caps:
		qp_rate_limit_min:	0kbps
		qp_rate_limit_max:	0kbps
	max_rndv_hdr_size:		64
	max_num_tags:			127
	max_ops:			32768
	max_sge:			1
	flags:
					IBV_TM_CAP_RC

	cq moderation caps:
		max_cq_count:	65535
		max_cq_period:	4095 us

	maximum available device memory:	131072Bytes

	num_comp_vectors:		63
		port:	1
			state:			PORT_ACTIVE (4)
			max_mtu:		4096 (5)
			active_mtu:		4096 (5)
			sm_lid:			1
			port_lid:		6
			port_lmc:		0x00
			link_layer:		InfiniBand
			max_msg_sz:		0x40000000
			port_cap_flags:		0xa259e848
			port_cap_flags2:	0x0032
			max_vl_num:		4 (3)
			bad_pkey_cntr:		0x0
			qkey_viol_cntr:		0x0
			sm_sl:			0
			pkey_tbl_len:		128
			gid_tbl_len:		8
			subnet_timeout:		18
			init_type_reply:	0
			active_width:		4X (2)
			active_speed:		50.0 Gbps (64)
			phys_state:		LINK_UP (5)
			GID[  0]:		fe80:0000:0000:0000:88e9:a4ff:ff25:a45b

For GPU related issues:
- GPU type
- Nvidia A100
- Cuda:
  - Drivers version
    Driver Version: 555.42.02
  - Check if peer-direct is loaded: lsmod|grep nv_peer_mem and/or gdrcopy: lsmod|grep gdrdrv
  - nv_peer_mem has to be loaded manually, and it was not loaded when the problem happened.
    $ lsmod|grep gdrdrv
    gdrdrv 24576 0
    nvidia 8691712 365 nvidia_uvm,nvidia_fs,gdrdrv,nvidia_modeset

Additional information (depending on the issue)

OpenMPI version
5.0.3
Output of ucx_info -d to show transports and devices recognized by UCX
Configure result - config.log
Log file - configure UCX with "--enable-logging" - and run with "UCX_LOG_LEVEL=data"

The text was updated successfully, but these errors were encountered:

yosefe · 2024-06-12T08:02:39Z

@RamHPC Probably the UCX coming with MLNX_OFED bundle does not include the compiled-in cuda and gdrcopy support. Currently MLNX_OFED includes ucx/cuda support for most, but not all operating systems.
I'd suggest to keep using UCX from GitHub distribution, that is to uninstall all ucx-* RPMs that came from MLNX_OFED.

RamHPC · 2024-06-12T19:23:09Z

> @RamHPC Probably the UCX coming with MLNX_OFED bundle does not include the compiled-in cuda and gdrcopy support. Currently MLNX_OFED includes ucx/cuda support for most, but not all operating systems. I'd suggest to keep using UCX from GitHub distribution, that is to uninstall all ucx-* RPMs that came from MLNX_OFED.

Thank you! Will give this a try. Judging by the configure flags of the UCX that MLNX_OFED installed, it looks like cuda and gdr_copy support was included, but somehow the transports don't show up. Is there no way to re-calibrate the existing installation?

RamHPC added the Bug label Jun 12, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

UCX installation done with OFED doesn't recognize cuda, cuda_cpy etc. #9950

UCX installation done with OFED doesn't recognize cuda, cuda_cpy etc. #9950

RamHPC commented Jun 12, 2024 •

edited by yosefe

Loading

yosefe commented Jun 12, 2024

RamHPC commented Jun 12, 2024

UCX installation done with OFED doesn't recognize cuda, cuda_cpy etc. #9950

UCX installation done with OFED doesn't recognize cuda, cuda_cpy etc. #9950

Comments

RamHPC commented Jun 12, 2024 • edited by yosefe Loading

Describe the bug

Steps to Reproduce

Setup and versions

Additional information (depending on the issue)

yosefe commented Jun 12, 2024

RamHPC commented Jun 12, 2024

RamHPC commented Jun 12, 2024 •

edited by yosefe

Loading