From 4f1303bac3569e84f7738af44a18916498abba61 Mon Sep 17 00:00:00 2001
From: Sai Sunku
Date: Thu, 26 Dec 2024 21:25:58 +0000
Subject: [PATCH] fabtests: Bugfixes for neuron

This commit fixes the following bugs in the neuron fabtests:

1. Neuron accelerator detection is broken on some OSes because the full
   path of the `neuron-ls` executable was not used.

2. Before this commit, each pytest worker was assigned a single neuron
   core. This works for multi-node tests but fails for single-node tests
   because a neuron core can only be opened by a single process. This
   commit assigns two different neuron cores to each pytest worker for
   client-server tests: one for the server and one for the client. Trn1
   has 2 cores per neuron device and Trn2 has 8 cores per neuron device,
   so this assignment works for both.

3. When running in serial mode, the env var PYTEST_XDIST_WORKER is not
   set, so the NEURON_RT_VISIBLE_CORES env var is also not set. This
   causes the server to occupy all neuron cores and the client fails.
   So this commit assigns device 0 to both the server and the client
   when running with one worker.

Signed-off-by: Sai Sunku
(cherry picked from commit f893f5f88eb5cf0f4cf4e0154edc91e0b6f8b7bf)
---
 fabtests/pytest/common.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/fabtests/pytest/common.py b/fabtests/pytest/common.py
index a6f50fcc9f4..ef3ad8b22da 100644
--- a/fabtests/pytest/common.py
+++ b/fabtests/pytest/common.py
@@ -68,7 +68,7 @@ def num_cuda_devices(ip):
 @functools.lru_cache(10)
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def num_neuron_devices(ip):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")
 
@@ -84,7 +84,7 @@ def num_neuron_devices(ip):
 @functools.lru_cache(10)
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def num_neuron_cores_on_device(ip, device_id):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")
 
@@ -97,7 +97,7 @@ def num_neuron_cores_on_device(ip, device_id):
 
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def is_neuron_device_available(ip, device_id):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")
 
@@ -455,19 +455,26 @@ def prepare_base_command(self, command_type, executable,
         if "PYTEST_XDIST_WORKER" in os.environ:
             worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", ""))
             hmem_device_id = worker_id % num_hmem
-            if host_memory_type == "cuda":
-                command += " -i {}".format(hmem_device_id)
-            else:
-                assert host_memory_type == "neuron"
-                num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
+        else:
+            hmem_device_id = 0
+
+        if host_memory_type == "cuda":
+            command += " -i {}".format(hmem_device_id)
+        else:
+            assert host_memory_type == "neuron"
+            num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
+            if command_type == "server":
                 additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
                     hmem_device_id * num_cores)
-                wait_until_neuron_device_available(host_ip, hmem_device_id)
+            else:
+                additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
+                    hmem_device_id * num_cores + 1)
+            wait_until_neuron_device_available(host_ip, hmem_device_id)
 
-            if self._cmdline_args.provider == "efa":
-                import efa.efa_common
-                efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
-                command += " -d {}-rdm".format(efa_device)
+        if self._cmdline_args.provider == "efa":
+            import efa.efa_common
+            efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
+            command += " -d {}-rdm".format(efa_device)
 
         return command, additional_environment
 
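For reference, the core-assignment scheme described in items 2 and 3 above reduces
to the arithmetic sketched below. This is a standalone illustration, not part of
the patch: neuron_cores_for, num_devices, and num_cores_per_device are hypothetical
names standing in for the values the fabtests helpers obtain via neuron-ls.

import os

def neuron_cores_for(command_type, num_devices, num_cores_per_device):
    # Parallel runs: pytest-xdist sets PYTEST_XDIST_WORKER to "gw0", "gw1", ...
    # Serial runs: the variable is absent, so fall back to device 0.
    if "PYTEST_XDIST_WORKER" in os.environ:
        worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", ""))
        device_id = worker_id % num_devices
    else:
        device_id = 0

    # Each worker gets two cores on its device: the first for the server,
    # the second for the client, so the two processes never share a core.
    first_core = device_id * num_cores_per_device
    return first_core if command_type == "server" else first_core + 1

# Trn1 has 2 cores per device: worker gw1 on a 2-device host pins its
# server to core 2 and its client to core 3.
os.environ["PYTEST_XDIST_WORKER"] = "gw1"
assert neuron_cores_for("server", num_devices=2, num_cores_per_device=2) == 2
assert neuron_cores_for("client", num_devices=2, num_cores_per_device=2) == 3

Because NEURON_RT_VISIBLE_CORES exposes a single core index per process, the
server and client of one worker never collide, and a serial run behaves like
worker 0 on device 0.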