From 4f1303bac3569e84f7738af44a18916498abba61 Mon Sep 17 00:00:00 2001
From: Sai Sunku
Date: Thu, 26 Dec 2024 21:25:58 +0000
Subject: [PATCH] fabtests: Bugfixes for neuron

This commit fixes the following bugs in the neuron fabtests:

1. Neuron accelerator detection is broken on some OSes because the full
   path of the `neuron-ls` executable was not used.

2. Before this commit, each pytest worker was assigned a single neuron
   core. This works for multi-node tests but fails for single-node tests
   because a neuron core can only be opened by a single process. This
   commit assigns two different neuron cores to each pytest worker for
   client-server tests: one for the server and one for the client. Trn1
   has 2 cores per neuron device and Trn2 has 8 cores per neuron device,
   so this assignment works for both.

3. When running in serial mode, the env var PYTEST_XDIST_WORKER is not
   set, so the NEURON_RT_VISIBLE_CORES env var is also not set. This
   causes the server to occupy all neuron cores and the client fails.
   So this commit assigns device 0 to both the server and the client
   when running with one worker.

Signed-off-by: Sai Sunku
(cherry picked from commit f893f5f88eb5cf0f4cf4e0154edc91e0b6f8b7bf)
---
 fabtests/pytest/common.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/fabtests/pytest/common.py b/fabtests/pytest/common.py
index a6f50fcc9f4..ef3ad8b22da 100644
--- a/fabtests/pytest/common.py
+++ b/fabtests/pytest/common.py
@@ -68,7 +68,7 @@ def num_cuda_devices(ip):
 @functools.lru_cache(10)
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def num_neuron_devices(ip):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")
 
@@ -84,7 +84,7 @@ def num_neuron_devices(ip):
 @functools.lru_cache(10)
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def num_neuron_cores_on_device(ip, device_id):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")
 
@@ -97,7 +97,7 @@ def num_neuron_cores_on_device(ip, device_id):
 
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def is_neuron_device_available(ip, device_id):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")
 
@@ -455,19 +455,26 @@ def prepare_base_command(self, command_type, executable,
         if "PYTEST_XDIST_WORKER" in os.environ:
             worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", ""))
             hmem_device_id = worker_id % num_hmem
-            if host_memory_type == "cuda":
-                command += " -i {}".format(hmem_device_id)
-            else:
-                assert host_memory_type == "neuron"
-                num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
+        else:
+            hmem_device_id = 0
+
+        if host_memory_type == "cuda":
+            command += " -i {}".format(hmem_device_id)
+        else:
+            assert host_memory_type == "neuron"
+            num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
+            if command_type == "server":
                 additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
                     hmem_device_id * num_cores)
-                wait_until_neuron_device_available(host_ip, hmem_device_id)
+            else:
+                additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
+                    hmem_device_id * num_cores + 1)
+            wait_until_neuron_device_available(host_ip, hmem_device_id)
 
-            if self._cmdline_args.provider == "efa":
-                import efa.efa_common
-                efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
-                command += " -d {}-rdm".format(efa_device)
+        if self._cmdline_args.provider == "efa":
+            import efa.efa_common
+            efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
+            command += " -d {}-rdm".format(efa_device)
 
         return command, additional_environment
 
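For reference, the core-assignment scheme described in items 2 and 3 above reduces
to the arithmetic sketched below. This is a standalone illustration, not part of
the patch: neuron_cores_for, num_devices, and num_cores_per_device are hypothetical
names standing in for the values the fabtests helpers obtain via neuron-ls.

import os

def neuron_cores_for(command_type, num_devices, num_cores_per_device):
    # Parallel runs: pytest-xdist sets PYTEST_XDIST_WORKER to "gw0", "gw1", ...
    # Serial runs: the variable is absent, so fall back to device 0.
    if "PYTEST_XDIST_WORKER" in os.environ:
        worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", ""))
        device_id = worker_id % num_devices
    else:
        device_id = 0

    # Each worker gets two cores on its device: the first for the server,
    # the second for the client, so the two processes never share a core.
    first_core = device_id * num_cores_per_device
    return first_core if command_type == "server" else first_core + 1

# Trn1 has 2 cores per device: worker gw1 on a 2-device host pins its
# server to core 2 and its client to core 3.
os.environ["PYTEST_XDIST_WORKER"] = "gw1"
assert neuron_cores_for("server", num_devices=2, num_cores_per_device=2) == 2
assert neuron_cores_for("client", num_devices=2, num_cores_per_device=2) == 3

Because NEURON_RT_VISIBLE_CORES exposes a single core index per process, the
server and client of one worker never collide, and a serial run behaves like
worker 0 on device 0.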