cuda: fix check for GPU device availability

The check for `/dev/nvidiactl` to determine whether the CUDA plugin can be used is unreliable, because in some setups the default path for the driver installation is different [1]. This patch changes the logic to check whether a GPU device is available in `/proc/driver/nvidia/gpus/` instead. This approach is similar to `torch.cuda.is_available()` and is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option then confirms whether the driver supports checkpoint/restore.

[1] https://github.com/NVIDIA/gpu-operator

Fixes: #2509

Signed-off-by: Radostin Stoyanov <[email protected]>
checkpoint-restore · Nov 12, 2024 · 26dcc21 · 26dcc21
1 parent 31b38d6
commit 26dcc21
Showing 1 changed file with 16 additions and 2 deletions.
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c
@@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid)
 }
 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
 
+/**
+ * Return true if /proc/driver/nvidia/gpus/ exists and is a directory; its entries are not enumerated.
+ */
+static bool is_cuda_device_available(void)
+{
+	const char *gpu_path = "/proc/driver/nvidia/gpus/";
+	struct stat sb;
+
+	if (stat(gpu_path, &sb) != 0)
+		return false;
+
+	return S_ISDIR(sb.st_mode);
+}
+
 int cuda_plugin_init(int stage)
 {
 	int ret;
@@ -481,8 +495,8 @@ int cuda_plugin_init(int stage)
 		}
 	}
 
-	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
-		pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
+	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) {
+		pr_info("No GPU device found; CUDA plugin is disabled\n");
 		plugin_disabled = true;
 		return 0;
 	}