Update get_available_gpus and set_gpus #17

Merged 10 commits on Jul 5, 2024
115 changes: 77 additions & 38 deletions nbtools/core.py
@@ -1,6 +1,7 @@
""" Core utility functions to work with Jupyter Notebooks. """
#pylint: disable=import-outside-toplevel
import os
import sys
import re
import json
import warnings
@@ -120,7 +121,9 @@ def notebook_to_script(path_script, path_notebook=None, ignore_markdown=True, re



def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_error=False):

def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
raise_error=False, return_memory=False):
""" Select ``n`` gpus from available and free devices.

Parameters
@@ -130,47 +133,67 @@ def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
* If ``'max'``, then use maximum number of available devices.
* If ``int``, then number of devices to select.

min_free_memory : float
Minimum percentage of free memory on a device to consider it free.
min_free_memory : int, float

* If ``int``, minimum amount of free memory (in MB) on a device to consider it free.
* If ``float``, minimum fraction of free memory on a device to consider it free.

max_processes : int
Maximum number of compute processes on a device to consider it free.
verbose : bool
Whether to show individual device information.
raise_error : bool
Whether to raise an exception if not enough devices are available.
return_memory : bool
Whether to return memory available on each GPU.

Returns
-------
available_devices : list
Indices of available GPUs.
List of available GPU indices, or dict mapping each index to its ``'available'`` and ``'max'`` memory (in MB).
"""
try:
import nvidia_smi
import pynvml
except ImportError as exception:
raise ImportError('Install Python interface for nvidia_smi') from exception

nvidia_smi.nvmlInit()
n_devices = nvidia_smi.nvmlDeviceGetCount()
try:
error_message = None
pynvml.nvmlInit()
except pynvml.NVMLError_LibraryNotFound:
if sys.platform == 'win32':
error_message = " Copy nvml.dll from 'Windows/System32' to 'Program Files/NVIDIA Corporation/NVSMI'"
finally:
if error_message:
raise RuntimeError('NVIDIA SMI is not available.' + error_message)
n_devices = pynvml.nvmlDeviceGetCount()

available_devices, memory_free, memory_total = [], [], []

available_devices, memory_usage = [], []
for i in range(n_devices):
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)

num_processes = len(pynvml.nvmlDeviceGetComputeRunningProcesses(handle))
free_memory = info.free / 1024**2
total_memory = info.total / 1024**2

memory_threshold = total_memory * min_free_memory if isinstance(min_free_memory, float) else min_free_memory

fraction_free = info.free / info.total
num_processes = len(nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle))
consider_available = (
(free_memory >= memory_threshold) &
(max_processes is None or num_processes <= max_processes)
)

consider_available = (fraction_free > min_free_memory) & (num_processes <= max_processes)
if consider_available:
available_devices.append(i)
memory_usage.append(fraction_free)
memory_free.append(free_memory)
memory_total.append(total_memory)

if verbose:
print(f'Device {i} | Free memory: {fraction_free:4.2f} | '
print(f'Device {i} | Free memory: {free_memory:4.2f} MB | '
f'Number of running processes: {num_processes:>2} | Free: {consider_available}')

nvidia_smi.nvmlShutdown()

if isinstance(n, str) and n.startswith('max'):
n = len(available_devices)

@@ -180,24 +203,37 @@ def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
raise ValueError(msg)
warnings.warn(msg, RuntimeWarning)

# Argsort of `memory_usage` in a descending order
indices = sorted(range(len(available_devices)), key=memory_usage.__getitem__, reverse=True)
available_devices = [available_devices[i] for i in indices]
return sorted(available_devices[:n])
sorted_indices = sorted(range(len(memory_free)), key=lambda k: memory_free[k], reverse=True)
if return_memory:
gpus = {}
for ix in sorted_indices[:n]:
gpu = available_devices[ix]
gpus[gpu] = {'available': memory_free[ix], 'max': memory_total[ix]}
return gpus

sorted_indices = sorted(range(len(memory_free)), key=lambda k: memory_free[k], reverse=True)
sorted_devices = [available_devices[i] for i in sorted_indices]
return sorted_devices[:n]

def get_gpu_free_memory(index):
""" Get free memory of a device. """
def get_gpu_free_memory(index, ratio=True):
""" Get free memory of a device (ratio or size in MB). """
try:
import nvidia_smi
import pynvml
except ImportError as exception:
raise ImportError('Install Python interface for nvidia_smi') from exception

nvidia_smi.nvmlInit()
nvidia_smi.nvmlDeviceGetCount()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
nvidia_smi.nvmlShutdown()
return info.free / info.total
pynvml.nvmlInit()
pynvml.nvmlDeviceGetCount()
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
pynvml.nvmlShutdown()

free_memory = info.free / 1024**2
total_memory = info.total / 1024**2

if ratio:
return free_memory / total_memory
return free_memory

def set_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_error=False):
""" Set the ``CUDA_VISIBLE_DEVICES`` variable to ``n`` available devices.
@@ -209,8 +245,11 @@ def set_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_err
* If ``'max'``, then use maximum number of available devices.
* If ``int``, then number of devices to select.

min_free_memory : float
Minimum percentage of free memory on a device to consider it free.
min_free_memory : int, float

* If ``int``, minimum amount of free memory (in MB) on a device to consider it free.
* If ``float``, minimum fraction of free memory.

max_processes : int
Maximum number of compute processes on a device to consider it free.
verbose : bool or int
@@ -252,21 +291,21 @@ def free_gpus(devices=None):
devices : iterable of ints
Device indices to terminate processes on. If ``None``, then free all available GPUs.
"""
import nvidia_smi
import pynvml
import psutil

nvidia_smi.nvmlInit()
pynvml.nvmlInit()

if devices is None:
if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
devices = [int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(',')]
else:
devices = range(0, nvidia_smi.nvmlDeviceGetCount())
devices = range(0, pynvml.nvmlDeviceGetCount())

for device_index in devices:
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_index)
handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

for proc in nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle):
for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
psutil.Process(proc.pid).terminate()

nvidia_smi.nvmlShutdown()
pynvml.nvmlShutdown()
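
The snippet below is a usage sketch of the updated core.py helpers, not part of the diff itself; it assumes the functions are importable from the nbtools package (otherwise use nbtools.core), and the thresholds and device counts are illustrative only.

```python
# Usage sketch for the updated helpers above (illustrative values, hypothetical setup).
from nbtools import get_available_gpus, get_gpu_free_memory, set_gpus, free_gpus

# An int threshold is interpreted as MB of free memory; return_memory=True yields a dict.
gpus = get_available_gpus(n=2, min_free_memory=8192, max_processes=2,
                          verbose=True, return_memory=True)
print(gpus)  # e.g. {0: {'available': 10240.0, 'max': 11264.0}, ...}

# Free memory of device 0: as a fraction by default, or in MB with ratio=False.
print(get_gpu_free_memory(0))
print(get_gpu_free_memory(0, ratio=False))

# A float threshold is interpreted as a fraction of total memory.
set_gpus(n=1, min_free_memory=0.9)   # sets CUDA_VISIBLE_DEVICES
free_gpus()                          # terminates compute processes on the selected devices
```

Note that free_gpus terminates the compute processes it finds on those devices via psutil, so it should only be run on machines where that is acceptable.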
26 changes: 13 additions & 13 deletions nbtools/nbstat/resource_inspector.py
@@ -10,7 +10,7 @@
import requests
from blessed import Terminal

import nvidia_smi
import pynvml

from .resource import Resource
from .resource_table import ResourceTable
@@ -32,7 +32,7 @@ class ResourceInspector:
# TODO: correct working with VSCode Jupyter Notebooks
# TODO: make sure that everything works without sudo
# TODO: add more fallbacks for unavailable resources
# TODO: can add explicit __delete__ to call nvidia_smi.nvmlShutdown(), if we ever have problems with that
# TODO: can add explicit __delete__ to call pynvml.nvmlShutdown(), if we ever have problems with that
def __init__(self, formatter=None):
self.formatter = formatter

@@ -47,10 +47,10 @@ def __init__(self, formatter=None):
def device_handles(self):
""" Cached handles of NVIDIA devices. """
if self._device_handles is None:
nvidia_smi.nvmlInit()
n_devices = nvidia_smi.nvmlDeviceGetCount()
pynvml.nvmlInit()
n_devices = pynvml.nvmlDeviceGetCount()

self._device_handles = {device_id : nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)
self._device_handles = {device_id : pynvml.nvmlDeviceGetHandleByIndex(device_id)
for device_id in range(n_devices)}
return self._device_handles

@@ -82,15 +82,15 @@ def get_device_table(self, formatter=None, window=20):
device_table, device_process_table = ResourceTable(), ResourceTable()

for device_id, handle in self.device_handles.items():
device_name = nvidia_smi.nvmlDeviceGetName(handle)
device_name = pynvml.nvmlDeviceGetName(handle)
device_name = device_name.decode() if isinstance(device_name, bytes) else device_name
common_info = {Resource.DEVICE_ID : device_id,
Resource.DEVICE_NAME : device_name}

# Inseparable device information like memory, temperature, power, etc. Request it only if needed
if (formatter.get(Resource.DEVICE_UTIL, False) or
formatter.get(Resource.DEVICE_UTIL_MA, False)):
utilization = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
common_info[Resource.DEVICE_UTIL] = utilization.gpu
common_info[Resource.DEVICE_MEMORY_UTIL] = utilization.memory

@@ -100,29 +100,29 @@ def get_device_table(self, formatter=None, window=20):
common_info[Resource.DEVICE_UTIL_MA] = lst.get_average(size=window)

if formatter.get(Resource.DEVICE_TEMP, False):
temperature = nvidia_smi.nvmlDeviceGetTemperature(handle, nvidia_smi.NVML_TEMPERATURE_GPU)
temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
common_info[Resource.DEVICE_TEMP] = temperature

if formatter.get(Resource.DEVICE_FAN, False):
fan_speed = nvidia_smi.nvmlDeviceGetFanSpeed(handle)
fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
common_info[Resource.DEVICE_FAN] = fan_speed

if formatter.get(Resource.DEVICE_POWER_USED, False):
power_used = nvidia_smi.nvmlDeviceGetPowerUsage(handle)
power_total = nvidia_smi.nvmlDeviceGetEnforcedPowerLimit(handle)
power_used = pynvml.nvmlDeviceGetPowerUsage(handle)
power_total = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)

common_info[Resource.DEVICE_POWER_USED] = power_used
common_info[Resource.DEVICE_POWER_TOTAL] = power_total

if (formatter.get(Resource.DEVICE_MEMORY_USED, False) or
formatter.get(Resource.DEVICE_PROCESS_MEMORY_USED, False)):
memory = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
common_info[Resource.DEVICE_MEMORY_USED] = memory.used
common_info[Resource.DEVICE_MEMORY_TOTAL] = memory.total

# Collect individual processes info, if needed. Save it to both tables: in one as list, in other separately
device_info = {**common_info}
processes = nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle)
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
device_info.update({Resource.DEVICE_PROCESS_N : 0,
Resource.DEVICE_PROCESS_PID : [],
Resource.DEVICE_PROCESS_MEMORY_USED : []})
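
The inspector changes above are a mechanical rename from the deprecated nvidia_smi bindings to pynvml. For reference, a minimal standalone sketch of the same query pattern (assuming pynvml and an NVIDIA driver are available) looks like this; it is not part of the PR:

```python
# Standalone sketch of the pynvml calls the inspector now uses.
import pynvml

pynvml.nvmlInit()
try:
    for device_id in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)

        name = pynvml.nvmlDeviceGetName(handle)
        name = name.decode() if isinstance(name, bytes) else name

        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        n_processes = len(pynvml.nvmlDeviceGetComputeRunningProcesses(handle))

        print(f'{device_id}: {name} | util {utilization.gpu}% | '
              f'{memory.used / 1024**2:.0f}/{memory.total / 1024**2:.0f} MB | '
              f'{n_processes} compute processes')
finally:
    pynvml.nvmlShutdown()
```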
13 changes: 8 additions & 5 deletions setup.py
@@ -27,14 +27,17 @@
zip_safe=False,
platforms='any',
install_requires=[
'nvidia-ml-py3>=7.352',
'pynvml>=11.5.0',
'blessed>=1.17',
'psutil>=5.6',
'requests>=2.24',
],
extras_require={'nbrun': [
'ipython>=7.10.0',
'nbconvert>=5.6.1',
]},
extras_require={
'nbrun': [
'ipython>=7.10.0',
'nbconvert>=5.6.1',
],
},
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
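
With the dependency switched from nvidia-ml-py3 to pynvml>=11.5.0, a quick post-install sanity check might look like the following (hypothetical snippet, not part of the PR):

```python
# Hypothetical post-install check; assumes the package was reinstalled with the new requirements.
from importlib.metadata import version

import pynvml

print(version('pynvml'))                     # expect 11.5.0 or newer, per the pin above
pynvml.nvmlInit()                            # raises NVMLError_LibraryNotFound if NVML is missing
print(pynvml.nvmlSystemGetDriverVersion())   # driver version string (bytes on older bindings)
pynvml.nvmlShutdown()
```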