diff --git a/nbtools/core.py b/nbtools/core.py
index 352161b..0a3af42 100755
--- a/nbtools/core.py
+++ b/nbtools/core.py
@@ -1,6 +1,7 @@
 """ Core utility functions to work with Jupyter Notebooks. """
 #pylint: disable=import-outside-toplevel
 import os
+import sys
 import re
 import json
 import warnings
@@ -120,7 +121,9 @@ def notebook_to_script(path_script, path_notebook=None, ignore_markdown=True, re
 
 
 
-def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_error=False):
+
+def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
+                       raise_error=False, return_memory=False):
     """ Select ``n`` gpus from available and free devices.
 
     Parameters
@@ -130,47 +133,67 @@ def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
         * If ``'max'``, then use maximum number of available devices.
         * If ``int``, then number of devices to select.
 
-    min_free_memory : float
-        Minimum percentage of free memory on a device to consider it free.
+    min_free_memory : int, float
+
+        * If ``int``, minimum amount of free memory (in MB) on a device to consider it free.
+        * If ``float``, minimum fraction of free memory on a device to consider it free.
+
     max_processes : int
         Maximum amount of computed processes on a device to consider it free.
     verbose : bool
         Whether to show individual device information.
     raise_error : bool
         Whether to raise an exception if not enough devices are available.
+    return_memory : bool
+        Whether to return the available and total memory (in MB) of each selected GPU.
 
     Returns
     -------
     available_devices : list
-        Indices of available GPUs.
+        List of available GPU indices or, if ``return_memory`` is True, a dict that maps each index to its ``'available'`` and ``'max'`` memory (in MB).
     """
     try:
-        import nvidia_smi
+        import pynvml
     except ImportError as exception:
         raise ImportError('Install Python interface for nvidia_smi') from exception
 
-    nvidia_smi.nvmlInit()
-    n_devices = nvidia_smi.nvmlDeviceGetCount()
+    try:
+        error_message = None
+        pynvml.nvmlInit()
+    except pynvml.NVMLError_LibraryNotFound:
+        error_message = 'NVIDIA SMI is not available.'
+        if sys.platform == 'win32':
+            error_message += " Copy nvml.dll from 'Windows/System32' to 'Program Files/NVIDIA Corporation/NVSMI'"
+    if error_message:
+        raise RuntimeError(error_message)
+    n_devices = pynvml.nvmlDeviceGetCount()
+
+    available_devices, memory_free, memory_total = [], [], []
 
-    available_devices, memory_usage = [], []
     for i in range(n_devices):
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+
+        num_processes = len(pynvml.nvmlDeviceGetComputeRunningProcesses(handle))
+        free_memory = info.free / 1024**2
+        total_memory = info.total / 1024**2
+
+        memory_threshold = total_memory * min_free_memory if isinstance(min_free_memory, float) else min_free_memory
 
-        fraction_free = info.free / info.total
-        num_processes = len(nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle))
+        consider_available = (
+            (free_memory >= memory_threshold) &
+            (max_processes is None or num_processes <= max_processes)
+        )
 
-        consider_available = (fraction_free > min_free_memory) & (num_processes <= max_processes)
         if consider_available:
             available_devices.append(i)
-            memory_usage.append(fraction_free)
+            memory_free.append(free_memory)
+            memory_total.append(total_memory)
 
         if verbose:
-            print(f'Device {i} | Free memory: {fraction_free:4.2f} | '
+            print(f'Device {i} | Free memory: {free_memory:4.2f} MB | '
                   f'Number of running processes: {num_processes:>2} | Free: {consider_available}')
 
-    nvidia_smi.nvmlShutdown()
-
     if isinstance(n, str) and n.startswith('max'):
         n = len(available_devices)
 
@@ -180,24 +203,37 @@ def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
             raise ValueError(msg)
         warnings.warn(msg, RuntimeWarning)
 
-    # Argsort of `memory_usage` in a descending order
-    indices = sorted(range(len(available_devices)), key=memory_usage.__getitem__, reverse=True)
-    available_devices = [available_devices[i] for i in indices]
-    return sorted(available_devices[:n])
+    sorted_indices = sorted(range(len(memory_free)), key=lambda k: memory_free[k], reverse=True)
+    if return_memory:
+        gpus = {}
+        for ix in sorted_indices[:n]:
+            gpu = available_devices[ix]
+            gpus[gpu] = {'available': memory_free[ix], 'max': memory_total[ix]}
+        return gpus
+
+    sorted_indices = sorted(range(len(memory_free)), key=lambda k: memory_free[k], reverse=True)
+    sorted_devices = [available_devices[i] for i in sorted_indices]
+    return sorted_devices[:n]
 
 
-def get_gpu_free_memory(index):
-    """ Get free memory of a device. """
+def get_gpu_free_memory(index, ratio=True):
+    """ Get free memory of a device (ratio or size in MB). """
     try:
-        import nvidia_smi
+        import pynvml
     except ImportError as exception:
         raise ImportError('Install Python interface for nvidia_smi') from exception
 
-    nvidia_smi.nvmlInit()
-    nvidia_smi.nvmlDeviceGetCount()
-    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
-    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
-    nvidia_smi.nvmlShutdown()
-    return info.free / info.total
+    pynvml.nvmlInit()
+    pynvml.nvmlDeviceGetCount()
+    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
+    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+    pynvml.nvmlShutdown()
+
+    free_memory = info.free / 1024**2
+    total_memory = info.total / 1024**2
+
+    if ratio:
+        return free_memory / total_memory
+    return free_memory
 
 def set_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_error=False):
     """ Set the ``CUDA_VISIBLE_DEVICES`` variable to ``n`` available devices.
@@ -209,8 +245,11 @@ def set_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_err
         * If ``'max'``, then use maximum number of available devices.
         * If ``int``, then number of devices to select.
 
-    min_free_memory : float
-        Minimum percentage of free memory on a device to consider it free.
+    min_free_memory : int, float
+
+        * If ``int``, minimum amount of free memory (in MB) on a device to consider it free.
+        * If ``float``, minimum fraction of free memory on a device to consider it free.
+
     max_processes : int
         Maximum amount of computed processes on a device to consider it free.
     verbose : bool or int
@@ -252,21 +291,21 @@ def free_gpus(devices=None):
     devices : iterable of ints
         Device indices to terminate processes. If ``None``, than free all available gpus.
     """
-    import nvidia_smi
+    import pynvml
     import psutil
 
-    nvidia_smi.nvmlInit()
+    pynvml.nvmlInit()
 
     if devices is None:
         if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
             devices = [int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(',')]
         else:
-            devices = range(0, nvidia_smi.nvmlDeviceGetCount())
+            devices = range(0, pynvml.nvmlDeviceGetCount())
 
     for device_index in devices:
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_index)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
 
-        for proc in nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle):
+        for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
             psutil.Process(proc.pid).terminate()
 
-    nvidia_smi.nvmlShutdown()
+    pynvml.nvmlShutdown()
diff --git a/nbtools/nbstat/resource_inspector.py b/nbtools/nbstat/resource_inspector.py
index 0cfa805..aae434f 100755
--- a/nbtools/nbstat/resource_inspector.py
+++ b/nbtools/nbstat/resource_inspector.py
@@ -10,7 +10,7 @@
 import requests
 from blessed import Terminal
 
-import nvidia_smi
+import pynvml
 
 from .resource import Resource
 from .resource_table import ResourceTable
@@ -32,7 +32,7 @@ class ResourceInspector:
     # TODO: correct working with VSCode Jupyter Notebooks
     # TODO: make sure that everything works without sudo
     # TODO: add more fallbacks for unavailable resources
-    # TODO: can add explicit __delete__ to call nvidia_smi.nvmlShutdown(), if we ever have problems with that
+    # TODO: can add explicit __delete__ to call pynvml.nvmlShutdown(), if we ever have problems with that
 
     def __init__(self, formatter=None):
         self.formatter = formatter
@@ -47,10 +47,10 @@ def __init__(self, formatter=None):
     def device_handles(self):
         """ Cached handles of NVIDIA devices. """
         if self._device_handles is None:
-            nvidia_smi.nvmlInit()
-            n_devices = nvidia_smi.nvmlDeviceGetCount()
+            pynvml.nvmlInit()
+            n_devices = pynvml.nvmlDeviceGetCount()
 
-            self._device_handles = {device_id : nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)
+            self._device_handles = {device_id : pynvml.nvmlDeviceGetHandleByIndex(device_id)
                                     for device_id in range(n_devices)}
         return self._device_handles
 
@@ -82,7 +82,7 @@ def get_device_table(self, formatter=None, window=20):
         device_table, device_process_table = ResourceTable(), ResourceTable()
 
         for device_id, handle in self.device_handles.items():
-            device_name = nvidia_smi.nvmlDeviceGetName(handle)
+            device_name = pynvml.nvmlDeviceGetName(handle)
             device_name = device_name.decode() if isinstance(device_name, bytes) else device_name
             common_info = {Resource.DEVICE_ID : device_id, Resource.DEVICE_NAME : device_name}
 
@@ -90,7 +90,7 @@ def get_device_table(self, formatter=None, window=20):
             # Inseparable device information like memory, temperature, power, etc. Request it only if needed
             if (formatter.get(Resource.DEVICE_UTIL, False) or
                 formatter.get(Resource.DEVICE_UTIL_MA, False)):
-                utilization = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
+                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                 common_info[Resource.DEVICE_UTIL] = utilization.gpu
                 common_info[Resource.DEVICE_MEMORY_UTIL] = utilization.memory
 
@@ -100,29 +100,29 @@ def get_device_table(self, formatter=None, window=20):
                 common_info[Resource.DEVICE_UTIL_MA] = lst.get_average(size=window)
 
             if formatter.get(Resource.DEVICE_TEMP, False):
-                temperature = nvidia_smi.nvmlDeviceGetTemperature(handle, nvidia_smi.NVML_TEMPERATURE_GPU)
+                temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                 common_info[Resource.DEVICE_TEMP] = temperature
 
             if formatter.get(Resource.DEVICE_FAN, False):
-                fan_speed = nvidia_smi.nvmlDeviceGetFanSpeed(handle)
+                fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
                 common_info[Resource.DEVICE_FAN] = fan_speed
 
             if formatter.get(Resource.DEVICE_POWER_USED, False):
-                power_used = nvidia_smi.nvmlDeviceGetPowerUsage(handle)
-                power_total = nvidia_smi.nvmlDeviceGetEnforcedPowerLimit(handle)
+                power_used = pynvml.nvmlDeviceGetPowerUsage(handle)
+                power_total = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
                 common_info[Resource.DEVICE_POWER_USED] = power_used
                 common_info[Resource.DEVICE_POWER_TOTAL] = power_total
 
             if (formatter.get(Resource.DEVICE_MEMORY_USED, False) or
                 formatter.get(Resource.DEVICE_PROCESS_MEMORY_USED, False)):
-                memory = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                 common_info[Resource.DEVICE_MEMORY_USED] = memory.used
                 common_info[Resource.DEVICE_MEMORY_TOTAL] = memory.total
 
             # Collect individual processes info, if needed. Save it to both tables: in one as list, in other separately
             device_info = {**common_info}
-            processes = nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle)
+            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
             device_info.update({Resource.DEVICE_PROCESS_N : 0,
                                 Resource.DEVICE_PROCESS_PID : [],
                                 Resource.DEVICE_PROCESS_MEMORY_USED : []})
diff --git a/setup.py b/setup.py
index 4416644..224f0e7 100644
--- a/setup.py
+++ b/setup.py
@@ -27,14 +27,17 @@
     zip_safe=False,
     platforms='any',
     install_requires=[
-        'nvidia-ml-py3>=7.352',
+        'pynvml>=11.5.0',
         'blessed>=1.17',
         'psutil>=5.6',
+        'requests>=2.24',
     ],
-    extras_require={'nbrun': [
-        'ipython>=7.10.0',
-        'nbconvert>=5.6.1',
-    ]},
+    extras_require={
+        'nbrun': [
+            'ipython>=7.10.0',
+            'nbconvert>=5.6.1',
+        ],
+    },
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
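
A minimal usage sketch of the helpers touched above, assuming the patched package is importable as `nbtools.core`, `pynvml` is installed, and at least one NVIDIA device is visible; the numbers shown are illustrative only:

```python
from nbtools.core import get_available_gpus, get_gpu_free_memory, set_gpus

# Integer `min_free_memory` is interpreted as MB, float as a fraction of total memory.
# With `return_memory=True`, the result maps each index to its 'available' and 'max' memory in MB.
gpus = get_available_gpus(n=2, min_free_memory=4096, return_memory=True)
print(gpus)                                 # e.g. {0: {'available': 10240.0, 'max': 12288.0}, ...}

# Free memory of device 0: a ratio by default, absolute MB with `ratio=False`.
print(get_gpu_free_memory(0))               # e.g. 0.83
print(get_gpu_free_memory(0, ratio=False))  # e.g. 10240.0

# Restrict the current process to one sufficiently free device via CUDA_VISIBLE_DEVICES.
set_gpus(n=1, min_free_memory=0.9, max_processes=2)
```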