Update get_available_gpus and set_gpus #17

Merged 10 commits on Jul 5, 2024
115 changes: 77 additions & 38 deletions nbtools/core.py
@@ -1,6 +1,7 @@
""" Core utility functions to work with Jupyter Notebooks. """
#pylint: disable=import-outside-toplevel
import os
import sys
import re
import json
import warnings
@@ -120,7 +121,9 @@ def notebook_to_script(path_script, path_notebook=None, ignore_markdown=True, re



def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_error=False):

def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
raise_error=False, return_memory=False):
""" Select ``n`` gpus from available and free devices.

Parameters
@@ -130,47 +133,67 @@ def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
* If ``'max'``, then use maximum number of available devices.
* If ``int``, then number of devices to select.

min_free_memory : float
Minimum percentage of free memory on a device to consider it free.
min_free_memory : int, float

* If ``int``, minimum amount of free memory (in MB) on a device to consider it free.
* If ``float``, minimum fraction of free memory on a device to consider it free.

max_processes : int
Maximum number of compute processes on a device to consider it free.
verbose : bool
Whether to show individual device information.
raise_error : bool
Whether to raise an exception if not enough devices are available.
return_memory : bool
Whether to return memory available on each GPU.

Returns
-------
available_devices : list
Indices of available GPUs.
List of available GPU indices, or dict mapping each index to its ``'available'`` and ``'max'`` memory (in MB).
"""
try:
import nvidia_smi
import pynvml
except ImportError as exception:
raise ImportError('Install Python interface for nvidia_smi') from exception

nvidia_smi.nvmlInit()
n_devices = nvidia_smi.nvmlDeviceGetCount()
try:
error_message = None
pynvml.nvmlInit()
except pynvml.NVMLError_LibraryNotFound:
if sys.platform == 'win32':
error_message = " Copy nvml.dll from 'Windows/System32' to 'Program Files/NVIDIA Corporation/NVSMI'"
finally:
if error_message:
raise RuntimeError('NVIDIA SMI is not available.' + error_message)
n_devices = pynvml.nvmlDeviceGetCount()

available_devices, memory_free, memory_total = [], [], []

available_devices, memory_usage = [], []
for i in range(n_devices):
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)

num_processes = len(pynvml.nvmlDeviceGetComputeRunningProcesses(handle))
free_memory = info.free / 1024**2
total_memory = info.total / 1024**2

memory_threshold = total_memory * min_free_memory if isinstance(min_free_memory, float) else min_free_memory

fraction_free = info.free / info.total
num_processes = len(nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle))
consider_available = (
(free_memory >= memory_threshold) &
(max_processes is None or num_processes <= max_processes)
)

consider_available = (fraction_free > min_free_memory) & (num_processes <= max_processes)
if consider_available:
available_devices.append(i)
memory_usage.append(fraction_free)
memory_free.append(free_memory)
memory_total.append(total_memory)

if verbose:
print(f'Device {i} | Free memory: {fraction_free:4.2f} | '
print(f'Device {i} | Free memory: {free_memory:4.2f} MB | '
f'Number of running processes: {num_processes:>2} | Free: {consider_available}')

nvidia_smi.nvmlShutdown()

if isinstance(n, str) and n.startswith('max'):
n = len(available_devices)

@@ -180,24 +203,37 @@ def get_available_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False,
raise ValueError(msg)
warnings.warn(msg, RuntimeWarning)

# Argsort of `memory_usage` in a descending order
indices = sorted(range(len(available_devices)), key=memory_usage.__getitem__, reverse=True)
available_devices = [available_devices[i] for i in indices]
return sorted(available_devices[:n])
sorted_indices = sorted(range(len(memory_free)), key=lambda k: memory_free[k], reverse=True)
if return_memory:
gpus = {}
for ix in sorted_indices[:n]:
gpu = available_devices[ix]
gpus[gpu] = {'available': memory_free[ix], 'max': memory_total[ix]}
return gpus

sorted_indices = sorted(range(len(memory_free)), key=lambda k: memory_free[k], reverse=True)
sorted_devices = [available_devices[i] for i in sorted_indices]
return sorted_devices[:n]

def get_gpu_free_memory(index):
""" Get free memory of a device. """
def get_gpu_free_memory(index, ratio=True):
""" Get free memory of a device (ratio or size in MB). """
try:
import nvidia_smi
import pynvml
except ImportError as exception:
raise ImportError('Install Python interface for nvidia_smi') from exception

nvidia_smi.nvmlInit()
nvidia_smi.nvmlDeviceGetCount()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
nvidia_smi.nvmlShutdown()
return info.free / info.total
pynvml.nvmlInit()
pynvml.nvmlDeviceGetCount()
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
pynvml.nvmlShutdown()

free_memory = info.free / 1024**2
total_memory = info.total / 1024**2

if ratio:
return free_memory / total_memory
return free_memory

def set_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_error=False):
""" Set the ``CUDA_VISIBLE_DEVICES`` variable to ``n`` available devices.
@@ -209,8 +245,11 @@ def set_gpus(n=1, min_free_memory=0.9, max_processes=2, verbose=False, raise_err
* If ``'max'``, then use maximum number of available devices.
* If ``int``, then number of devices to select.

min_free_memory : float
Minimum percentage of free memory on a device to consider it free.
min_free_memory : int, float

* If ``int``, minimum amount of free memory (in MB) on a device to consider it free.
* If ``float``, minimum fraction of free memory.

max_processes : int
Maximum number of compute processes on a device to consider it free.
verbose : bool or int
@@ -252,21 +291,21 @@ def free_gpus(devices=None):
devices : iterable of ints
Device indices to terminate processes on. If ``None``, then free all available GPUs.
"""
import nvidia_smi
import pynvml
import psutil

nvidia_smi.nvmlInit()
pynvml.nvmlInit()

if devices is None:
if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
devices = [int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(',')]
else:
devices = range(0, nvidia_smi.nvmlDeviceGetCount())
devices = range(0, pynvml.nvmlDeviceGetCount())

for device_index in devices:
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_index)
handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

for proc in nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle):
for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
psutil.Process(proc.pid).terminate()

nvidia_smi.nvmlShutdown()
pynvml.nvmlShutdown()
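
The snippet below is a usage sketch of the updated core.py helpers, not part of the diff itself; it assumes the functions are importable from the nbtools package (otherwise use nbtools.core), and the thresholds and device counts are illustrative only.

```python
# Usage sketch for the updated helpers above (illustrative values, hypothetical setup).
from nbtools import get_available_gpus, get_gpu_free_memory, set_gpus, free_gpus

# An int threshold is interpreted as MB of free memory; return_memory=True yields a dict.
gpus = get_available_gpus(n=2, min_free_memory=8192, max_processes=2,
                          verbose=True, return_memory=True)
print(gpus)  # e.g. {0: {'available': 10240.0, 'max': 11264.0}, ...}

# Free memory of device 0: as a fraction by default, or in MB with ratio=False.
print(get_gpu_free_memory(0))
print(get_gpu_free_memory(0, ratio=False))

# A float threshold is interpreted as a fraction of total memory.
set_gpus(n=1, min_free_memory=0.9)   # sets CUDA_VISIBLE_DEVICES
free_gpus()                          # terminates compute processes on the selected devices
```

Note that free_gpus terminates the compute processes it finds on those devices via psutil, so it should only be run on machines where that is acceptable.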
26 changes: 13 additions & 13 deletions nbtools/nbstat/resource_inspector.py
@@ -10,7 +10,7 @@
import requests
from blessed import Terminal

import nvidia_smi
import pynvml

from .resource import Resource
from .resource_table import ResourceTable
@@ -32,7 +32,7 @@ class ResourceInspector:
# TODO: correct working with VSCode Jupyter Notebooks
# TODO: make sure that everything works without sudo
# TODO: add more fallbacks for unavailable resources
# TODO: can add explicit __delete__ to call nvidia_smi.nvmlShutdown(), if we ever have problems with that
# TODO: can add explicit __delete__ to call pynvml.nvmlShutdown(), if we ever have problems with that
def __init__(self, formatter=None):
self.formatter = formatter

@@ -47,10 +47,10 @@ def __init__(self, formatter=None):
def device_handles(self):
""" Cached handles of NVIDIA devices. """
if self._device_handles is None:
nvidia_smi.nvmlInit()
n_devices = nvidia_smi.nvmlDeviceGetCount()
pynvml.nvmlInit()
n_devices = pynvml.nvmlDeviceGetCount()

self._device_handles = {device_id : nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)
self._device_handles = {device_id : pynvml.nvmlDeviceGetHandleByIndex(device_id)
for device_id in range(n_devices)}
return self._device_handles

@@ -82,15 +82,15 @@ def get_device_table(self, formatter=None, window=20):
device_table, device_process_table = ResourceTable(), ResourceTable()

for device_id, handle in self.device_handles.items():
device_name = nvidia_smi.nvmlDeviceGetName(handle)
device_name = pynvml.nvmlDeviceGetName(handle)
device_name = device_name.decode() if isinstance(device_name, bytes) else device_name
common_info = {Resource.DEVICE_ID : device_id,
Resource.DEVICE_NAME : device_name}

# Inseparable device information like memory, temperature, power, etc. Request it only if needed
if (formatter.get(Resource.DEVICE_UTIL, False) or
formatter.get(Resource.DEVICE_UTIL_MA, False)):
utilization = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
common_info[Resource.DEVICE_UTIL] = utilization.gpu
common_info[Resource.DEVICE_MEMORY_UTIL] = utilization.memory

@@ -100,29 +100,29 @@ def get_device_table(self, formatter=None, window=20):
common_info[Resource.DEVICE_UTIL_MA] = lst.get_average(size=window)

if formatter.get(Resource.DEVICE_TEMP, False):
temperature = nvidia_smi.nvmlDeviceGetTemperature(handle, nvidia_smi.NVML_TEMPERATURE_GPU)
temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
common_info[Resource.DEVICE_TEMP] = temperature

if formatter.get(Resource.DEVICE_FAN, False):
fan_speed = nvidia_smi.nvmlDeviceGetFanSpeed(handle)
fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
common_info[Resource.DEVICE_FAN] = fan_speed

if formatter.get(Resource.DEVICE_POWER_USED, False):
power_used = nvidia_smi.nvmlDeviceGetPowerUsage(handle)
power_total = nvidia_smi.nvmlDeviceGetEnforcedPowerLimit(handle)
power_used = pynvml.nvmlDeviceGetPowerUsage(handle)
power_total = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)

common_info[Resource.DEVICE_POWER_USED] = power_used
common_info[Resource.DEVICE_POWER_TOTAL] = power_total

if (formatter.get(Resource.DEVICE_MEMORY_USED, False) or
formatter.get(Resource.DEVICE_PROCESS_MEMORY_USED, False)):
memory = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
common_info[Resource.DEVICE_MEMORY_USED] = memory.used
common_info[Resource.DEVICE_MEMORY_TOTAL] = memory.total

# Collect individual processes info, if needed. Save it to both tables: in one as list, in other separately
device_info = {**common_info}
processes = nvidia_smi.nvmlDeviceGetComputeRunningProcesses(handle)
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
device_info.update({Resource.DEVICE_PROCESS_N : 0,
Resource.DEVICE_PROCESS_PID : [],
Resource.DEVICE_PROCESS_MEMORY_USED : []})
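
The inspector changes above are a mechanical rename from the deprecated nvidia_smi bindings to pynvml. For reference, a minimal standalone sketch of the same query pattern (assuming pynvml and an NVIDIA driver are available) looks like this; it is not part of the PR:

```python
# Standalone sketch of the pynvml calls the inspector now uses.
import pynvml

pynvml.nvmlInit()
try:
    for device_id in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)

        name = pynvml.nvmlDeviceGetName(handle)
        name = name.decode() if isinstance(name, bytes) else name

        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        n_processes = len(pynvml.nvmlDeviceGetComputeRunningProcesses(handle))

        print(f'{device_id}: {name} | util {utilization.gpu}% | '
              f'{memory.used / 1024**2:.0f}/{memory.total / 1024**2:.0f} MB | '
              f'{n_processes} compute processes')
finally:
    pynvml.nvmlShutdown()
```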
13 changes: 8 additions & 5 deletions setup.py
@@ -27,14 +27,17 @@
zip_safe=False,
platforms='any',
install_requires=[
'nvidia-ml-py3>=7.352',
'pynvml>=11.5.0',
'blessed>=1.17',
'psutil>=5.6',
'requests>=2.24',
],
extras_require={'nbrun': [
'ipython>=7.10.0',
'nbconvert>=5.6.1',
]},
extras_require={
'nbrun': [
'ipython>=7.10.0',
'nbconvert>=5.6.1',
],
},
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
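
With the dependency switched from nvidia-ml-py3 to pynvml>=11.5.0, a quick post-install sanity check might look like the following (hypothetical snippet, not part of the PR):

```python
# Hypothetical post-install check; assumes the package was reinstalled with the new requirements.
from importlib.metadata import version

import pynvml

print(version('pynvml'))                     # expect 11.5.0 or newer, per the pin above
pynvml.nvmlInit()                            # raises NVMLError_LibraryNotFound if NVML is missing
print(pynvml.nvmlSystemGetDriverVersion())   # driver version string (bytes on older bindings)
pynvml.nvmlShutdown()
```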