Optimized implementation to use IOKit directly. #897

Merged
merged 9 commits on Dec 31, 2024
254 changes: 204 additions & 50 deletions scalene/scalene_apple_gpu.py
@@ -1,73 +1,227 @@
import platform
import ctypes
import time
from typing import Tuple

# ---------------------------------------------------------------------------
# 1. Define the needed IOKit / CoreFoundation constants and function signatures
# ---------------------------------------------------------------------------
iokit = ctypes.cdll.LoadLibrary("/System/Library/Frameworks/IOKit.framework/IOKit")
corefoundation = ctypes.cdll.LoadLibrary("/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation")

CFTypeRef = ctypes.c_void_p
CFAllocatorRef = ctypes.c_void_p
IOOptionBits = ctypes.c_uint32
io_iterator_t = ctypes.c_void_p
io_registry_entry_t = ctypes.c_void_p
mach_port_t = ctypes.c_void_p


try:
    # On Intel Macs, kIOMasterPortDefault might be defined; on Apple Silicon, it may just be 0.
    kIOMasterPortDefault = ctypes.c_void_p.in_dll(iokit, 'kIOMasterPortDefault')
except ValueError:
    kIOMasterPortDefault = mach_port_t(0)

IOServiceMatching = iokit.IOServiceMatching
IOServiceMatching.argtypes = [ctypes.c_char_p]
IOServiceMatching.restype = CFTypeRef

IOServiceGetMatchingServices = iokit.IOServiceGetMatchingServices
IOServiceGetMatchingServices.argtypes = [
mach_port_t,
CFTypeRef,
ctypes.POINTER(io_iterator_t),
]
IOServiceGetMatchingServices.restype = ctypes.c_int # kern_return_t

IOIteratorNext = iokit.IOIteratorNext
IOIteratorNext.argtypes = [io_iterator_t]
IOIteratorNext.restype = io_registry_entry_t

IOObjectRelease = iokit.IOObjectRelease
IOObjectRelease.argtypes = [io_registry_entry_t]
IOObjectRelease.restype = ctypes.c_int # kern_return_t

IORegistryEntryCreateCFProperties = iokit.IORegistryEntryCreateCFProperties
IORegistryEntryCreateCFProperties.argtypes = [
io_registry_entry_t,
ctypes.POINTER(CFTypeRef),
CFAllocatorRef,
IOOptionBits,
]
IORegistryEntryCreateCFProperties.restype = CFTypeRef

CFGetTypeID = corefoundation.CFGetTypeID
CFGetTypeID.argtypes = [CFTypeRef]
CFGetTypeID.restype = ctypes.c_long

CFDictionaryGetTypeID = corefoundation.CFDictionaryGetTypeID
CFDictionaryGetTypeID.argtypes = []
CFDictionaryGetTypeID.restype = ctypes.c_long

CFStringCreateWithCString = corefoundation.CFStringCreateWithCString
CFStringCreateWithCString.argtypes = [CFAllocatorRef, ctypes.c_char_p, ctypes.c_uint32]
CFStringCreateWithCString.restype = CFTypeRef

CFDictionaryGetValue = corefoundation.CFDictionaryGetValue
CFDictionaryGetValue.argtypes = [CFTypeRef, CFTypeRef]
CFDictionaryGetValue.restype = CFTypeRef

CFNumberGetTypeID = corefoundation.CFNumberGetTypeID
CFNumberGetTypeID.argtypes = []
CFNumberGetTypeID.restype = ctypes.c_long

CFNumberGetValue = corefoundation.CFNumberGetValue
CFNumberGetValue.argtypes = [CFTypeRef, ctypes.c_int, ctypes.c_void_p]
CFNumberGetValue.restype = ctypes.c_bool

CFNumberGetType = corefoundation.CFNumberGetType
CFNumberGetType.argtypes = [CFTypeRef]
CFNumberGetType.restype = ctypes.c_int

CFShow = corefoundation.CFShow
CFShow.argtypes = [CFTypeRef]

kCFNumberSInt64Type = 4 # 64-bit integers

def cfstr(py_str: str) -> CFTypeRef:
"""Helper to create a CFString from a Python string."""
return CFStringCreateWithCString(None, py_str.encode('utf-8'), 0)

def _read_apple_gpu_stats_and_cores() -> Tuple[float, float, int]:
"""
Reads from IOService class "IOAccelerator" and returns:
(device_util, in_use_mem, gpu_core_count)
where:
- device_util is a fraction [0..1].
- in_use_mem is in megabytes.
- gpu_core_count is an integer from top-level "gpu-core-count".
"""
matching_dict = IOServiceMatching(b"IOAccelerator")
if not matching_dict:
# debug_print("[DEBUG] Could not create matching dictionary.")
return (0.0, 0.0, 0)

service_iterator = io_iterator_t()
kr = IOServiceGetMatchingServices(kIOMasterPortDefault, matching_dict, ctypes.byref(service_iterator))
if kr != 0:
# debug_print(f"[DEBUG] IOServiceGetMatchingServices returned kr={kr}. Possibly no services found.")
return (0.0, 0.0, 0)

device_util = 0.0
in_use_mem = 0.0
gpu_core_count = 0

while True:
service_object = IOIteratorNext(service_iterator)
if not service_object:
# No more services
break

props_ref = CFTypeRef()
IORegistryEntryCreateCFProperties(service_object, ctypes.byref(props_ref), None, 0)

# The top-level dictionary:
if props_ref and CFGetTypeID(props_ref) == CFDictionaryGetTypeID():
# 1. Grab "gpu-core-count" at the top level
top_key_cores = cfstr("gpu-core-count")
core_val_ref = CFDictionaryGetValue(props_ref, top_key_cores)
if core_val_ref and (CFGetTypeID(core_val_ref) == CFNumberGetTypeID()):
val_container_64 = ctypes.c_longlong(0)
success = CFNumberGetValue(core_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64))
if success:
gpu_core_count = val_container_64.value
IOObjectRelease(top_key_cores)

# 2. Check for sub-dictionary "PerformanceStatistics"
performance_key = cfstr("PerformanceStatistics")
performance_dict_ref = CFDictionaryGetValue(props_ref, performance_key)
IOObjectRelease(performance_key)

if performance_dict_ref and (CFGetTypeID(performance_dict_ref) == CFDictionaryGetTypeID()):
cf_key_util = cfstr("Device Utilization %")
cf_key_mem = cfstr("In use system memory")

# Device Utilization
util_val_ref = CFDictionaryGetValue(performance_dict_ref, cf_key_util)
if util_val_ref and (CFGetTypeID(util_val_ref) == CFNumberGetTypeID()):
val_container_64 = ctypes.c_longlong(0)
success = CFNumberGetValue(util_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64))
if success:
device_util = val_container_64.value / 100.0

# In use system memory
mem_val_ref = CFDictionaryGetValue(performance_dict_ref, cf_key_mem)
if mem_val_ref and (CFGetTypeID(mem_val_ref) == CFNumberGetTypeID()):
val_container_64 = ctypes.c_longlong(0)
success = CFNumberGetValue(mem_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64))
if success:
in_use_mem = float(val_container_64.value) / 1048576.0

IOObjectRelease(cf_key_util)
IOObjectRelease(cf_key_mem)

IOObjectRelease(props_ref)

IOObjectRelease(service_object)

if (device_util > 0.0 or in_use_mem > 0.0) and gpu_core_count > 0:
# Success, break
break

IOObjectRelease(service_iterator)
return (device_util, in_use_mem, gpu_core_count)


class ScaleneAppleGPU:
"""Wrapper class for Apple integrated GPU statistics, using direct IOKit calls."""

    def __init__(self, sampling_frequency: int = 100) -> None:
        assert platform.system() == "Darwin"
        # Only actually get stats some fraction of the time, since it is costly.
        self.gpu_sampling_frequency = sampling_frequency
        self.core_count = self._get_num_cores()

def gpu_device(self) -> str:
return "GPU"

def has_gpu(self) -> bool:
"""True iff there is a GPU"""
# Disabling Apple GPU, since it does not collect per-process statistics.
return False
"""Return True if the system likely has an Apple integrated GPU."""
return True

def reinit(self) -> None:
"""A NOP, here for compatibility with the nvidia wrapper."""
return
"""No-op for compatibility with other GPU wrappers."""
pass

def get_num_cores(self) -> int:
        return self.core_count

def get_stats(self) -> Tuple[float, float]:
"""Returns a tuple of (utilization%, memory in use)"""
"""Returns a tuple of (utilization%, memory in use in megabytes)."""
if not self.has_gpu():
return (0.0, 0.0)
try:
# Only periodically query the statistics for real (at a
# rate of 1/self.gpu_sampling_frequency). We do this to
# amortize its cost, as it is shockingly expensive.
if random.randint(0, self.gpu_sampling_frequency - 1) != 0:
return (0.0, 0.0)
in_use = 0.0
util = 0.0
read_process = subprocess.Popen(
self.cmd, shell=True, stdout=subprocess.PIPE
)
if read_process.stdout is not None:
read_process_return = read_process.stdout.readlines()
for line in read_process_return:
decoded_line = line.decode("utf-8")
# print(decoded_line)
if "In use system memory" in decoded_line:
in_use_re = self.regex_inuse.search(decoded_line)
if in_use_re:
in_use = float(in_use_re.group(1))
if "Device Utilization %" in decoded_line:
util_re = self.regex_util.search(decoded_line)
if util_re:
util = int(util_re.group(1)) / 1000
if util and in_use:
break
return (util, in_use)
except Exception:
pass
return (0.0, 0.0)
util, in_use, _ = _read_apple_gpu_stats_and_cores()
return (util, in_use)
except Exception as ex:
return (0.0, 0.0)

def _get_num_cores(self) -> int:
"""
Retrieves the 'gpu-core-count' property from the top-level dictionary.
Returns 0 if not found.
"""
# We reuse the same function that gathers utilization & memory
_, _, core_count = _read_apple_gpu_stats_and_cores()
return core_count

if __name__ == "__main__":
gpu = ScaleneAppleGPU()
while True:
util, mem = gpu.get_stats()
cores = gpu.get_num_cores()
print(
f"GPU Utilization: {util*100:.1f}%, "
f"In-Use GPU Memory: {mem} megabytes, "
f"GPU Core Count: {cores}"
)
time.sleep(2)
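
For context, the implementation this PR replaces shelled out to ioreg and scraped the same two fields with regular expressions. The sketch below reproduces that older approach as a quick way to cross-check the numbers returned by the IOKit path above; it assumes ioreg is on the PATH, and the unit conversions (utilization as a fraction, memory in megabytes) are added here to match the IOKit path rather than taken verbatim from the old code.

import re
import subprocess

# Query the IOAccelerator registry entries the way the previous implementation did.
out = subprocess.run(
    ["ioreg", "-r", "-d", "1", "-w", "0", "-c", "IOAccelerator"],
    capture_output=True,
    text=True,
).stdout

# The same two fields the regexes in the old code looked for.
util_match = re.search(r'"Device Utilization %"=(\d+)', out)
mem_match = re.search(r'"In use system memory"=(\d+)', out)

# Convert to the units used by the IOKit path: utilization as a fraction, memory in megabytes.
util = int(util_match.group(1)) / 100.0 if util_match else 0.0
in_use_mb = int(mem_match.group(1)) / 1048576.0 if mem_match else 0.0
print(f"ioreg cross-check: utilization={util:.2f}, in-use memory={in_use_mb:.1f} MB")
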
3 changes: 2 additions & 1 deletion test/torchtest.py
@@ -4,8 +4,9 @@
def torchtest():
dtype = torch.float
#device = torch.device("cpu")
device = torch.device("cuda:0") # Uncomment this to run on GPU
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# device = torch.device("cuda") # Uncomment this to run on GPU
device = torch.device("mps")

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
39 changes: 0 additions & 39 deletions tests/test_coverup_18.py

This file was deleted.

30 changes: 0 additions & 30 deletions tests/test_coverup_27.py

This file was deleted.
