From 65454721774306c2d3ee2e81634e3eb51ad7f9a8 Mon Sep 17 00:00:00 2001
From: limengxuan
Date: Mon, 18 Nov 2024 18:26:14 +0800
Subject: [PATCH] Update vGPUmonitor to add dynamic adjustment on core and memory limit (#624)

* update ci and initdevices logic

Signed-off-by: limengxuan <391013634@qq.com>

* prepare to update v2.4.1

Signed-off-by: limengxuan <391013634@qq.com>

* update vGPUmonitor to add dynamic adjustment on core and memory limit

Signed-off-by: limengxuan <391013634@qq.com>

---------

Signed-off-by: limengxuan <391013634@qq.com>
---
 pkg/monitor/nvidia/cudevshr.go |  2 ++
 pkg/monitor/nvidia/v0/spec.go  | 16 ++++++++++++++++
 pkg/monitor/nvidia/v1/spec.go  | 16 ++++++++++++++++
 pkg/scheduler/pods.go          |  2 ++
 4 files changed, 36 insertions(+)

diff --git a/pkg/monitor/nvidia/cudevshr.go b/pkg/monitor/nvidia/cudevshr.go
index 0198c6626..09536ed35 100644
--- a/pkg/monitor/nvidia/cudevshr.go
+++ b/pkg/monitor/nvidia/cudevshr.go
@@ -55,9 +55,11 @@ type UsageInfo interface {
 	DeviceMemoryOffset(idx int) uint64
 	DeviceMemoryTotal(idx int) uint64
 	DeviceSmUtil(idx int) uint64
+	SetDeviceSmLimit(l uint64)
 	IsValidUUID(idx int) bool
 	DeviceUUID(idx int) string
 	DeviceMemoryLimit(idx int) uint64
+	SetDeviceMemoryLimit(l uint64)
 	LastKernelTime() int64
 	//UsedMemory(idx int) (uint64, error)
 	GetPriority() int
diff --git a/pkg/monitor/nvidia/v0/spec.go b/pkg/monitor/nvidia/v0/spec.go
index f29839cf2..9163e7627 100644
--- a/pkg/monitor/nvidia/v0/spec.go
+++ b/pkg/monitor/nvidia/v0/spec.go
@@ -129,6 +129,14 @@ func (s Spec) DeviceSmUtil(idx int) uint64 {
 	return v
 }
 
+func (s Spec) SetDeviceSmLimit(l uint64) {
+	idx := uint64(0)
+	for idx < s.sr.num {
+		s.sr.smLimit[idx] = l
+		idx += 1
+	}
+}
+
 func (s Spec) IsValidUUID(idx int) bool {
 	return s.sr.uuids[idx].uuid[0] != 0
 }
@@ -141,6 +149,14 @@ func (s Spec) DeviceMemoryLimit(idx int) uint64 {
 	return s.sr.limit[idx]
 }
 
+func (s Spec) SetDeviceMemoryLimit(l uint64) {
+	idx := uint64(0)
+	for idx < s.sr.num {
+		s.sr.limit[idx] = l
+		idx += 1
+	}
+}
+
 func (s Spec) LastKernelTime() int64 {
 	return 0
 }
diff --git a/pkg/monitor/nvidia/v1/spec.go b/pkg/monitor/nvidia/v1/spec.go
index 079507d5e..5bfcd9363 100644
--- a/pkg/monitor/nvidia/v1/spec.go
+++ b/pkg/monitor/nvidia/v1/spec.go
@@ -136,6 +136,14 @@ func (s Spec) DeviceSmUtil(idx int) uint64 {
 	return v
 }
 
+func (s Spec) SetDeviceSmLimit(l uint64) {
+	idx := uint64(0)
+	for idx < s.sr.num {
+		s.sr.smLimit[idx] = l
+		idx += 1
+	}
+}
+
 func (s Spec) IsValidUUID(idx int) bool {
 	return s.sr.uuids[idx].uuid[0] != 0
 }
@@ -148,6 +156,14 @@ func (s Spec) DeviceMemoryLimit(idx int) uint64 {
 	return s.sr.limit[idx]
 }
 
+func (s Spec) SetDeviceMemoryLimit(l uint64) {
+	idx := uint64(0)
+	for idx < s.sr.num {
+		s.sr.limit[idx] = l
+		idx += 1
+	}
+}
+
 func (s Spec) LastKernelTime() int64 {
 	return s.sr.lastKernelTime
 }
diff --git a/pkg/scheduler/pods.go b/pkg/scheduler/pods.go
index 1bd026f0d..d4fb2d399 100644
--- a/pkg/scheduler/pods.go
+++ b/pkg/scheduler/pods.go
@@ -61,6 +61,8 @@ func (m *podManager) addPod(pod *corev1.Pod, nodeID string, devices util.PodDevi
 		pi := &podInfo{Name: pod.Name, UID: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices}
 		m.pods[pod.UID] = pi
 		klog.Infof("Pod added: Name: %s, UID: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID)
+	} else {
+		m.pods[pod.UID].Devices = devices
 	}
 }
 