From 9bc1a5366edd1ec199ff5923702ebc611ea2a617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 26 Dec 2024 13:48:47 +0100 Subject: [PATCH 1/4] Add support for GPU feature --- api/datadoghq/v2alpha1/const.go | 7 + api/datadoghq/v2alpha1/datadogagent_types.go | 16 ++ .../v2alpha1/zz_generated.deepcopy.go | 30 ++++ .../v2alpha1/zz_generated.openapi.go | 8 +- .../bases/v1/datadoghq.com_datadogagents.yaml | 30 ++++ .../datadoghq.com_datadogagents_v2alpha1.json | 30 ++++ docs/configuration.v2alpha1.md | 2 + examples/datadogagent/datadog-agent-all.yaml | 2 + .../controller/datadogagent/controller.go | 1 + .../defaults/datadogagent_default.go | 8 + .../defaults/datadogagent_default_test.go | 46 ++++++ .../datadogagent/feature/gpu/envvar.go | 9 + .../datadogagent/feature/gpu/feature.go | 154 ++++++++++++++++++ .../datadogagent/feature/gpu/feature_test.go | 148 +++++++++++++++++ .../controller/datadogagent/feature/ids.go | 2 + .../datadogagent/feature/test/factory_test.go | 19 ++- .../datadogagent_controller_test.go | 5 + internal/controller/testutils/agent.go | 13 ++ pkg/testutils/builder.go | 15 ++ 19 files changed, 543 insertions(+), 2 deletions(-) create mode 100644 internal/controller/datadogagent/feature/gpu/envvar.go create mode 100644 internal/controller/datadogagent/feature/gpu/feature.go create mode 100644 internal/controller/datadogagent/feature/gpu/feature_test.go diff --git a/api/datadoghq/v2alpha1/const.go b/api/datadoghq/v2alpha1/const.go index ef52fb674..c6662fa2a 100644 --- a/api/datadoghq/v2alpha1/const.go +++ b/api/datadoghq/v2alpha1/const.go @@ -78,6 +78,9 @@ const ( KubeServicesAndEndpointsListeners = "kube_services kube_endpoints" EndpointsChecksConfigProvider = "endpointschecks" ClusterAndEndpointsConfigProviders = "clusterchecks endpointschecks" + + // DefaultGPUMonitoringRuntimeClass default runtime class for GPU pods + DefaultGPUMonitoringRuntimeClass = "nvidia" ) // Labels @@ -201,6 +204,10 @@ const ( 
FIPSProxyCustomConfigFileName = "datadog-fips-proxy.cfg" FIPSProxyCustomConfigMapName = "%s-fips-config" FIPSProxyCustomConfigMountPath = "/etc/datadog-fips-proxy/datadog-fips-proxy.cfg" + + NVIDIADevicesMountPath = "/var/run/nvidia-container-devices/all" + NVIDIADevicesVolumeName = "nvidia-devices" + DevNullPath = "/dev/null" // host path that is mounted at NVIDIADevicesMountPath in the container, it's just used as a "signal" to the nvidia runtime to use the nvidia devices ) // Field paths diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 857767b44..1756ba844 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -82,6 +82,8 @@ type DatadogFeatures struct { SBOM *SBOMFeatureConfig `json:"sbom,omitempty"` // ServiceDiscovery ServiceDiscovery *ServiceDiscoveryFeatureConfig `json:"serviceDiscovery,omitempty"` + // GPU monitoring + GPUMonitoring *GPUMonitoringFeatureConfig `json:"gpu,omitempty"` // Cluster-level features @@ -498,6 +500,20 @@ type ServiceDiscoveryFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` } +// GPUMonitoringFeatureConfig contains the GPU monitoring configuration. +type GPUMonitoringFeatureConfig struct { + // Enabled enables GPU monitoring. + // Default: false + // +optional + Enabled *bool `json:"enabled,omitempty"` + + // PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + // If left empty, the runtime class will not be set. + // Default: nvidia + // +optional + PodRuntimeClassName *string `json:"requiredRuntimeClassName"` +} + // DogstatsdFeatureConfig contains the Dogstatsd configuration parameters. 
// +k8s:openapi-gen=true type DogstatsdFeatureConfig struct { diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index a36ac9778..30bdf8e78 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1232,6 +1232,11 @@ func (in *DatadogFeatures) DeepCopyInto(out *DatadogFeatures) { *out = new(ServiceDiscoveryFeatureConfig) (*in).DeepCopyInto(*out) } + if in.GPUMonitoring != nil { + in, out := &in.GPUMonitoring, &out.GPUMonitoring + *out = new(GPUMonitoringFeatureConfig) + (*in).DeepCopyInto(*out) + } if in.EventCollection != nil { in, out := &in.EventCollection, &out.EventCollection *out = new(EventCollectionFeatureConfig) @@ -1545,6 +1550,31 @@ func (in *FIPSConfig) DeepCopy() *FIPSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUMonitoringFeatureConfig) DeepCopyInto(out *GPUMonitoringFeatureConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.PodRuntimeClassName != nil { + in, out := &in.PodRuntimeClassName, &out.PodRuntimeClassName + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUMonitoringFeatureConfig. +func (in *GPUMonitoringFeatureConfig) DeepCopy() *GPUMonitoringFeatureConfig { + if in == nil { + return nil + } + out := new(GPUMonitoringFeatureConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *GlobalConfig) DeepCopyInto(out *GlobalConfig) { *out = *in diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index 9dfdc495f..ae577cbf1 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -675,6 +675,12 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig"), }, }, + "gpu": { + SchemaProps: spec.SchemaProps{ + Description: "GPU monitoring", + Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUMonitoringFeatureConfig"), + }, + }, "eventCollection": { SchemaProps: spec.SchemaProps{ Description: "EventCollection configuration.", @@ -733,7 +739,7 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R }, }, Dependencies: []string{ - "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUMonitoringFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, } } diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 679f7ce12..f862243d3 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -1019,6 +1019,21 @@ spec: Default: false type: boolean type: object + gpu: + description: GPU monitoring + properties: + enabled: + description: |- + Enabled enables GPU monitoring. + Default: false + type: boolean + requiredRuntimeClassName: + description: |- + PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + If left empty, the runtime class will not be set. + Default: nvidia + type: string + type: object helmCheck: description: HelmCheck configuration. properties: @@ -7883,6 +7898,21 @@ spec: Default: false type: boolean type: object + gpu: + description: GPU monitoring + properties: + enabled: + description: |- + Enabled enables GPU monitoring. + Default: false + type: boolean + requiredRuntimeClassName: + description: |- + PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + If left empty, the runtime class will not be set. + Default: nvidia + type: string + type: object helmCheck: description: HelmCheck configuration. 
properties: diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 02d401ef4..62a36b6d3 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -1065,6 +1065,21 @@ }, "type": "object" }, + "gpu": { + "additionalProperties": false, + "description": "GPU monitoring", + "properties": { + "enabled": { + "description": "Enabled enables GPU monitoring.\nDefault: false", + "type": "boolean" + }, + "requiredRuntimeClassName": { + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "type": "string" + } + }, + "type": "object" + }, "helmCheck": { "additionalProperties": false, "description": "HelmCheck configuration.", @@ -7871,6 +7886,21 @@ }, "type": "object" }, + "gpu": { + "additionalProperties": false, + "description": "GPU monitoring", + "properties": { + "enabled": { + "description": "Enabled enables GPU monitoring.\nDefault: false", + "type": "boolean" + }, + "requiredRuntimeClassName": { + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "type": "string" + } + }, + "type": "object" + }, "helmCheck": { "additionalProperties": false, "description": "HelmCheck configuration.", diff --git a/docs/configuration.v2alpha1.md b/docs/configuration.v2alpha1.md index 1e37d145c..40a37b5b0 100644 --- a/docs/configuration.v2alpha1.md +++ b/docs/configuration.v2alpha1.md @@ -111,6 +111,8 @@ spec: | features.externalMetricsServer.registerAPIService | RegisterAPIService registers the External Metrics endpoint as an APIService Default: true | | features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics 
CRD (allowing one to scale on arbitrary Datadog metric queries). Default: true | | features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false | +| features.gpu.enabled | Enables GPU monitoring. Default: false | +| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class will not be set. Default: nvidia | | features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false | | features.helmCheck.enabled | Enables the Helm check. Default: false | | features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). 
Default: {} | diff --git a/examples/datadogagent/datadog-agent-all.yaml b/examples/datadogagent/datadog-agent-all.yaml index ea0cff3c0..dd786ed54 100644 --- a/examples/datadogagent/datadog-agent-all.yaml +++ b/examples/datadogagent/datadog-agent-all.yaml @@ -47,6 +47,8 @@ spec: enabled: true serviceDiscovery: enabled: true + gpu: + enabled: true eventCollection: collectKubernetesEvents: true orchestratorExplorer: diff --git a/internal/controller/datadogagent/controller.go b/internal/controller/datadogagent/controller.go index 7c29bdd31..6eb7f2e65 100644 --- a/internal/controller/datadogagent/controller.go +++ b/internal/controller/datadogagent/controller.go @@ -34,6 +34,7 @@ import ( _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/eventcollection" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/externalmetrics" + _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/helmcheck" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/kubernetesstatecore" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer" diff --git a/internal/controller/datadogagent/defaults/datadogagent_default.go b/internal/controller/datadogagent/defaults/datadogagent_default.go index 1ebe56970..7b43579ba 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default.go @@ -37,6 +37,8 @@ const ( defaultEBPFCheckEnabled bool = false + defaultGPUMonitoringEnabled bool = false + defaultServiceDiscoveryEnabled bool = false defaultAPMEnabled bool = true @@ -265,6 +267,12 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) { } 
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled) + // GPU monitoring feature + if ddaSpec.Features.GPUMonitoring == nil { + ddaSpec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{} + } + apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPUMonitoring.Enabled, defaultGPUMonitoringEnabled) + // APM Feature // APM is enabled by default if ddaSpec.Features.APM == nil { diff --git a/internal/controller/datadogagent/defaults/datadogagent_default_test.go b/internal/controller/datadogagent/defaults/datadogagent_default_test.go index 98d61fd6b..52c9cb446 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default_test.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default_test.go @@ -198,6 +198,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -333,6 +336,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueFalse), }, @@ -423,6 +429,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: 
apiutils.NewBoolPointer(valueFalse), }, @@ -549,6 +558,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -696,6 +708,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -838,6 +853,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueTrue), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -980,6 +998,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1131,6 +1152,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + 
GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1273,6 +1297,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1418,6 +1445,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1602,6 +1632,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, CSPM: &v2alpha1.CSPMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultCSPMEnabled), }, @@ -1717,6 +1750,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ 
-1860,6 +1896,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1979,6 +2018,7 @@ func Test_defaultFeatures(t *testing.T) { OOMKill: &v2alpha1.OOMKillFeatureConfig{}, TCPQueueLength: &v2alpha1.TCPQueueLengthFeatureConfig{}, EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{}, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{}, ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{}, APM: &v2alpha1.APMFeatureConfig{}, ASM: &v2alpha1.ASMFeatureConfig{}, @@ -2024,6 +2064,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -2169,6 +2212,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ diff --git a/internal/controller/datadogagent/feature/gpu/envvar.go b/internal/controller/datadogagent/feature/gpu/envvar.go new file mode 100644 index 000000000..5c8a0b96f --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/envvar.go @@ -0,0 +1,9 @@ +// 
Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package gpu + +const DDEnableGPUMonitoringEnvVar = "DD_GPU_MONITORING_ENABLED" +const NVIDIAVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES" diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go new file mode 100644 index 000000000..1d4f8f9ff --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -0,0 +1,154 @@ +package gpu + +import ( + corev1 "k8s.io/api/core/v1" + + apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" + apiutils "github.com/DataDog/datadog-operator/api/utils" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume" +) + +func init() { + if err := feature.Register(feature.GPUMonitoringType, buildFeature); err != nil { + panic(err) + } +} + +func buildFeature(*feature.Options) feature.Feature { + return &gpuMonitoringFeature{} +} + +type gpuMonitoringFeature struct { + podRuntimeClassName string +} + +// ID returns the ID of the Feature +func (f *gpuMonitoringFeature) ID() feature.IDType { + return feature.GPUMonitoringType +} + +// Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. 
+func (f *gpuMonitoringFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.RequiredComponents) { + if dda.Spec.Features == nil || dda.Spec.Features.GPUMonitoring == nil || !apiutils.BoolValue(dda.Spec.Features.GPUMonitoring.Enabled) { + return reqComp + } + + reqComp.Agent = feature.RequiredComponent{ + IsRequired: apiutils.NewBoolPointer(true), + Containers: []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, + } + + if dda.Spec.Features.GPUMonitoring.PodRuntimeClassName == nil { + f.podRuntimeClassName = v2alpha1.DefaultGPUMonitoringRuntimeClass + } else { + f.podRuntimeClassName = *dda.Spec.Features.GPUMonitoring.PodRuntimeClassName + } + + return reqComp +} + +// ManageDependencies allows a feature to manage its dependencies. +// Feature's dependencies should be added in the store. +func (f *gpuMonitoringFeature) ManageDependencies(feature.ResourceManagers, feature.RequiredComponents) error { + return nil +} + +// ManageClusterAgent allows a feature to configure the ClusterAgent's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuMonitoringFeature) ManageClusterAgent(feature.PodTemplateManagers) error { + return nil +} + +func configureSystemProbe(managers feature.PodTemplateManagers) { + // annotations + managers.Annotation().AddAnnotation(v2alpha1.SystemProbeAppArmorAnnotationKey, v2alpha1.SystemProbeAppArmorAnnotationValue) + + // security context capabilities + managers.SecurityContext().AddCapabilitiesToContainer(agent.DefaultCapabilitiesForSystemProbe(), apicommon.SystemProbeContainerName) + + // socket volume mount (needs write perms for the system probe container but not the others) + procdirVol, procdirMount := volume.GetVolumes(v2alpha1.ProcdirVolumeName, v2alpha1.ProcdirHostPath, v2alpha1.ProcdirMountPath, true) + managers.VolumeMount().AddVolumeMountToContainer(&procdirMount, apicommon.SystemProbeContainerName) + managers.Volume().AddVolume(&procdirVol) + + socketVol, socketVolMount := volume.GetVolumesEmptyDir(v2alpha1.SystemProbeSocketVolumeName, v2alpha1.SystemProbeSocketVolumePath, false) + managers.Volume().AddVolume(&socketVol) + managers.VolumeMount().AddVolumeMountToContainer(&socketVolMount, apicommon.SystemProbeContainerName) + + _, socketVolMountReadOnly := volume.GetVolumesEmptyDir(v2alpha1.SystemProbeSocketVolumeName, v2alpha1.SystemProbeSocketVolumePath, true) + managers.VolumeMount().AddVolumeMountToContainer(&socketVolMountReadOnly, apicommon.CoreAgentContainerName) + + socketEnvVar := &corev1.EnvVar{ + Name: v2alpha1.DDSystemProbeSocket, + Value: v2alpha1.DefaultSystemProbeSocketPath, + } + + managers.EnvVar().AddEnvVarToContainer(apicommon.CoreAgentContainerName, socketEnvVar) + managers.EnvVar().AddEnvVarToContainer(apicommon.SystemProbeContainerName, socketEnvVar) +} + +// ManageNodeAgent allows a feature to configure the Node Agent's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManagers, _ string) error { + configureSystemProbe(managers) + + // env var to enable the GPU module + enableEnvVar := &corev1.EnvVar{ + Name: DDEnableGPUMonitoringEnvVar, + Value: "true", + } + + // Both in the core agent and the system probe + managers.EnvVar().AddEnvVarToContainers([]apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, enableEnvVar) + + // The agent check does not need to be manually enabled, the init config container will + // check if GPU monitoring is enabled and will enable the check automatically (see + // Dockerfiles/agent/cont-init.d/60-sysprobe-check.sh in the datadog-agent repo). + managers.EnvVar().AddEnvVarToInitContainer(apicommon.InitConfigContainerName, enableEnvVar) + + // Now we need to add the NVIDIA_VISIBLE_DEVICES env var to both agents again so + // that the nvidia runtime can expose the GPU devices in the container + nvidiaVisibleDevicesEnvVar := &corev1.EnvVar{ + Name: NVIDIAVisibleDevicesEnvVar, + Value: "all", + } + + managers.EnvVar().AddEnvVarToContainers([]apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, nvidiaVisibleDevicesEnvVar) + + // Some nvidia-container-runtime setups ignore the NVIDIA_VISIBLE_DEVICES + // env variable. This is usually configured with the options + // accept-nvidia-visible-devices-envvar-when-unprivileged = false + // accept-nvidia-visible-devices-as-volume-mounts = true + // in the NVIDIA container runtime config. In this case, we need to mount the + // /var/run/nvidia-container-devices/all directory into the container, so that + // the nvidia-container-runtime can see that we want to use all GPUs. 
+ devicesVol, devicesMount := volume.GetVolumes(v2alpha1.NVIDIADevicesVolumeName, v2alpha1.DevNullPath, v2alpha1.NVIDIADevicesMountPath, true) + managers.Volume().AddVolume(&devicesVol) + managers.VolumeMount().AddVolumeMountToContainers(&devicesMount, []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}) + + // Configure the runtime class for the container + if f.podRuntimeClassName != "" { + managers.PodTemplateSpec().Spec.RuntimeClassName = &f.podRuntimeClassName + } + + // Note: we don't need to mount the NVML library, as it's mounted automatically + // by the nvidia-container-runtime. + + return nil +} + +// ManageSingleContainerNodeAgent allows a feature to configure the Agent container for the Node Agent's corev1.PodTemplateSpec +// if SingleContainerStrategy is enabled and can be used with the configured feature set. +// It should do nothing if the feature doesn't need to configure it. +func (f *gpuMonitoringFeature) ManageSingleContainerNodeAgent(feature.PodTemplateManagers, string) error { + return nil +} + +// ManageClusterChecksRunner allows a feature to configure the ClusterChecksRunner's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuMonitoringFeature) ManageClusterChecksRunner(feature.PodTemplateManagers) error { + return nil +} diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go new file mode 100644 index 000000000..82a291dbb --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -0,0 +1,148 @@ +package gpu + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + + apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" + apiutils "github.com/DataDog/datadog-operator/api/utils" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/fake" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test" +) + +func Test_GPUMonitoringFeature_Configure(t *testing.T) { + ddaGPUMonitoringDisabled := v2alpha1.DatadogAgent{ + Spec: v2alpha1.DatadogAgentSpec{ + Features: &v2alpha1.DatadogFeatures{ + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(false), + }, + }, + }, + } + ddaGPUMonitoringEnabled := ddaGPUMonitoringDisabled.DeepCopy() + ddaGPUMonitoringEnabled.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(true) + + GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + mgr := mgrInterface.(*fake.PodTemplateManagers) + + // check security context capabilities + sysProbeCapabilities := mgr.SecurityContextMgr.CapabilitiesByC[apicommon.SystemProbeContainerName] + assert.True( + t, + apiutils.IsEqualStruct(sysProbeCapabilities, agent.DefaultCapabilitiesForSystemProbe()), + "System Probe security context capabilities \ndiff = 
%s", + cmp.Diff(sysProbeCapabilities, agent.DefaultCapabilitiesForSystemProbe()), + ) + + // check volume mounts + wantCoreAgentVolMounts := []corev1.VolumeMount{ + { + Name: v2alpha1.SystemProbeSocketVolumeName, + MountPath: v2alpha1.SystemProbeSocketVolumePath, + ReadOnly: true, + }, + { + Name: v2alpha1.NVIDIADevicesVolumeName, + MountPath: v2alpha1.NVIDIADevicesMountPath, + ReadOnly: true, + }, + } + + wantSystemProbeVolMounts := []corev1.VolumeMount{ + { + Name: v2alpha1.ProcdirVolumeName, + MountPath: v2alpha1.ProcdirMountPath, + ReadOnly: true, + }, + { + Name: v2alpha1.SystemProbeSocketVolumeName, + MountPath: v2alpha1.SystemProbeSocketVolumePath, + ReadOnly: false, + }, + { + Name: v2alpha1.NVIDIADevicesVolumeName, + MountPath: v2alpha1.NVIDIADevicesMountPath, + ReadOnly: true, + }, + } + + coreAgentVolumeMounts := mgr.VolumeMountMgr.VolumeMountsByC[apicommon.CoreAgentContainerName] + assert.True(t, apiutils.IsEqualStruct(coreAgentVolumeMounts, wantCoreAgentVolMounts), "Core agent volume mounts \ndiff = %s", cmp.Diff(coreAgentVolumeMounts, wantCoreAgentVolMounts)) + + systemProbeVolumeMounts := mgr.VolumeMountMgr.VolumeMountsByC[apicommon.SystemProbeContainerName] + assert.True(t, apiutils.IsEqualStruct(systemProbeVolumeMounts, wantSystemProbeVolMounts), "System Probe volume mounts \ndiff = %s", cmp.Diff(systemProbeVolumeMounts, wantSystemProbeVolMounts)) + + // check volumes + wantVolumes := []corev1.Volume{ + { + Name: v2alpha1.ProcdirVolumeName, + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: v2alpha1.ProcdirHostPath, + }, + }, + }, + { + Name: v2alpha1.SystemProbeSocketVolumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: v2alpha1.NVIDIADevicesVolumeName, + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: v2alpha1.DevNullPath, + }, + }, + }, + } + + volumes := mgr.VolumeMgr.Volumes + assert.True(t, 
apiutils.IsEqualStruct(volumes, wantVolumes), "Volumes \ndiff = %s", cmp.Diff(volumes, wantVolumes)) + + // check env vars + wantEnvVars := []*corev1.EnvVar{ + { + Name: v2alpha1.DDSystemProbeSocket, + Value: v2alpha1.DefaultSystemProbeSocketPath, + }, + { + Name: DDEnableGPUMonitoringEnvVar, + Value: "true", + }, + { + Name: NVIDIAVisibleDevicesEnvVar, + Value: "all", + }, + } + agentEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.CoreAgentContainerName] + assert.True(t, apiutils.IsEqualStruct(agentEnvVars, wantEnvVars), "Agent envvars \ndiff = %s", cmp.Diff(agentEnvVars, wantEnvVars)) + + systemProbeEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.SystemProbeContainerName] + assert.True(t, apiutils.IsEqualStruct(systemProbeEnvVars, wantEnvVars), "System Probe envvars \ndiff = %s", cmp.Diff(systemProbeEnvVars, wantEnvVars)) + } + + tests := test.FeatureTestSuite{ + { + Name: "gpu monitoring not enabled", + DDA: ddaGPUMonitoringDisabled.DeepCopy(), + WantConfigure: false, + }, + { + Name: "gpu monitoring enabled", + DDA: ddaGPUMonitoringEnabled, + WantConfigure: true, + Agent: test.NewDefaultComponentTest().WithWantFunc(GPUMonitoringAgentNodeWantFunc), + }, + } + + tests.Run(t, buildFeature) +} diff --git a/internal/controller/datadogagent/feature/ids.go b/internal/controller/datadogagent/feature/ids.go index b395d720d..ecd4365e3 100644 --- a/internal/controller/datadogagent/feature/ids.go +++ b/internal/controller/datadogagent/feature/ids.go @@ -71,4 +71,6 @@ const ( DummyIDType = "dummy" // ServiceDiscoveryType service discovery feature. ServiceDiscoveryType = "service_discovery" + // GPUMonitoringType monitoring feature. 
+ GPUMonitoringType = "gpu" ) diff --git a/internal/controller/datadogagent/feature/test/factory_test.go b/internal/controller/datadogagent/feature/test/factory_test.go index 8d0491d17..5b6ed0e58 100644 --- a/internal/controller/datadogagent/feature/test/factory_test.go +++ b/internal/controller/datadogagent/feature/test/factory_test.go @@ -12,6 +12,7 @@ import ( _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/apm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/cspm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault" + _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/npm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/otelcollector" @@ -295,6 +296,22 @@ func TestBuilder(t *testing.T) { common.AgentDataPlaneContainerName: false, }, }, + { + name: "GPU monitoring enabled, 4 agents", + dda: testutils.NewDatadogAgentBuilder(). + WithGPUMonitoringEnabled(true). 
+ BuildWithDefaults(), + wantAgentContainer: map[common.AgentContainerName]bool{ + common.UnprivilegedSingleAgentContainerName: false, + common.CoreAgentContainerName: true, + common.ProcessAgentContainerName: true, + common.TraceAgentContainerName: true, + common.SystemProbeContainerName: true, + common.SecurityAgentContainerName: false, + common.OtelAgent: false, + common.AgentDataPlaneContainerName: false, + }, + }, } for _, tt := range tests { @@ -304,7 +321,7 @@ func TestBuilder(t *testing.T) { assert.True(t, *requiredComponents.Agent.IsRequired) for name, required := range tt.wantAgentContainer { - assert.Equal(t, required, wantAgentContainer(name, requiredComponents), "Check", name) + assert.Equal(t, required, wantAgentContainer(name, requiredComponents), "container %s", name) } }) } diff --git a/internal/controller/datadogagent_controller_test.go b/internal/controller/datadogagent_controller_test.go index b8442043a..97d7e6def 100644 --- a/internal/controller/datadogagent_controller_test.go +++ b/internal/controller/datadogagent_controller_test.go @@ -168,6 +168,11 @@ var _ = Describe("V2 Controller - DatadogAgent Deployment", func() { "with overrides", testFunction(testutils.NewDatadogAgentWithOverrides(namespace, "with-overrides")), ) + + Context( + "with GPU monitoring", + testFunction(testutils.NewDatadogAgentWithGPUMonitoring(namespace, "with-gpu-monitoring")), + ) }) func testFunction(agent v2alpha1.DatadogAgent) func() { diff --git a/internal/controller/testutils/agent.go b/internal/controller/testutils/agent.go index 89244d725..83a9422cc 100644 --- a/internal/controller/testutils/agent.go +++ b/internal/controller/testutils/agent.go @@ -351,6 +351,19 @@ func NewDatadogAgentWithUSM(namespace string, name string) v2alpha1.DatadogAgent ) } +// NewDatadogAgentWithGPUMonitoring returns an agent with GPU monitoring enabled +func NewDatadogAgentWithGPUMonitoring(namespace string, name string) v2alpha1.DatadogAgent { + return newDatadogAgentWithFeatures( + 
namespace, + name, + &v2alpha1.DatadogFeatures{ + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(true), + }, + }, + ) +} + // NewDatadogAgentWithGlobalConfigSettings returns an agent with some global // settings set func NewDatadogAgentWithGlobalConfigSettings(namespace string, name string) v2alpha1.DatadogAgent { diff --git a/pkg/testutils/builder.go b/pkg/testutils/builder.go index 28a2f4e61..dae0f5c40 100644 --- a/pkg/testutils/builder.go +++ b/pkg/testutils/builder.go @@ -949,3 +949,18 @@ func (builder *DatadogAgentBuilder) WithFIPS(fipsConfig v2alpha1.FIPSConfig) *Da builder.datadogAgent.Spec.Global.FIPS = &fipsConfig return builder } + + +// GPU + +func (builder *DatadogAgentBuilder) initGPUMonitoring() { + if builder.datadogAgent.Spec.Features.GPUMonitoring == nil { + builder.datadogAgent.Spec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{} + } +} + +func (builder *DatadogAgentBuilder) WithGPUMonitoringEnabled(enabled bool) *DatadogAgentBuilder { + builder.initGPUMonitoring() + builder.datadogAgent.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(enabled) + return builder +} From dd0dd9c9c87d1ff30be2d0519787144085bc023d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Tue, 7 Jan 2025 13:40:30 +0000 Subject: [PATCH 2/4] Add tests for runtime class changes --- .../datadogagent/feature/gpu/feature_test.go | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 82a291dbb..01190254e 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -16,6 +16,8 @@ import ( "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test" ) +const alternativeRuntimeClass = "nvidia-like" + func 
Test_GPUMonitoringFeature_Configure(t *testing.T) { ddaGPUMonitoringDisabled := v2alpha1.DatadogAgent{ Spec: v2alpha1.DatadogAgentSpec{ @@ -29,7 +31,13 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { ddaGPUMonitoringEnabled := ddaGPUMonitoringDisabled.DeepCopy() ddaGPUMonitoringEnabled.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(true) - GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + ddaGPUMonitoringEnabledAlternativeRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy() + ddaGPUMonitoringEnabledAlternativeRuntimeClass.Spec.Features.GPUMonitoring.PodRuntimeClassName = apiutils.NewStringPointer(alternativeRuntimeClass) + + ddaGPUMonitoringEnabledANoRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy() + ddaGPUMonitoringEnabledANoRuntimeClass.Spec.Features.GPUMonitoring.PodRuntimeClassName = apiutils.NewStringPointer("") + + GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers, expectedRuntimeClass string) { mgr := mgrInterface.(*fake.PodTemplateManagers) // check security context capabilities @@ -128,6 +136,13 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { systemProbeEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.SystemProbeContainerName] assert.True(t, apiutils.IsEqualStruct(systemProbeEnvVars, wantEnvVars), "System Probe envvars \ndiff = %s", cmp.Diff(systemProbeEnvVars, wantEnvVars)) + + // Check runtime class + if expectedRuntimeClass == "" { + assert.Nil(t, mgr.PodTemplateSpec().Spec.RuntimeClassName) + } else { + assert.Equal(t, expectedRuntimeClass, *mgr.PodTemplateSpec().Spec.RuntimeClassName) + } } tests := test.FeatureTestSuite{ @@ -140,7 +155,20 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { Name: "gpu monitoring enabled", DDA: ddaGPUMonitoringEnabled, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(GPUMonitoringAgentNodeWantFunc), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t 
testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) }), + }, + { + Name: "gpu monitoring enabled, alternative runtime class", + DDA: ddaGPUMonitoringEnabledAlternativeRuntimeClass, + WantConfigure: true, + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass) }), + }, + + { + Name: "gpu monitoring enabled, no runtime class", + DDA: ddaGPUMonitoringEnabledANoRuntimeClass, + WantConfigure: true, + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "") }), }, } From 7d40ddb8b101a38375467bf57397c9149d71b693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Wed, 8 Jan 2025 10:07:31 +0000 Subject: [PATCH 3/4] Documentation --- config/manager/kustomization.yaml | 4 ++-- .../datadogagent/feature/gpu/feature.go | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 58b7c4148..8c4744223 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -2,7 +2,7 @@ resources: - manager.yaml images: - name: controller - newName: gcr.io/datadoghq/operator - newTag: 1.11.1 + newName: 601427279990.dkr.ecr.us-east-1.amazonaws.com/guillermo.julian/sandbox + newTag: operator apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index 1d4f8f9ff..eccee6a9d 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -22,6 +22,9 @@ func buildFeature(*feature.Options) feature.Feature { } type 
gpuMonitoringFeature struct { + // podRuntimeClassName is the value to set in the runtimeClassName + // configuration of the agent pod. If this is empty, the runtimeClassName + // will not be changed. podRuntimeClassName string } @@ -42,8 +45,11 @@ func (f *gpuMonitoringFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp fe } if dda.Spec.Features.GPUMonitoring.PodRuntimeClassName == nil { + // Configuration option not set, so revert to the default f.podRuntimeClassName = v2alpha1.DefaultGPUMonitoringRuntimeClass } else { + // Configuration option set, use the value. Note that here the value might be an empty + // string, which tells us to not change the runtime class. f.podRuntimeClassName = *dda.Spec.Features.GPUMonitoring.PodRuntimeClassName } @@ -122,7 +128,7 @@ func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManag // env variable. This is usually configured with the options // accept-nvidia-visible-devices-envvar-when-unprivileged = true // accept-nvidia-visible-devices-as-volume-mounts = true - // in the NVIDIA conatiner runtime config. In this case, we need to mount the + // in the NVIDIA container runtime config. In this case, we need to mount the // /var/run/nvidia-container-devices/all directory into the container, so that // the nvidia-container-runtime can see that we want to use all GPUs. devicesVol, devicesMount := volume.GetVolumes(v2alpha1.NVIDIADevicesVolumeName, v2alpha1.DevNullPath, v2alpha1.NVIDIADevicesMountPath, true) @@ -134,8 +140,11 @@ func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManag managers.PodTemplateSpec().Spec.RuntimeClassName = &f.podRuntimeClassName } - // Note: we don't need to mount the NVML library, as it's mounted automatically - // by the nvidia-container-runtime. + // Note: we don't need to mount the NVML library, as it's mounted + // automatically by the nvidia-container-runtime. 
However, if needed, we + // could add a config option for that and mount that in the agent and + // system-probe folders, and then set the correct configuration option so + // that the binaries can find the library. return nil } From da4ab242822e62b7af778e5226fb476290c1cce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 9 Jan 2025 11:02:55 +0000 Subject: [PATCH 4/4] Update docs --- api/datadoghq/v2alpha1/datadogagent_types.go | 2 +- config/crd/bases/v1/datadoghq.com_datadogagents.yaml | 4 ++-- .../v1/datadoghq.com_datadogagents_v2alpha1.json | 4 ++-- docs/configuration.v2alpha1.md | 2 +- .../datadogagent/feature/gpu/feature_test.go | 12 +++++++++--- pkg/testutils/builder.go | 1 - 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 1756ba844..232b653ed 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -508,7 +508,7 @@ type GPUMonitoringFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` // PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - // If left empty, the runtime class will not be set. + // If left empty, the runtime class is not set. // Default: nvidia // +optional PodRuntimeClassName *string `json:"requiredRuntimeClassName"` diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index f862243d3..effa55c25 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -1030,7 +1030,7 @@ spec: requiredRuntimeClassName: description: |- PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - If left empty, the runtime class will not be set. + If left empty, the runtime class is not set. 
Default: nvidia type: string type: object @@ -7909,7 +7909,7 @@ spec: requiredRuntimeClassName: description: |- PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - If left empty, the runtime class will not be set. + If left empty, the runtime class is not set. Default: nvidia type: string type: object diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 62a36b6d3..d5922be92 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -1074,7 +1074,7 @@ "type": "boolean" }, "requiredRuntimeClassName": { - "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class is not set.\nDefault: nvidia", "type": "string" } }, @@ -7895,7 +7895,7 @@ "type": "boolean" }, "requiredRuntimeClassName": { - "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class is not set.\nDefault: nvidia", "type": "string" } }, diff --git a/docs/configuration.v2alpha1.md b/docs/configuration.v2alpha1.md index 40a37b5b0..7a73f5c46 100644 --- a/docs/configuration.v2alpha1.md +++ b/docs/configuration.v2alpha1.md @@ -112,7 +112,7 @@ spec: | features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics CRD (allowing one to scale on arbitrary Datadog metric queries). 
Default: true | | features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false | | features.gpu.enabled | Enables GPU monitoring. Default: false | -| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class will not be set. Default: nvidia | +| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class is not set. Default: nvidia | | features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false | | features.helmCheck.enabled | Enables the Helm check. Default: false | | features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). 
Default: {} | diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 01190254e..65224cd7d 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -155,20 +155,26 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { Name: "gpu monitoring enabled", DDA: ddaGPUMonitoringEnabled, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) }), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) + }), }, { Name: "gpu monitoring enabled, alternative runtime class", DDA: ddaGPUMonitoringEnabledAlternativeRuntimeClass, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass) }), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass) + }), }, { Name: "gpu monitoring enabled, no runtime class", DDA: ddaGPUMonitoringEnabledANoRuntimeClass, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "") }), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "") + }), }, } diff --git a/pkg/testutils/builder.go b/pkg/testutils/builder.go index 
dae0f5c40..1a0c21107 100644 --- a/pkg/testutils/builder.go +++ b/pkg/testutils/builder.go @@ -950,7 +950,6 @@ func (builder *DatadogAgentBuilder) WithFIPS(fipsConfig v2alpha1.FIPSConfig) *Da return builder } - // GPU func (builder *DatadogAgentBuilder) initGPUMonitoring() {