Skip to content

Commit efcb779

Browse files
authored
Add taints for AL2023 NodeGroups as tolerations for Nvidia device plugin daemonset (#8627)
Updated the check that applies nodegroup taints as tolerations to the Nvidia device plugin daemonset so that it also covers AL2023 AMIs
1 parent 2d370db commit efcb779

File tree

3 files changed

+266
-2
lines changed

3 files changed

+266
-2
lines changed

pkg/addons/addons_suite_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package addons_test
2+
3+
import (
4+
"testing"
5+
6+
"github.com/weaveworks/eksctl/pkg/testutils"
7+
)
8+
9+
// TestAddons is the `go test` entry point for package addons_test. It hands t
// to testutils.RegisterAndRun; presumably that helper wires up and runs the
// Ginkgo specs declared in this package — confirm in pkg/testutils.
func TestAddons(t *testing.T) {
10+
testutils.RegisterAndRun(t)
11+
}

pkg/addons/device_plugin.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ func (n *NvidiaDevicePlugin) SetTolerations(spec *corev1.PodTemplateSpec) error
224224
taints := make(map[string]api.NodeGroupTaint)
225225
for _, ng := range n.spec.NodeGroups {
226226
if api.HasInstanceType(ng, instance.IsNvidiaInstanceType) &&
227-
ng.GetAMIFamily() == api.NodeImageFamilyAmazonLinux2 {
227+
(ng.GetAMIFamily() == api.NodeImageFamilyAmazonLinux2 || ng.GetAMIFamily() == api.NodeImageFamilyAmazonLinux2023) {
228228
for _, taint := range ng.Taints {
229229
if _, ok := taints[taint.Key]; !ok {
230230
taints[taint.Key] = taint
@@ -234,7 +234,7 @@ func (n *NvidiaDevicePlugin) SetTolerations(spec *corev1.PodTemplateSpec) error
234234
}
235235
for _, ng := range n.spec.ManagedNodeGroups {
236236
if api.HasInstanceTypeManaged(ng, instance.IsNvidiaInstanceType) &&
237-
ng.GetAMIFamily() == api.NodeImageFamilyAmazonLinux2 {
237+
(ng.GetAMIFamily() == api.NodeImageFamilyAmazonLinux2 || ng.GetAMIFamily() == api.NodeImageFamilyAmazonLinux2023) {
238238
for _, taint := range ng.Taints {
239239
if _, ok := taints[taint.Key]; !ok {
240240
taints[taint.Key] = taint

pkg/addons/device_plugin_test.go

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
package addons_test
2+
3+
import (
4+
. "github.com/onsi/ginkgo/v2"
5+
. "github.com/onsi/gomega"
6+
7+
"github.com/weaveworks/eksctl/pkg/addons"
8+
api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5"
9+
corev1 "k8s.io/api/core/v1"
10+
)
11+
12+
var _ = Describe("NvidiaDevicePlugin", func() {
13+
Describe("SetTolerations", func() {
14+
var (
15+
plugin *addons.NvidiaDevicePlugin
16+
spec *corev1.PodTemplateSpec
17+
config *api.ClusterConfig
18+
)
19+
20+
// Rebuild the shared fixtures before each spec: spec gets an empty, non-nil
// Tolerations slice and config a zero-value ClusterConfig, so every spec
// starts without nodegroups or pre-existing tolerations.
BeforeEach(func() {
21+
spec = &corev1.PodTemplateSpec{
22+
Spec: corev1.PodSpec{
23+
Tolerations: []corev1.Toleration{},
24+
},
25+
}
26+
config = &api.ClusterConfig{}
27+
})
28+
29+
Context("with NodeGroups", func() {
30+
// Unmanaged nodegroup on AmazonLinux2 with a g4dn.xlarge instance and two
// taints: the spec expects exactly two tolerations, one per taint key.
It("should add tolerations for AmazonLinux2 nodegroups with NVIDIA instances", func() {
31+
config.NodeGroups = []*api.NodeGroup{
32+
{
33+
NodeGroupBase: &api.NodeGroupBase{
34+
Name: "nvidia-ng",
35+
InstanceType: "g4dn.xlarge",
36+
AMIFamily: api.NodeImageFamilyAmazonLinux2,
37+
},
38+
Taints: []api.NodeGroupTaint{
39+
{Key: "nvidia.com/gpu", Value: "true", Effect: "NoSchedule"},
40+
{Key: "workload", Value: "gpu", Effect: "NoExecute"},
41+
},
42+
},
43+
}
44+
45+
// The constructor result is type-asserted to the concrete plugin type so
// SetTolerations can be called on it directly.
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
46+
err := plugin.SetTolerations(spec)
47+
48+
Expect(err).NotTo(HaveOccurred())
49+
Expect(spec.Spec.Tolerations).To(HaveLen(2))
// Membership checks rather than index checks: presumably the resulting order
// is not guaranteed — confirm against SetTolerations. The expected literals
// set only Key and Value; Effect is left at its zero value.
50+
Expect(spec.Spec.Tolerations).To(ContainElement(corev1.Toleration{
51+
Key: "nvidia.com/gpu",
52+
Value: "true",
53+
}))
54+
Expect(spec.Spec.Tolerations).To(ContainElement(corev1.Toleration{
55+
Key: "workload",
56+
Value: "gpu",
57+
}))
58+
})
59+
60+
// Same scenario as the AmazonLinux2 spec but with AMIFamily set to
// AmazonLinux2023 (the family this commit adds to the check): a single taint
// on a g5.xlarge nodegroup must yield a single matching toleration.
It("should add tolerations for AmazonLinux2023 nodegroups with NVIDIA instances", func() {
61+
config.NodeGroups = []*api.NodeGroup{
62+
{
63+
NodeGroupBase: &api.NodeGroupBase{
64+
Name: "nvidia-ng",
65+
InstanceType: "g5.xlarge",
66+
AMIFamily: api.NodeImageFamilyAmazonLinux2023,
67+
},
68+
Taints: []api.NodeGroupTaint{
69+
{Key: "gpu-workload", Value: "ml", Effect: "NoSchedule"},
70+
},
71+
},
72+
}
73+
74+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
75+
err := plugin.SetTolerations(spec)
76+
77+
Expect(err).NotTo(HaveOccurred())
// Only one toleration is expected, so indexing element 0 is safe once the
// length assertion has passed.
78+
Expect(spec.Spec.Tolerations).To(HaveLen(1))
79+
Expect(spec.Spec.Tolerations[0].Key).To(Equal("gpu-workload"))
80+
Expect(spec.Spec.Tolerations[0].Value).To(Equal("ml"))
81+
})
82+
83+
// Negative case: an m5.large nodegroup on AmazonLinux2 carries a taint, and
// the spec expects SetTolerations to leave the toleration list empty.
It("should not add tolerations for non-NVIDIA instances", func() {
84+
config.NodeGroups = []*api.NodeGroup{
85+
{
86+
NodeGroupBase: &api.NodeGroupBase{
87+
Name: "cpu-ng",
88+
InstanceType: "m5.large",
89+
AMIFamily: api.NodeImageFamilyAmazonLinux2,
90+
},
91+
Taints: []api.NodeGroupTaint{
92+
{Key: "cpu-only", Value: "true", Effect: "NoSchedule"},
93+
},
94+
},
95+
}
96+
97+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
98+
err := plugin.SetTolerations(spec)
99+
100+
Expect(err).NotTo(HaveOccurred())
101+
Expect(spec.Spec.Tolerations).To(BeEmpty())
102+
})
103+
104+
// Negative case for the AMI-family half of the check: the instance type is
// g4dn.xlarge but the family is Ubuntu2004, so no toleration is expected even
// though the nodegroup is tainted.
It("should not add tolerations for unsupported AMI families", func() {
105+
config.NodeGroups = []*api.NodeGroup{
106+
{
107+
NodeGroupBase: &api.NodeGroupBase{
108+
Name: "nvidia-ng",
109+
InstanceType: "g4dn.xlarge",
110+
AMIFamily: api.NodeImageFamilyUbuntu2004,
111+
},
112+
Taints: []api.NodeGroupTaint{
113+
{Key: "nvidia.com/gpu", Value: "true", Effect: "NoSchedule"},
114+
},
115+
},
116+
}
117+
118+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
119+
err := plugin.SetTolerations(spec)
120+
121+
Expect(err).NotTo(HaveOccurred())
122+
Expect(spec.Spec.Tolerations).To(BeEmpty())
123+
})
124+
})
125+
126+
Context("with ManagedNodeGroups", func() {
127+
// Managed-nodegroup counterpart of the AmazonLinux2 spec: the taint is set on
// config.ManagedNodeGroups (g4dn.2xlarge) and must surface as one toleration.
It("should add tolerations for AmazonLinux2 managed nodegroups with NVIDIA instances", func() {
128+
config.ManagedNodeGroups = []*api.ManagedNodeGroup{
129+
{
130+
NodeGroupBase: &api.NodeGroupBase{
131+
Name: "managed-nvidia-ng",
132+
InstanceType: "g4dn.2xlarge",
133+
AMIFamily: api.NodeImageFamilyAmazonLinux2,
134+
},
135+
Taints: []api.NodeGroupTaint{
136+
{Key: "managed-gpu", Value: "nvidia", Effect: "NoSchedule"},
137+
},
138+
},
139+
}
140+
141+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
142+
err := plugin.SetTolerations(spec)
143+
144+
Expect(err).NotTo(HaveOccurred())
145+
Expect(spec.Spec.Tolerations).To(HaveLen(1))
146+
Expect(spec.Spec.Tolerations[0].Key).To(Equal("managed-gpu"))
147+
Expect(spec.Spec.Tolerations[0].Value).To(Equal("nvidia"))
148+
})
149+
150+
// Managed nodegroup on AmazonLinux2023 (g5.4xlarge): exercises the
// managed-nodegroup branch of the check this commit extends. The single
// NoExecute taint must produce one toleration with the same key and value.
It("should add tolerations for AmazonLinux2023 managed nodegroups with NVIDIA instances", func() {
151+
config.ManagedNodeGroups = []*api.ManagedNodeGroup{
152+
{
153+
NodeGroupBase: &api.NodeGroupBase{
154+
Name: "managed-nvidia-ng",
155+
InstanceType: "g5.4xlarge",
156+
AMIFamily: api.NodeImageFamilyAmazonLinux2023,
157+
},
158+
Taints: []api.NodeGroupTaint{
159+
{Key: "ml-workload", Value: "training", Effect: "NoExecute"},
160+
},
161+
},
162+
}
163+
164+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
165+
err := plugin.SetTolerations(spec)
166+
167+
Expect(err).NotTo(HaveOccurred())
168+
Expect(spec.Spec.Tolerations).To(HaveLen(1))
169+
Expect(spec.Spec.Tolerations[0].Key).To(Equal("ml-workload"))
170+
Expect(spec.Spec.Tolerations[0].Value).To(Equal("training"))
171+
})
172+
})
173+
174+
Context("with existing tolerations", func() {
175+
// The pod spec already tolerates key "existing-taint". The nodegroup has a
// taint with the same key but a different value, plus a new key. The spec
// expects the pre-existing toleration to be kept unchanged and only the new
// key to be appended.
It("should not duplicate existing tolerations", func() {
176+
spec.Spec.Tolerations = []corev1.Toleration{
177+
{Key: "existing-taint", Value: "existing-value"},
178+
}
179+
180+
config.NodeGroups = []*api.NodeGroup{
181+
{
182+
NodeGroupBase: &api.NodeGroupBase{
183+
Name: "nvidia-ng",
184+
InstanceType: "g4dn.xlarge",
185+
AMIFamily: api.NodeImageFamilyAmazonLinux2,
186+
},
187+
Taints: []api.NodeGroupTaint{
188+
{Key: "existing-taint", Value: "different-value", Effect: "NoSchedule"},
189+
{Key: "new-taint", Value: "new-value", Effect: "NoSchedule"},
190+
},
191+
},
192+
}
193+
194+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
195+
err := plugin.SetTolerations(spec)
196+
197+
Expect(err).NotTo(HaveOccurred())
// Two entries in total: the original one (still "existing-value", not
// "different-value") and the one for "new-taint".
198+
Expect(spec.Spec.Tolerations).To(HaveLen(2))
199+
Expect(spec.Spec.Tolerations).To(ContainElement(corev1.Toleration{
200+
Key: "existing-taint",
201+
Value: "existing-value",
202+
}))
203+
Expect(spec.Spec.Tolerations).To(ContainElement(corev1.Toleration{
204+
Key: "new-taint",
205+
Value: "new-value",
206+
}))
207+
})
208+
})
209+
210+
Context("with mixed nodegroup types", func() {
211+
// One unmanaged nodegroup (AmazonLinux2, g4dn.xlarge) and one managed
// nodegroup (AmazonLinux2023, g5.xlarge), each with its own taint key: the
// spec expects both keys to end up as tolerations on the same pod spec.
It("should combine taints from both regular and managed nodegroups", func() {
212+
config.NodeGroups = []*api.NodeGroup{
213+
{
214+
NodeGroupBase: &api.NodeGroupBase{
215+
Name: "nvidia-ng",
216+
InstanceType: "g4dn.xlarge",
217+
AMIFamily: api.NodeImageFamilyAmazonLinux2,
218+
},
219+
Taints: []api.NodeGroupTaint{
220+
{Key: "regular-gpu", Value: "nvidia", Effect: "NoSchedule"},
221+
},
222+
},
223+
}
224+
config.ManagedNodeGroups = []*api.ManagedNodeGroup{
225+
{
226+
NodeGroupBase: &api.NodeGroupBase{
227+
Name: "managed-nvidia-ng",
228+
InstanceType: "g5.xlarge",
229+
AMIFamily: api.NodeImageFamilyAmazonLinux2023,
230+
},
231+
Taints: []api.NodeGroupTaint{
232+
{Key: "managed-gpu", Value: "nvidia", Effect: "NoSchedule"},
233+
},
234+
},
235+
}
236+
237+
plugin = addons.NewNvidiaDevicePlugin(nil, "us-west-2", false, config).(*addons.NvidiaDevicePlugin)
238+
err := plugin.SetTolerations(spec)
239+
240+
Expect(err).NotTo(HaveOccurred())
241+
Expect(spec.Spec.Tolerations).To(HaveLen(2))
// Membership checks keep the spec independent of the order in which the two
// nodegroup lists contribute their tolerations.
242+
Expect(spec.Spec.Tolerations).To(ContainElement(corev1.Toleration{
243+
Key: "regular-gpu",
244+
Value: "nvidia",
245+
}))
246+
Expect(spec.Spec.Tolerations).To(ContainElement(corev1.Toleration{
247+
Key: "managed-gpu",
248+
Value: "nvidia",
249+
}))
250+
})
251+
})
252+
})
253+
})

0 commit comments

Comments
 (0)