Skip to content

Commit b0c1a56

Browse files
Merge pull request #3807 from bood/pg-completed-pod-error-case
fix: mark PodGroup completed when pod fails
2 parents dca4160 + 9ac1c8c commit b0c1a56

File tree

4 files changed

+84
-4
lines changed

4 files changed

+84
-4
lines changed

pkg/scheduler/api/helpers.go

+10
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,16 @@ func AllocatedStatus(status TaskStatus) bool {
8080
}
8181
}
8282

83+
// CompletedStatus checks whether the tasks are completed (regardless of failure or success)
84+
func CompletedStatus(status TaskStatus) bool {
85+
switch status {
86+
case Failed, Succeeded:
87+
return true
88+
default:
89+
return false
90+
}
91+
}
92+
8393
// MergeErrors is used to merge multiple errors into single error
8494
func MergeErrors(errs ...error) error {
8595
msg := "errors: "

pkg/scheduler/framework/session.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -333,16 +333,16 @@ func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
333333
} else {
334334
allocated := 0
335335
for status, tasks := range jobInfo.TaskStatusIndex {
336-
if api.AllocatedStatus(status) || status == api.Succeeded {
336+
if api.AllocatedStatus(status) || api.CompletedStatus(status) {
337337
allocated += len(tasks)
338338
}
339339
}
340340

341341
// If there are enough allocated resources, it's running
342342
if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember {
343343
status.Phase = scheduling.PodGroupRunning
344-
// If all allocated tasks is succeeded, it's completed
345-
if len(jobInfo.TaskStatusIndex[api.Succeeded]) == allocated {
344+
// If all allocated tasks are succeeded or failed, it's completed
345+
if len(jobInfo.TaskStatusIndex[api.Succeeded])+len(jobInfo.TaskStatusIndex[api.Failed]) == allocated {
346346
status.Phase = scheduling.PodGroupCompleted
347347
}
348348
} else if jobInfo.PodGroup.Status.Phase != scheduling.PodGroupInqueue {

test/e2e/schedulingbase/job_scheduling.go

+23
Original file line numberDiff line numberDiff line change
@@ -594,4 +594,27 @@ var _ = Describe("Job E2E Test", func() {
594594
Expect(pgPhase).To(Equal(vcscheduling.PodGroupCompleted), "podGroup Phase is %s, should be %s",
595595
ctx.Namespace, vcscheduling.PodGroupCompleted)
596596
})
597+
598+
It("PodGroup's Phase should be Completed when Job fails", func() {
	ctx := e2eutil.InitTestContext(e2eutil.Options{})
	defer e2eutil.CleanupTestContext(ctx)

	// Submit a job whose pod always exits non-zero; the backing PodGroup
	// should still be driven to Completed once the pod terminates.
	jb := e2eutil.CreateFailK8sJob(ctx, "job1", e2eutil.DefaultNginxImage, e2eutil.OneCPU)
	err := e2eutil.Waitk8sJobCompleted(ctx, jb.Name)
	Expect(err).NotTo(HaveOccurred())

	var pgPhase vcscheduling.PodGroupPhase
	// Poll until the PodGroup leaves Running. The poll error is deliberately
	// ignored: on timeout pgPhase stays Running and the final Expect below
	// fails with the observed phase.
	wait.Poll(time.Second, time.Second*30, func() (bool, error) {
		pgs, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(ctx.Namespace).List(context.TODO(), metav1.ListOptions{})
		Expect(err).NotTo(HaveOccurred(), "failed to list podGroups in namespace %s", ctx.Namespace)
		Expect(len(pgs.Items)).To(Equal(1), "this test need a clean cluster")
		pgPhase = pgs.Items[0].Status.Phase
		return pgPhase != vcscheduling.PodGroupRunning, nil
	})
	// Fix: the failure message previously interpolated ctx.Namespace as the
	// "podGroup Phase is %s" argument; report the observed phase instead.
	Expect(pgPhase).To(Equal(vcscheduling.PodGroupCompleted), "podGroup Phase is %s, should be %s",
		pgPhase, vcscheduling.PodGroupCompleted)
})
597620
})

test/e2e/util/deployment.go

+48-1
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,53 @@ func CreateSampleK8sJob(ctx *TestContext, name string, img string, req v1.Resour
159159
return jb
160160
}
161161

162+
// CreateFailK8sJob creates a new k8s job that fails
163+
func CreateFailK8sJob(ctx *TestContext, name string, img string, req v1.ResourceList) *batchv1.Job {
164+
k8sjobname := "job.k8s.io"
165+
defaultTrue := true
166+
// no retries to save time
167+
defaultBackoffLimit := int32(0)
168+
j := &batchv1.Job{
169+
ObjectMeta: metav1.ObjectMeta{
170+
Name: name,
171+
},
172+
Spec: batchv1.JobSpec{
173+
Selector: &metav1.LabelSelector{
174+
MatchLabels: map[string]string{
175+
k8sjobname: name,
176+
},
177+
},
178+
ManualSelector: &defaultTrue,
179+
BackoffLimit: &defaultBackoffLimit,
180+
Template: v1.PodTemplateSpec{
181+
ObjectMeta: metav1.ObjectMeta{
182+
Labels: map[string]string{k8sjobname: name},
183+
},
184+
Spec: v1.PodSpec{
185+
SchedulerName: "volcano",
186+
RestartPolicy: v1.RestartPolicyNever,
187+
Containers: []v1.Container{
188+
{
189+
Image: img,
190+
Name: name,
191+
Command: []string{"/bin/sh", "-c", "sleep 10 && exit 1"},
192+
ImagePullPolicy: v1.PullIfNotPresent,
193+
Resources: v1.ResourceRequirements{
194+
Requests: req,
195+
},
196+
},
197+
},
198+
},
199+
},
200+
},
201+
}
202+
203+
jb, err := ctx.Kubeclient.BatchV1().Jobs(ctx.Namespace).Create(context.TODO(), j, metav1.CreateOptions{})
204+
Expect(err).NotTo(HaveOccurred(), "failed to create k8sjob %s", name)
205+
206+
return jb
207+
}
208+
162209
func k8sjobCompleted(ctx *TestContext, name string) wait.ConditionFunc {
163210
return func() (bool, error) {
164211
jb, err := ctx.Kubeclient.BatchV1().Jobs(ctx.Namespace).Get(context.TODO(), name, metav1.GetOptions{})
@@ -173,7 +220,7 @@ func k8sjobCompleted(ctx *TestContext, name string) wait.ConditionFunc {
173220
if !labelSelector.Matches(labels.Set(pod.Labels)) {
174221
continue
175222
}
176-
if pod.Status.Phase == v1.PodSucceeded {
223+
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
177224
return true, nil
178225
}
179226
}

0 commit comments

Comments
 (0)