Skip to content

Commit cd7c1fb

Browse files
mikedldalphanota
andauthored
fix: retry on errors when watching pods (#9373)
* fix: retry on errors when watching pods If timeout (or some network error?) occurs while waiting for a pod initialization or termination event, e.g. when build takes a long time, skaffold becomes stuck and never finishes the operation. Use retry watcher to handle the errors gracefully. * chore: run `go mod vendor` to pull new dependencies * test: fixup `TestWaitForPodSucceeded` unit test * chore: run `go mod vendor` to sync dependencies --------- Co-authored-by: Angel Montero <[email protected]>
1 parent 0b5cea8 commit cd7c1fb

38 files changed

+8663
-2
lines changed

pkg/skaffold/kubernetes/wait.go

+17-2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ import (
3232
"k8s.io/apimachinery/pkg/watch"
3333
"k8s.io/client-go/kubernetes"
3434
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
35+
"k8s.io/client-go/tools/cache"
36+
watchtools "k8s.io/client-go/tools/watch"
3537

3638
"github.com/GoogleContainerTools/skaffold/v2/pkg/skaffold/output/log"
3739
)
@@ -61,7 +63,7 @@ func watchUntilTimeout(ctx context.Context, timeout time.Duration, w watch.Inter
6163
func WaitForPodSucceeded(ctx context.Context, pods corev1.PodInterface, podName string, timeout time.Duration) error {
6264
log.Entry(ctx).Infof("Waiting for %s to be complete", podName)
6365

64-
w, err := pods.Watch(ctx, metav1.ListOptions{})
66+
w, err := newPodsWatcher(ctx, pods)
6567
if err != nil {
6668
return fmt.Errorf("initializing pod watcher: %s", err)
6769
}
@@ -101,7 +103,7 @@ func isPodSucceeded(podName string) func(event *watch.Event) (bool, error) {
101103
func WaitForPodInitialized(ctx context.Context, pods corev1.PodInterface, podName string) error {
102104
log.Entry(ctx).Infof("Waiting for %s to be initialized", podName)
103105

104-
w, err := pods.Watch(ctx, metav1.ListOptions{})
106+
w, err := newPodsWatcher(ctx, pods)
105107
if err != nil {
106108
return fmt.Errorf("initializing pod watcher: %s", err)
107109
}
@@ -158,3 +160,16 @@ func WaitForDeploymentToStabilize(ctx context.Context, c kubernetes.Interface, n
158160
return false, nil
159161
})
160162
}
163+
164+
func newPodsWatcher(ctx context.Context, pods corev1.PodInterface) (watch.Interface, error) {
165+
initList, err := pods.List(ctx, metav1.ListOptions{})
166+
if err != nil {
167+
return nil, err
168+
}
169+
170+
return watchtools.NewRetryWatcher(initList.GetResourceVersion(), &cache.ListWatch{
171+
WatchFunc: func(listOptions metav1.ListOptions) (watch.Interface, error) {
172+
return pods.Watch(ctx, listOptions)
173+
},
174+
})
175+
}

pkg/skaffold/kubernetes/wait_test.go

+11
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ import (
2323

2424
v1 "k8s.io/api/core/v1"
2525
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/runtime"
2627
"k8s.io/apimachinery/pkg/watch"
2728
fakekubeclientset "k8s.io/client-go/kubernetes/fake"
29+
clienttesting "k8s.io/client-go/testing"
2830

2931
"github.com/GoogleContainerTools/skaffold/v2/testutil"
3032
)
@@ -62,6 +64,9 @@ func TestWaitForPodSucceeded(t *testing.T) {
6264
pod := &v1.Pod{}
6365
client := fakekubeclientset.NewSimpleClientset(pod)
6466

67+
client.PrependReactor("list", "pods", func(action clienttesting.Action) (handled bool, ret runtime.Object, err error) {
68+
return true, &v1.PodList{ListMeta: metav1.ListMeta{ResourceVersion: "1"}}, nil
69+
})
6570
fakeWatcher := watch.NewRaceFreeFake()
6671
client.PrependWatchReactor("*", testutil.SetupFakeWatcher(fakeWatcher))
6772
fakePods := client.CoreV1().Pods("")
@@ -78,12 +83,18 @@ func TestWaitForPodSucceeded(t *testing.T) {
7883
switch phase {
7984
case v1.PodPending, v1.PodRunning, v1.PodFailed, v1.PodSucceeded, v1.PodUnknown:
8085
fakeWatcher.Modify(&v1.Pod{
86+
ObjectMeta: metav1.ObjectMeta{
87+
ResourceVersion: "1",
88+
},
8189
Status: v1.PodStatus{
8290
Phase: phase,
8391
},
8492
})
8593
default:
8694
fakeWatcher.Modify(&metav1.Status{
95+
ListMeta: metav1.ListMeta{
96+
ResourceVersion: "1",
97+
},
8798
Status: "Failure",
8899
})
89100
}

vendor/k8s.io/apimachinery/pkg/util/cache/expiring.go

+202
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)