@@ -26,6 +26,8 @@ type recovery struct {
 	isRecovering bool      // true while in recovery process
 	lastFailure  time.Time // time of most recent failure
 	failures     int       // failure count for backoff
+	stopCh       chan struct{}
+	wg           sync.WaitGroup
 }

 func newRecovery(
@@ -35,6 +37,7 @@ func newRecovery(
 		watcher: watcher,
 		state:   state,
 		backoff: backoff.NewExponential(config.BaseRetryInterval, config.MaxRetryInterval),
+		stopCh:  make(chan struct{}),
 	}
 }

@@ -80,11 +83,13 @@ func (r *recovery) handleError(ctx context.Context, msg *pendingMessage, err err
 	log.Ctx(ctx).Debug().Msg("Reset dispatcher state after publish failure")

 	// Launch recovery goroutine
+	r.wg.Add(1)
 	go r.recoveryLoop(ctx, r.failures)
 }

 // recoveryLoop handles the recovery process with backoff
 func (r *recovery) recoveryLoop(ctx context.Context, failures int) {
+	defer r.wg.Done()
 	defer func() {
 		r.mu.Lock()
 		r.isRecovering = false
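
The r.wg.Add(1) before the go statement and the defer r.wg.Done() at the top of recoveryLoop pair up so a later wg.Wait() reliably blocks until the loop has exited; Add has to happen before the goroutine is launched, otherwise a concurrent Wait could return early. A minimal, self-contained sketch of that pairing (the worker body is illustrative only, not taken from this change):

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var wg sync.WaitGroup

	wg.Add(1) // register the goroutine before launching it, never inside it
	go func() {
		defer wg.Done() // signal completion on every exit path
		time.Sleep(50 * time.Millisecond)
		fmt.Println("recovery work finished")
	}()

	wg.Wait() // blocks until Done has been called
	fmt.Println("no recovery goroutine left running")
}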
@@ -95,33 +100,51 @@ func (r *recovery) recoveryLoop(ctx context.Context, failures int) {
 		// Perform backoff
 		backoffDuration := r.backoff.BackoffDuration(failures)
 		log.Debug().Int("failures", failures).Dur("backoff", backoffDuration).Msg("Performing backoff")
-		r.backoff.Backoff(ctx, failures)
+
+		// Perform backoff with interruptibility
+		timer := time.NewTimer(backoffDuration)
+		select {
+		case <-timer.C:
+		case <-r.stopCh:
+			timer.Stop()
+			return
+		case <-ctx.Done():
+			timer.Stop()
+			return
+		}

 		// Just restart the watcher - it will resume from last checkpoint
 		if err := r.watcher.Start(ctx); err != nil {
 			if r.watcher.Stats().State == watcher.StateRunning {
 				log.Debug().Msg("Watcher already running after recovery. Exiting recovery loop.")
 				return
 			}
-			if ctx.Err() != nil {
+			select {
+			case <-r.stopCh:
+				return
+			case <-ctx.Done():
 				return
+			default:
+				log.Error().Err(err).Msg("Failed to restart watcher after backoff. Retrying...")
+				failures++
 			}
-			log.Error().Err(err).Msg("Failed to restart watcher after backoff. Retrying...")
-			failures++
 		} else {
-			log.Info().Msg("Successfully restarted watcher after backoff")
+			log.Debug().Msg("Successfully restarted watcher after backoff")
 			return
 		}
 	}
 }

 // reset resets the recovery state
 func (r *recovery) reset() {
+	r.stop() // Stop any existing recovery first
+
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	r.isRecovering = false
 	r.lastFailure = time.Time{}
 	r.failures = 0
+	r.stopCh = make(chan struct{})
 }

 // getState returns current recovery state
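
The timer-plus-select block that replaces r.backoff.Backoff waits out the backoff duration but returns immediately if the stop channel is closed or the context is cancelled. The sketch below isolates that wait in a hypothetical helper, sleepInterruptibly (not part of the diff), to show the behavior on its own; deferring timer.Stop covers both early-return paths, which matches the explicit Stop calls above.

package main

import (
	"context"
	"fmt"
	"time"
)

// sleepInterruptibly waits for d, but returns early (false) if stopCh is
// closed or ctx is cancelled; it returns true when the full duration elapsed.
func sleepInterruptibly(ctx context.Context, d time.Duration, stopCh <-chan struct{}) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()
	select {
	case <-timer.C:
		return true
	case <-stopCh:
		return false
	case <-ctx.Done():
		return false
	}
}

func main() {
	stopCh := make(chan struct{})
	go func() {
		time.Sleep(100 * time.Millisecond)
		close(stopCh) // simulate stop() being called mid-backoff
	}()

	completed := sleepInterruptibly(context.Background(), 5*time.Second, stopCh)
	fmt.Println("backoff completed:", completed) // prints false: the wait was interrupted
}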
@@ -132,3 +155,17 @@ func (r *recovery) getState() (bool, time.Time, int) {
 	defer r.mu.RUnlock()
 	return r.isRecovering, r.lastFailure, r.failures
 }
+
+func (r *recovery) stop() {
+	r.mu.Lock()
+	// Try to close the channel only if it's not already closed
+	select {
+	case <-r.stopCh:
+	default:
+		close(r.stopCh)
+	}
+	r.mu.Unlock()
+
+	// Wait for recovery loop to exit, if running
+	r.wg.Wait()
+}
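
stop() closes stopCh at most once by first probing it with a select/default (closing an already-closed channel panics), then waits on the WaitGroup; reset() recreates the channel afterwards so a later failure can launch a fresh recovery loop. A self-contained sketch of that close-once-and-wait idiom, with illustrative type and field names rather than the ones in this file:

package main

import (
	"fmt"
	"sync"
)

// stopper mirrors the idiom from the diff: close stopCh at most once under
// the lock, then wait for any loop goroutine to drain.
type stopper struct {
	mu     sync.Mutex
	stopCh chan struct{}
	wg     sync.WaitGroup
}

func (s *stopper) stop() {
	s.mu.Lock()
	select {
	case <-s.stopCh: // already closed by an earlier stop()
	default:
		close(s.stopCh)
	}
	s.mu.Unlock()

	s.wg.Wait() // safe even if no goroutine was ever started
}

func main() {
	s := &stopper{stopCh: make(chan struct{})}

	s.wg.Add(1)
	go func() {
		defer s.wg.Done()
		<-s.stopCh // block until stop is requested
		fmt.Println("loop observed stop signal")
	}()

	s.stop()
	s.stop() // idempotent: the second call takes the closed-channel case and skips close
	fmt.Println("stopped")
}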