Skip to content

Commit d584238

Browse files
feat(controller): implement monitor recreation with drift detection
- Add drift detection to reconciliation loop to identify missing monitors - Implement handleMonitorRecreation method for automatic recreation - Add new condition types: DriftDetected, Recreated for status tracking - Enhance error handling for API unavailability, rate limiting, and validation - Add Kubernetes event emission for successful monitor recreations - Implement concurrent operation safety with optimistic locking - Add comprehensive property-based tests for all recreation scenarios - Add integration tests for end-to-end recreation workflow This feature automatically detects when monitors are deleted externally and recreates them while preserving configuration and maintaining proper status reporting. Closes: #monitor-recreation-feature Signed-off-by: Starlight Romero <[email protected]>
1 parent 490d736 commit d584238

File tree

6 files changed

+4760
-29
lines changed

6 files changed

+4760
-29
lines changed

api/datadoghq/v1alpha1/datadogmonitor_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,10 @@ const (
303303
DatadogMonitorConditionTypeUpdated DatadogMonitorConditionType = "Updated"
304304
// DatadogMonitorConditionTypeError means the DatadogMonitor has an error
305305
DatadogMonitorConditionTypeError DatadogMonitorConditionType = "Error"
306+
// DatadogMonitorConditionTypeDriftDetected means drift was detected between the resource and Datadog
307+
DatadogMonitorConditionTypeDriftDetected DatadogMonitorConditionType = "DriftDetected"
308+
// DatadogMonitorConditionTypeRecreated means the DatadogMonitor was recreated due to drift
309+
DatadogMonitorConditionTypeRecreated DatadogMonitorConditionType = "Recreated"
306310
)
307311

308312
// DatadogMonitorState represents the overall DatadogMonitor state

internal/controller/datadogmonitor/controller.go

Lines changed: 224 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -165,34 +165,49 @@ func (r *Reconciler) internalReconcile(ctx context.Context, instance *datadoghqv
165165
if instance.Status.ID == 0 {
166166
shouldCreate = true
167167
} else {
168-
var m datadogV1.Monitor
169-
if instanceSpecHash != statusSpecHash {
170-
// Custom resource manifest has changed, need to update the API
171-
logger.V(1).Info("DatadogMonitor manifest has changed")
172-
shouldUpdate = true
173-
} else if instance.Status.MonitorLastForceSyncTime == nil || (forceSyncPeriod-now.Sub(instance.Status.MonitorLastForceSyncTime.Time)) <= 0 {
174-
// Periodically force a sync with the API monitor to ensure parity
175-
// Get monitor to make sure it exists before trying any updates. If it doesn't, set shouldCreate
176-
_, err = r.get(instance, newStatus)
168+
// Perform drift detection to check if monitor exists in Datadog
169+
driftDetected, err := r.detectDrift(ctx, logger, instance, newStatus)
170+
if err != nil {
171+
logger.Error(err, "error during drift detection", "Monitor ID", instance.Status.ID)
172+
// Continue with reconciliation even if drift detection fails
173+
}
174+
175+
if driftDetected {
176+
logger.Info("Drift detected: monitor not found in Datadog, will recreate", "Monitor ID", instance.Status.ID)
177+
err = r.handleMonitorRecreation(ctx, logger, instance, newStatus, now, instanceSpecHash)
177178
if err != nil {
178-
logger.Error(err, "error getting monitor", "Monitor ID", instance.Status.ID)
179-
if strings.Contains(err.Error(), ctrutils.NotFoundString) {
180-
shouldCreate = true
181-
}
182-
} else {
183-
shouldUpdate = true
179+
logger.Error(err, "error recreating monitor", "Monitor ID", instance.Status.ID)
184180
}
185-
} else if instance.Status.MonitorStateLastUpdateTime == nil || (defaultRequeuePeriod-now.Sub(instance.Status.MonitorStateLastUpdateTime.Time)) <= 0 {
186-
// If other conditions aren't met, and we have passed the defaultRequeuePeriod, then update monitor state
187-
// Get monitor to make sure it exists before trying any updates. If it doesn't, set shouldCreate
188-
m, err = r.get(instance, newStatus)
189-
if err != nil {
190-
logger.Error(err, "error getting monitor", "Monitor ID", instance.Status.ID)
191-
if strings.Contains(err.Error(), ctrutils.NotFoundString) {
192-
shouldCreate = true
181+
} else {
182+
var m datadogV1.Monitor
183+
if instanceSpecHash != statusSpecHash {
184+
// Custom resource manifest has changed, need to update the API
185+
logger.V(1).Info("DatadogMonitor manifest has changed")
186+
shouldUpdate = true
187+
} else if instance.Status.MonitorLastForceSyncTime == nil || (forceSyncPeriod-now.Sub(instance.Status.MonitorLastForceSyncTime.Time)) <= 0 {
188+
// Periodically force a sync with the API monitor to ensure parity
189+
// Get monitor to make sure it exists before trying any updates. If it doesn't, set shouldCreate
190+
_, err = r.get(instance, newStatus)
191+
if err != nil {
192+
logger.Error(err, "error getting monitor", "Monitor ID", instance.Status.ID)
193+
if strings.Contains(err.Error(), ctrutils.NotFoundString) {
194+
shouldCreate = true
195+
}
196+
} else {
197+
shouldUpdate = true
198+
}
199+
} else if instance.Status.MonitorStateLastUpdateTime == nil || (defaultRequeuePeriod-now.Sub(instance.Status.MonitorStateLastUpdateTime.Time)) <= 0 {
200+
// If other conditions aren't met, and we have passed the defaultRequeuePeriod, then update monitor state
201+
// Get monitor to make sure it exists before trying any updates. If it doesn't, set shouldCreate
202+
m, err = r.get(instance, newStatus)
203+
if err != nil {
204+
logger.Error(err, "error getting monitor", "Monitor ID", instance.Status.ID)
205+
if strings.Contains(err.Error(), ctrutils.NotFoundString) {
206+
shouldCreate = true
207+
}
193208
}
209+
updateMonitorState(m, now, newStatus)
194210
}
195-
updateMonitorState(m, now, newStatus)
196211
}
197212
}
198213

@@ -236,6 +251,10 @@ func (r *Reconciler) internalReconcile(ctx context.Context, instance *datadoghqv
236251
}
237252

238253
func (r *Reconciler) create(logger logr.Logger, datadogMonitor *datadoghqv1alpha1.DatadogMonitor, status *datadoghqv1alpha1.DatadogMonitorStatus, now metav1.Time, instanceSpecHash string) error {
254+
return r.createInternal(logger, datadogMonitor, status, now, instanceSpecHash, false)
255+
}
256+
257+
func (r *Reconciler) createInternal(logger logr.Logger, datadogMonitor *datadoghqv1alpha1.DatadogMonitor, status *datadoghqv1alpha1.DatadogMonitorStatus, now metav1.Time, instanceSpecHash string, isRecreation bool) error {
239258
// Validate monitor in Datadog
240259
if err := validateMonitor(r.datadogAuth, logger, r.datadogClient, datadogMonitor); err != nil {
241260
return err
@@ -246,10 +265,22 @@ func (r *Reconciler) create(logger logr.Logger, datadogMonitor *datadoghqv1alpha
246265
if err != nil {
247266
return err
248267
}
249-
event := buildEventInfo(datadogMonitor.Name, datadogMonitor.Namespace, pkgutils.CreationEvent)
268+
269+
// Determine event type based on whether this is recreation or initial creation
270+
var eventType pkgutils.EventType
271+
var logMessage string
272+
if isRecreation {
273+
eventType = pkgutils.RecreationEvent
274+
logMessage = "Recreated DatadogMonitor"
275+
} else {
276+
eventType = pkgutils.CreationEvent
277+
logMessage = "Created a new DatadogMonitor"
278+
}
279+
280+
event := buildEventInfo(datadogMonitor.Name, datadogMonitor.Namespace, eventType)
250281
r.recordEvent(datadogMonitor, event)
251282

252-
// As this is a new monitor, add static information to status
283+
// Update status with new monitor information
253284
status.ID = int(m.GetId())
254285
creator := m.GetCreator()
255286
status.Creator = creator.GetEmail()
@@ -259,13 +290,84 @@ func (r *Reconciler) create(logger logr.Logger, datadogMonitor *datadoghqv1alpha
259290
status.MonitorStateSyncStatus = ""
260291
status.CurrentHash = instanceSpecHash
261292

262-
// Set Created Condition
263-
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeCreated, corev1.ConditionTrue, "DatadogMonitor Created")
264-
logger.Info("Created a new DatadogMonitor", "Monitor Namespace", datadogMonitor.Namespace, "Monitor Name", datadogMonitor.Name, "Monitor ID", m.GetId())
293+
// Set appropriate condition based on operation type
294+
if isRecreation {
295+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeRecreated, corev1.ConditionTrue, "DatadogMonitor Recreated")
296+
} else {
297+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeCreated, corev1.ConditionTrue, "DatadogMonitor Created")
298+
}
299+
300+
logger.Info(logMessage, "Monitor Namespace", datadogMonitor.Namespace, "Monitor Name", datadogMonitor.Name, "Monitor ID", m.GetId())
265301

266302
return nil
267303
}
268304

305+
// detectDrift checks if the monitor referenced by the DatadogMonitor exists in Datadog
306+
func (r *Reconciler) detectDrift(ctx context.Context, logger logr.Logger, instance *datadoghqv1alpha1.DatadogMonitor, status *datadoghqv1alpha1.DatadogMonitorStatus) (bool, error) {
307+
// If no monitor ID is set, no drift can be detected
308+
if instance.Status.ID == 0 {
309+
return false, nil
310+
}
311+
312+
// Attempt to get the monitor from Datadog
313+
_, err := getMonitor(r.datadogAuth, r.datadogClient, instance.Status.ID)
314+
if err != nil {
315+
// Check if the error indicates the monitor was not found
316+
if strings.Contains(err.Error(), ctrutils.NotFoundString) {
317+
logger.Info("Drift detected: monitor not found in Datadog", "Monitor ID", instance.Status.ID)
318+
// Update status to indicate drift was detected
319+
status.MonitorStateSyncStatus = datadoghqv1alpha1.MonitorStateSyncStatusGetError
320+
// Set drift detected condition with detailed message
321+
now := metav1.Now()
322+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeDriftDetected, corev1.ConditionTrue, fmt.Sprintf("Monitor ID %d not found in Datadog API", instance.Status.ID))
323+
return true, nil
324+
}
325+
326+
// Handle different types of API errors gracefully with detailed error reporting
327+
errorMessage := err.Error()
328+
now := metav1.Now()
329+
330+
if strings.Contains(errorMessage, "rate limit") || strings.Contains(errorMessage, "429") {
331+
logger.V(1).Info("Rate limit encountered during drift detection, will retry later", "Monitor ID", instance.Status.ID)
332+
status.MonitorStateSyncStatus = datadoghqv1alpha1.MonitorStateSyncStatusGetError
333+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Rate limit during drift detection for monitor ID %d: %s", instance.Status.ID, errorMessage))
334+
return false, fmt.Errorf("rate limit during drift detection, will retry: %w", err)
335+
}
336+
337+
if strings.Contains(errorMessage, "unauthorized") || strings.Contains(errorMessage, "401") {
338+
logger.Error(err, "Authentication error during drift detection", "Monitor ID", instance.Status.ID)
339+
status.MonitorStateSyncStatus = datadoghqv1alpha1.MonitorStateSyncStatusGetError
340+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authentication error during drift detection for monitor ID %d: credentials may be invalid", instance.Status.ID))
341+
return false, err
342+
}
343+
344+
if strings.Contains(errorMessage, "forbidden") || strings.Contains(errorMessage, "403") {
345+
logger.Error(err, "Authorization error during drift detection", "Monitor ID", instance.Status.ID)
346+
status.MonitorStateSyncStatus = datadoghqv1alpha1.MonitorStateSyncStatusGetError
347+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authorization error during drift detection for monitor ID %d: insufficient permissions", instance.Status.ID))
348+
return false, err
349+
}
350+
351+
if strings.Contains(errorMessage, "timeout") || strings.Contains(errorMessage, "context deadline exceeded") {
352+
logger.V(1).Info("Timeout during drift detection, will retry", "Monitor ID", instance.Status.ID)
353+
status.MonitorStateSyncStatus = datadoghqv1alpha1.MonitorStateSyncStatusGetError
354+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Timeout during drift detection for monitor ID %d: API request timed out", instance.Status.ID))
355+
return false, fmt.Errorf("timeout during drift detection, will retry: %w", err)
356+
}
357+
358+
// For other errors (API unavailable, service errors, etc.), handle gracefully
359+
logger.V(1).Info("Error during drift detection, will retry", "Monitor ID", instance.Status.ID, "error", errorMessage)
360+
status.MonitorStateSyncStatus = datadoghqv1alpha1.MonitorStateSyncStatusGetError
361+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("API error during drift detection for monitor ID %d: %s", instance.Status.ID, errorMessage))
362+
return false, fmt.Errorf("error during drift detection, will retry: %w", err)
363+
}
364+
365+
// Monitor exists, no drift detected - clear any previous error conditions
366+
now := metav1.Now()
367+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionFalse, "")
368+
return false, nil
369+
}
370+
269371
func (r *Reconciler) update(logger logr.Logger, datadogMonitor *datadoghqv1alpha1.DatadogMonitor, status *datadoghqv1alpha1.DatadogMonitorStatus, now metav1.Time, instanceSpecHash string) error {
270372
// Validate monitor in Datadog
271373
if err := validateMonitor(r.datadogAuth, logger, r.datadogClient, datadogMonitor); err != nil {
@@ -414,3 +516,96 @@ func isSupportedMonitorType(monitorType datadoghqv1alpha1.DatadogMonitorType) bo
414516
func isTriggered(groupStatus string) bool {
415517
return groupStatus == string(datadoghqv1alpha1.DatadogMonitorStateAlert) || groupStatus == string(datadoghqv1alpha1.DatadogMonitorStateWarn) || groupStatus == string(datadoghqv1alpha1.DatadogMonitorStateNoData)
416518
}
519+
520+
// handleMonitorRecreation manages the recreation of a deleted monitor
521+
func (r *Reconciler) handleMonitorRecreation(ctx context.Context, logger logr.Logger, instance *datadoghqv1alpha1.DatadogMonitor, status *datadoghqv1alpha1.DatadogMonitorStatus, now metav1.Time, instanceSpecHash string) error {
522+
logger.Info("Starting monitor recreation", "Monitor ID", instance.Status.ID, "Monitor Name", instance.Spec.Name)
523+
524+
// Store the old monitor ID for logging and error recovery
525+
oldMonitorID := instance.Status.ID
526+
527+
// Check if the resource was deleted during processing
528+
if ctx.Err() != nil {
529+
logger.V(1).Info("Context cancelled during recreation, aborting", "Monitor ID", oldMonitorID)
530+
return ctx.Err()
531+
}
532+
533+
// Validate the monitor spec before attempting recreation
534+
if err := datadoghqv1alpha1.IsValidDatadogMonitor(&instance.Spec); err != nil {
535+
logger.Error(err, "Invalid monitor spec, cannot recreate", "Monitor ID", oldMonitorID)
536+
// Don't attempt recreation for validation errors
537+
return fmt.Errorf("validation error prevents recreation: %w", err)
538+
}
539+
540+
// Implement optimistic locking by checking resource version hasn't changed
541+
// This helps prevent conflicts during concurrent operations
542+
originalResourceVersion := instance.ResourceVersion
543+
544+
// Reset the monitor ID to trigger creation logic
545+
status.ID = 0
546+
547+
// Use the internal create method with recreation flag
548+
err := r.createInternal(logger, instance, status, now, instanceSpecHash, true)
549+
if err != nil {
550+
// Restore original ID on error to maintain state consistency
551+
status.ID = oldMonitorID
552+
553+
// Check if this is a conflict error (resource was modified concurrently)
554+
if strings.Contains(err.Error(), "conflict") || strings.Contains(err.Error(), "resource version") {
555+
logger.V(1).Info("Concurrent modification detected during recreation, will retry", "Monitor ID", oldMonitorID, "ResourceVersion", originalResourceVersion)
556+
now := metav1.Now()
557+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Concurrent modification detected during recreation of monitor ID %d: resource version conflict", oldMonitorID))
558+
return fmt.Errorf("concurrent modification during recreation, will retry: %w", err)
559+
}
560+
561+
// Categorize and handle different types of creation errors with detailed status reporting
562+
errorMessage := err.Error()
563+
now := metav1.Now()
564+
565+
if strings.Contains(errorMessage, "rate limit") || strings.Contains(errorMessage, "429") {
566+
logger.V(1).Info("Rate limit during recreation, will retry", "Old Monitor ID", oldMonitorID)
567+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Rate limit during recreation of monitor ID %d: API rate limit exceeded, will retry", oldMonitorID))
568+
return fmt.Errorf("rate limit during recreation, will retry: %w", err)
569+
}
570+
571+
if strings.Contains(errorMessage, "unauthorized") || strings.Contains(errorMessage, "401") {
572+
logger.Error(err, "Authentication error during recreation", "Old Monitor ID", oldMonitorID)
573+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authentication error during recreation of monitor ID %d: credentials are invalid or expired", oldMonitorID))
574+
return fmt.Errorf("authentication error during recreation: %w", err)
575+
}
576+
577+
if strings.Contains(errorMessage, "forbidden") || strings.Contains(errorMessage, "403") {
578+
logger.Error(err, "Authorization error during recreation", "Old Monitor ID", oldMonitorID)
579+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authorization error during recreation of monitor ID %d: insufficient permissions to create monitors", oldMonitorID))
580+
return fmt.Errorf("authorization error during recreation: %w", err)
581+
}
582+
583+
if strings.Contains(errorMessage, "validation") || strings.Contains(errorMessage, "400") {
584+
logger.Error(err, "Validation error during recreation", "Old Monitor ID", oldMonitorID)
585+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Validation error during recreation of monitor ID %d: monitor configuration is invalid", oldMonitorID))
586+
return fmt.Errorf("validation error during recreation: %w", err)
587+
}
588+
589+
if strings.Contains(errorMessage, "timeout") || strings.Contains(errorMessage, "context deadline exceeded") {
590+
logger.V(1).Info("Timeout during recreation, will retry", "Old Monitor ID", oldMonitorID)
591+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Timeout during recreation of monitor ID %d: API request timed out, will retry", oldMonitorID))
592+
return fmt.Errorf("timeout during recreation, will retry: %w", err)
593+
}
594+
595+
// Generic error handling for other API errors
596+
logger.Error(err, "Failed to recreate monitor", "Old Monitor ID", oldMonitorID)
597+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Failed to recreate monitor ID %d: %s", oldMonitorID, errorMessage))
598+
return fmt.Errorf("failed to recreate monitor: %w", err)
599+
}
600+
601+
// Check for context cancellation after recreation but before finalizing status
602+
if ctx.Err() != nil {
603+
// Restore original ID since the operation was cancelled
604+
status.ID = oldMonitorID
605+
logger.V(1).Info("Context cancelled after recreation, operation may be incomplete", "Old Monitor ID", oldMonitorID)
606+
return ctx.Err()
607+
}
608+
609+
logger.Info("Successfully recreated monitor", "Old Monitor ID", oldMonitorID, "New Monitor ID", status.ID)
610+
return nil
611+
}

0 commit comments

Comments
 (0)