You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
feat(controller): implement monitor recreation with drift detection
- Add drift detection to reconciliation loop to identify missing monitors
- Implement handleMonitorRecreation method for automatic recreation
- Add new condition types: DriftDetected, Recreated for status tracking
- Enhance error handling for API unavailability, rate limiting, and validation
- Add Kubernetes event emission for successful monitor recreations
- Implement concurrent operation safety with optimistic locking
- Add comprehensive property-based tests for all recreation scenarios
- Add integration tests for end-to-end recreation workflow
This feature automatically detects when monitors are deleted externally
and recreates them while preserving configuration and maintaining
proper status reporting.
Closes: #monitor-recreation-feature
Signed-off-by: Starlight Romero <[email protected]>
// Set drift detected condition with detailed message
321
+
now:=metav1.Now()
322
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeDriftDetected, corev1.ConditionTrue, fmt.Sprintf("Monitor ID %d not found in Datadog API", instance.Status.ID))
323
+
return true, nil
324
+
}
325
+
326
+
// Handle different types of API errors gracefully with detailed error reporting
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authentication error during drift detection for monitor ID %d: credentials may be invalid", instance.Status.ID))
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Timeout during drift detection for monitor ID %d: API request timed out", instance.Status.ID))
355
+
return false, fmt.Errorf("timeout during drift detection, will retry: %w", err)
356
+
}
357
+
358
+
// For other errors (API unavailable, service errors, etc.), handle gracefully
359
+
logger.V(1).Info("Error during drift detection, will retry", "Monitor ID", instance.Status.ID, "error", errorMessage)
logger.V(1).Info("Concurrent modification detected during recreation, will retry", "Monitor ID", oldMonitorID, "ResourceVersion", originalResourceVersion)
556
+
now:=metav1.Now()
557
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Concurrent modification detected during recreation of monitor ID %d: resource version conflict", oldMonitorID))
558
+
return fmt.Errorf("concurrent modification during recreation, will retry: %w", err)
559
+
}
560
+
561
+
// Categorize and handle different types of creation errors with detailed status reporting
logger.V(1).Info("Rate limit during recreation, will retry", "Old Monitor ID", oldMonitorID)
567
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Rate limit during recreation of monitor ID %d: API rate limit exceeded, will retry", oldMonitorID))
568
+
return fmt.Errorf("rate limit during recreation, will retry: %w", err)
logger.Error(err, "Authentication error during recreation", "Old Monitor ID", oldMonitorID)
573
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authentication error during recreation of monitor ID %d: credentials are invalid or expired", oldMonitorID))
574
+
return fmt.Errorf("authentication error during recreation: %w", err)
logger.Error(err, "Authorization error during recreation", "Old Monitor ID", oldMonitorID)
579
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Authorization error during recreation of monitor ID %d: insufficient permissions to create monitors", oldMonitorID))
580
+
return fmt.Errorf("authorization error during recreation: %w", err)
logger.Error(err, "Validation error during recreation", "Old Monitor ID", oldMonitorID)
585
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Validation error during recreation of monitor ID %d: monitor configuration is invalid", oldMonitorID))
586
+
return fmt.Errorf("validation error during recreation: %w", err)
logger.V(1).Info("Timeout during recreation, will retry", "Old Monitor ID", oldMonitorID)
591
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Timeout during recreation of monitor ID %d: API request timed out, will retry", oldMonitorID))
592
+
return fmt.Errorf("timeout during recreation, will retry: %w", err)
593
+
}
594
+
595
+
// Generic error handling for other API errors
596
+
logger.Error(err, "Failed to recreate monitor", "Old Monitor ID", oldMonitorID)
597
+
condition.UpdateDatadogMonitorConditions(status, now, datadoghqv1alpha1.DatadogMonitorConditionTypeError, corev1.ConditionTrue, fmt.Sprintf("Failed to recreate monitor ID %d: %s", oldMonitorID, errorMessage))
598
+
return fmt.Errorf("failed to recreate monitor: %w", err)
599
+
}
600
+
601
+
// Check for context cancellation after recreation but before finalizing status
602
+
if ctx.Err() != nil {
603
+
// Restore original ID since the operation was cancelled
604
+
status.ID=oldMonitorID
605
+
logger.V(1).Info("Context cancelled after recreation, operation may be incomplete", "Old Monitor ID", oldMonitorID)
0 commit comments