diff --git a/lib/metrics.py b/lib/metrics.py index 70654b334..b80057ce9 100644 --- a/lib/metrics.py +++ b/lib/metrics.py @@ -52,3 +52,14 @@ "inventory_new_export_seconds", "Time spent to create a new host export report" ) create_export_count = Counter("inventory_create_export", "The total amount of host exports created") + +# Stale Host Notification +stale_host_notification_count = Counter( + "inventory_stale_host_notification_count", "The total amount of stale hosts notified" +) +stale_host_notification_processing_time = Summary( + "inventory_stale_host_notification_processing_time", "Time spent notifying stale hosts from the database" +) +stale_host_notification_fail_count = Counter( + "inventory_stale_host_notification_fail_count", "The total amount of Stale Host Notification failures." +) diff --git a/stale_host_notification.py b/stale_host_notification.py index e386b9d7c..3f18fda63 100755 --- a/stale_host_notification.py +++ b/stale_host_notification.py @@ -16,11 +16,16 @@ from jobs.common import excepthook from jobs.common import find_hosts_in_state from jobs.common import main +from lib.metrics import stale_host_notification_count +from lib.metrics import stale_host_notification_fail_count +from lib.metrics import stale_host_notification_processing_time LOGGER_NAME = "stale_host_notification" PROMETHEUS_JOB = "inventory-stale-host-notification" COLLECTED_METRICS = ( - # add metric + stale_host_notification_count, + stale_host_notification_processing_time, + stale_host_notification_fail_count, ) @@ -35,20 +40,27 @@ def _create_host_operation_result(host, identity, logger): ) +@stale_host_notification_fail_count.count_exceptions() def run(config, logger, session, event_producer, notification_event_producer, shutdown_handler, application): - with application.app.app_context(): + with application.app.app_context(), stale_host_notification_processing_time.time(): filter_stale_hosts = find_hosts_in_state(logger, session, ["stale_in"], config) query = session.query(Host).filter(or_(False, *filter_stale_hosts)) stale_hosts = query.all() if len(stale_hosts) > 0: - logger.info("%s hosts found as stale") + logger.info("%s hosts found as stale", len(stale_hosts)) for host in stale_hosts: identity = create_mock_identity_with_org_id(host.org_id) result = _create_host_operation_result(host, identity, logger) - send_notification( - notification_event_producer, NotificationType.system_became_stale, vars(result.host_row) - ) + try: + send_notification( + notification_event_producer, NotificationType.system_became_stale, vars(result.host_row) + ) + + stale_host_notification_count.inc() + result.success_logger() + except Exception: + logger.error("Error when sending notification") else: logger.info("No hosts found as stale")