From d3306ee27cc97f1cb9e38887ff46097e0df81ccf Mon Sep 17 00:00:00 2001 From: tonytan4ever Date: Mon, 2 Jul 2018 14:28:16 -0400 Subject: [PATCH] Adjust neutron/nova agent related alarms and check scripts --- playbooks/files/rax-maas/plugins/README.md | 14 +++++++------- playbooks/files/rax-maas/plugins/maas_common.py | 16 ++++++++++++++++ .../rax-maas/plugins/neutron_service_check.py | 16 +++++++++++----- .../files/rax-maas/plugins/nova_service_check.py | 16 +++++++++++----- .../rax-maas/neutron_dhcp_agent_check.yaml.j2 | 4 ++-- .../rax-maas/neutron_l3_agent_check.yaml.j2 | 4 ++-- .../neutron_linuxbridge_agent_check.yaml.j2 | 4 ++-- .../neutron_metadata_agent_check.yaml.j2 | 4 ++-- .../neutron_metering_agent_check.yaml.j2 | 4 ++-- .../templates/rax-maas/nova_cert_check.yaml.j2 | 6 +++--- .../rax-maas/nova_compute_check.yaml.j2 | 6 +++--- .../rax-maas/nova_conductor_check.yaml.j2 | 6 +++--- .../rax-maas/nova_consoleauth_check.yaml.j2 | 6 +++--- .../rax-maas/nova_scheduler_check.yaml.j2 | 6 +++--- 14 files changed, 70 insertions(+), 42 deletions(-) diff --git a/playbooks/files/rax-maas/plugins/README.md b/playbooks/files/rax-maas/plugins/README.md index 358d57eac..09de9109c 100644 --- a/playbooks/files/rax-maas/plugins/README.md +++ b/playbooks/files/rax-maas/plugins/README.md @@ -221,9 +221,9 @@ polls the nova api and gets a list of all nova services running in the environme Hostname or IP address of service to test ##### Example Output: - metric nova-scheduler_on_host_aio1_nova_scheduler_container-e7b92e0f uint32 1 - metric nova-conductor_on_host_aio1_nova_conductor_container-dcddd54a uint32 1 - metric nova-compute_on_host_aio1_nova_compute_container-19824c74 uint32 1 + metric nova-scheduler_on_host_aio1_nova_scheduler_container-e7b92e0f_status string Yes + metric nova-conductor_on_host_aio1_nova_conductor_container-dcddd54a_status string No + metric nova-compute_on_host_aio1_nova_compute_container-19824c74_status string Yes ... 
*** @@ -265,10 +265,10 @@ polls the neutron api and gets a list of all neutron agents running in the envir Hostname or IP address of service to test ##### Example Output: - metric neutron-metadata-agent_status uint32 1 - metric neutron-linuxbridge-agent_status uint32 1 - metric neutron-dhcp-agent_status uint32 1 - metric neutron-linuxbridge-agent_status uint32 1 + metric neutron-metadata-agent_status string Yes + metric neutron-linuxbridge-agent_status string No + metric neutron-dhcp-agent_status string neutron-dhcp-agent cannot reach API + metric neutron-linuxbridge-agent_status string Yes ... *** diff --git a/playbooks/files/rax-maas/plugins/maas_common.py b/playbooks/files/rax-maas/plugins/maas_common.py index f88da6541..92c4d5e19 100755 --- a/playbooks/files/rax-maas/plugins/maas_common.py +++ b/playbooks/files/rax-maas/plugins/maas_common.py @@ -50,6 +50,22 @@ TOKEN_FILE = '/root/.auth_ref.json' +NEUTRON_AGENT_TYPE_LIST = [ + 'neutron-linuxbridge-agent', + 'neutron-dhcp-agent', + 'neutron-l3-agent', + 'neutron-metadata-agent', + 'neutron-metering-agent' +] +NOVA_SERVICE_TYPE_LIST = [ + 'nova-cert', + 'nova-compute', + 'nova-conductor', + 'nova-consoleauth', + 'nova-scheduler' +] + + try: from cinderclient import client as c_client from cinderclient import exceptions as c_exc diff --git a/playbooks/files/rax-maas/plugins/neutron_service_check.py b/playbooks/files/rax-maas/plugins/neutron_service_check.py index 80424b9b2..a2938ae20 100755 --- a/playbooks/files/rax-maas/plugins/neutron_service_check.py +++ b/playbooks/files/rax-maas/plugins/neutron_service_check.py @@ -15,14 +15,15 @@ # limitations under the License. 
import argparse -import sys from maas_common import get_neutron_client +from maas_common import metric from maas_common import metric_bool from maas_common import print_output from maas_common import status_err from maas_common import status_err_no_exit from maas_common import status_ok +from maas_common import NEUTRON_AGENT_TYPE_LIST def check(args): @@ -35,8 +36,13 @@ def check(args): # not gathering api status metric here so catch any exception except Exception as e: metric_bool('client_success', False, m_name='maas_neutron') + for neutron_agent_type in NEUTRON_AGENT_TYPE_LIST: + metric('%s_status' % neutron_agent_type, + 'string', + '%s cannot reach API' % neutron_agent_type, + m_name='maas_neutron') status_err_no_exit(str(e), m_name='maas_neutron') - sys.exit(0) + return else: metric_bool('client_success', True, m_name='maas_neutron') @@ -58,9 +64,9 @@ def check(args): # return all the things status_ok(m_name='maas_neutron') for agent in agents: - agent_is_up = True + agent_is_up = "Yes" if agent['admin_state_up'] and not agent['alive']: - agent_is_up = False + agent_is_up = "No" if args.host: name = '%s_status' % agent['binary'] @@ -71,7 +77,7 @@ def check(args): agent['id'], agent['host']) - metric_bool(name, agent_is_up, m_name='maas_neutron') + metric(name, 'string', agent_is_up, m_name='maas_neutron') def main(args): diff --git a/playbooks/files/rax-maas/plugins/nova_service_check.py b/playbooks/files/rax-maas/plugins/nova_service_check.py index 14635ed9a..12f419d85 100755 --- a/playbooks/files/rax-maas/plugins/nova_service_check.py +++ b/playbooks/files/rax-maas/plugins/nova_service_check.py @@ -15,16 +15,17 @@ # limitations under the License. 
import argparse -import sys from maas_common import get_auth_ref from maas_common import get_keystone_client from maas_common import get_nova_client +from maas_common import metric from maas_common import metric_bool from maas_common import print_output from maas_common import status_err from maas_common import status_err_no_exit from maas_common import status_ok +from maas_common import NOVA_SERVICE_TYPE_LIST def check(auth_ref, args): @@ -44,8 +45,13 @@ def check(auth_ref, args): # not gathering api status metric here so catch any exception except Exception as e: metric_bool('client_success', False, m_name='maas_nova') + for nova_service_type in NOVA_SERVICE_TYPE_LIST: + metric('%s_status' % nova_service_type, + 'string', + '%s cannot reach API' % nova_service_type, + m_name='maas_nova') status_err_no_exit(str(e), m_name='maas_nova') - sys.exit(0) + return else: metric_bool('client_success', True, m_name='maas_nova') @@ -61,17 +67,17 @@ def check(auth_ref, args): # return all the things status_ok(m_name='maas_nova') for service in services: - service_is_up = True + service_is_up = "Yes" if service.status == 'enabled' and service.state == 'down': - service_is_up = False + service_is_up = "No" if args.host: name = '%s_status' % service.binary else: name = '%s_on_host_%s_status' % (service.binary, service.host) - metric_bool(name, service_is_up, m_name='maas_nova') + metric(name, 'string', service_is_up, m_name='maas_nova') def main(args): diff --git a/playbooks/templates/rax-maas/neutron_dhcp_agent_check.yaml.j2 b/playbooks/templates/rax-maas/neutron_dhcp_agent_check.yaml.j2 index 764eaecdc..dec6369ef 100644 --- a/playbooks/templates/rax-maas/neutron_dhcp_agent_check.yaml.j2 +++ b/playbooks/templates/rax-maas/neutron_dhcp_agent_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('neutron_dhcp_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ 
maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { + if (metric["neutron-dhcp-agent_status"] regex ".*cannot reach API.*") { return new AlarmStatus(WARNING, "neutron dhcp agent can't reach API"); } - if (metric["neutron-dhcp-agent_status"] != 1) { + if (metric["neutron-dhcp-agent_status"] == 'No') { return new AlarmStatus(CRITICAL, "neutron-dhcp-agent down"); } diff --git a/playbooks/templates/rax-maas/neutron_l3_agent_check.yaml.j2 b/playbooks/templates/rax-maas/neutron_l3_agent_check.yaml.j2 index 58053d662..3ff38bff5 100644 --- a/playbooks/templates/rax-maas/neutron_l3_agent_check.yaml.j2 +++ b/playbooks/templates/rax-maas/neutron_l3_agent_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('neutron_l3_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { + if (metric["neutron-l3-agent_status"] regex ".*cannot reach API.*") { return new AlarmStatus(WARNING, "neutron l3 agent can't reach API"); } - if (metric["neutron-l3-agent_status"] != 1) { + if (metric["neutron-l3-agent_status"] == 'No') { return new AlarmStatus(CRITICAL, "neutron-l3-agent down"); } diff --git a/playbooks/templates/rax-maas/neutron_linuxbridge_agent_check.yaml.j2 b/playbooks/templates/rax-maas/neutron_linuxbridge_agent_check.yaml.j2 index 46d5fd3bd..145f4ec82 100644 --- a/playbooks/templates/rax-maas/neutron_linuxbridge_agent_check.yaml.j2 +++ b/playbooks/templates/rax-maas/neutron_linuxbridge_agent_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('neutron_linuxbridge_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { + if (metric["neutron-linuxbridge-agent_status"] regex ".*cannot reach API.*") { return new AlarmStatus(WARNING, 
"neutron linux-agent can't reach API"); } - if (metric["neutron-linuxbridge-agent_status"] != 1) { + if (metric["neutron-linuxbridge-agent_status"] == "No") { return new AlarmStatus(CRITICAL, "neutron-linuxbridge-agent down"); } diff --git a/playbooks/templates/rax-maas/neutron_metadata_agent_check.yaml.j2 b/playbooks/templates/rax-maas/neutron_metadata_agent_check.yaml.j2 index 36afe0318..b2927c4a3 100644 --- a/playbooks/templates/rax-maas/neutron_metadata_agent_check.yaml.j2 +++ b/playbooks/templates/rax-maas/neutron_metadata_agent_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('neutron_metadata_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { + if (metric["neutron-metadata-agent_status"] regex ".*cannot reach API.*") { return new AlarmStatus(WARNING, "neutron metadata agent can't reach API"); } - if (metric["neutron-metadata-agent_status"] != 1) { + if (metric["neutron-metadata-agent_status"] == 'No') { return new AlarmStatus(CRITICAL, "neutron-metadata-agent down"); } diff --git a/playbooks/templates/rax-maas/neutron_metering_agent_check.yaml.j2 b/playbooks/templates/rax-maas/neutron_metering_agent_check.yaml.j2 index 329759292..3bea42760 100644 --- a/playbooks/templates/rax-maas/neutron_metering_agent_check.yaml.j2 +++ b/playbooks/templates/rax-maas/neutron_metering_agent_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('neutron_metering_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { + if (metric["neutron-metering-agent_status"] regex ".*cannot reach API.*") { return new AlarmStatus(WARNING, "neutron metering agent can't reach API"); } - if (metric["neutron-metering-agent_status"] != 1) { + if 
(metric["neutron-metering-agent_status"] == 'No') { return new AlarmStatus(CRITICAL, "neutron-metering-agent down"); } diff --git a/playbooks/templates/rax-maas/nova_cert_check.yaml.j2 b/playbooks/templates/rax-maas/nova_cert_check.yaml.j2 index 7c5946aae..72f060ad9 100644 --- a/playbooks/templates/rax-maas/nova_cert_check.yaml.j2 +++ b/playbooks/templates/rax-maas/nova_cert_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('nova_cert_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { - return new AlarmStatus(WARNING, "Nova cert service can't reach API"); + if (metric["nova-cert_status"] regex ".*cannot reach API.*") { + return new AlarmStatus(WARNING, "nova cert service can't reach API"); } - if (metric["nova-cert_status"] != 1) { + if (metric["nova-cert_status"] == "No") { return new AlarmStatus(CRITICAL, "nova-cert down"); } diff --git a/playbooks/templates/rax-maas/nova_compute_check.yaml.j2 b/playbooks/templates/rax-maas/nova_compute_check.yaml.j2 index 5a2608509..04a25fbe8 100644 --- a/playbooks/templates/rax-maas/nova_compute_check.yaml.j2 +++ b/playbooks/templates/rax-maas/nova_compute_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('nova_compute_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { - return new AlarmStatus(WARNING, "Nova compute service can't reach API"); + if (metric["nova-compute_status"] regex ".*cannot reach API.*") { + return new AlarmStatus(WARNING, "nova compute service can't reach API"); } - if (metric["nova-compute_status"] != 1) { + if (metric["nova-compute_status"] == 'No') { return new AlarmStatus(CRITICAL, "nova-compute down"); } diff --git 
a/playbooks/templates/rax-maas/nova_conductor_check.yaml.j2 b/playbooks/templates/rax-maas/nova_conductor_check.yaml.j2 index ae02c7b83..f577b145c 100644 --- a/playbooks/templates/rax-maas/nova_conductor_check.yaml.j2 +++ b/playbooks/templates/rax-maas/nova_conductor_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('nova_conductor_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { - return new AlarmStatus(WARNING, "Nova conductor can't reach API"); + if (metric["nova-conductor_status"] regex ".*cannot reach API.*") { + return new AlarmStatus(WARNING, "nova conductor can't reach API"); } - if (metric["nova-conductor_status"] != 1) { + if (metric["nova-conductor_status"] == 'No') { return new AlarmStatus(CRITICAL, "nova-conductor down"); } diff --git a/playbooks/templates/rax-maas/nova_consoleauth_check.yaml.j2 b/playbooks/templates/rax-maas/nova_consoleauth_check.yaml.j2 index 3a1cdaf4b..d28e0e17a 100644 --- a/playbooks/templates/rax-maas/nova_consoleauth_check.yaml.j2 +++ b/playbooks/templates/rax-maas/nova_consoleauth_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('nova_consoleauth_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { - return new AlarmStatus(WARNING, "Nova consoleauth service can't reach API"); + if (metric["nova-consoleauth_status"] regex ".*cannot reach API.*") { + return new AlarmStatus(WARNING, "nova consoleauth service can't reach API"); } - if (metric["nova-consoleauth_status"] != 1) { + if (metric["nova-consoleauth_status"] == "No") { return new AlarmStatus(CRITICAL, "nova-consoleauth down"); } diff --git a/playbooks/templates/rax-maas/nova_scheduler_check.yaml.j2 
b/playbooks/templates/rax-maas/nova_scheduler_check.yaml.j2 index a243ccc09..1f367be45 100644 --- a/playbooks/templates/rax-maas/nova_scheduler_check.yaml.j2 +++ b/playbooks/templates/rax-maas/nova_scheduler_check.yaml.j2 @@ -19,9 +19,9 @@ alarms : disabled : {{ (('nova_scheduler_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }} criteria : | :set consecutiveCount={{ maas_alarm_local_consecutive_count }} - if (metric["client_success"] != 1) { - return new AlarmStatus(WARNING, "Nova scheduler service can't reach API"); + if (metric["nova-scheduler_status"] regex ".*cannot reach API.*") { + return new AlarmStatus(WARNING, "nova scheduler service can't reach API"); } - if (metric["nova-scheduler_status"] != 1) { + if (metric["nova-scheduler_status"] == 'No') { return new AlarmStatus(CRITICAL, "nova-scheduler down"); }