Skip to content

Commit

Permalink
Merge pull request #532 from tonytan4ever/TURTLES-894
Browse files Browse the repository at this point in the history
Adjust neutron/nova agent related alarms and check scripts
  • Loading branch information
tonytan4ever authored Jul 4, 2018
2 parents 3eac154 + d3306ee commit ab81b07
Show file tree
Hide file tree
Showing 14 changed files with 70 additions and 42 deletions.
14 changes: 7 additions & 7 deletions playbooks/files/rax-maas/plugins/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,9 @@ polls the nova api and gets a list of all nova services running in the environme
Hostname or IP address of service to test
##### Example Output:

metric nova-scheduler_on_host_aio1_nova_scheduler_container-e7b92e0f uint32 1
metric nova-conductor_on_host_aio1_nova_conductor_container-dcddd54a uint32 1
metric nova-compute_on_host_aio1_nova_compute_container-19824c74 uint32 1
metric nova-scheduler_on_host_aio1_nova_scheduler_container-e7b92e0f string Yes
metric nova-conductor_on_host_aio1_nova_conductor_container-dcddd54a string No
metric nova-compute_on_host_aio1_nova_compute_container-19824c74 string Yes
...

***
Expand Down Expand Up @@ -265,10 +265,10 @@ polls the neutron api and gets a list of all neutron agents running in the envir
Hostname or IP address of service to test
##### Example Output:

metric neutron-metadata-agent_status uint32 1
metric neutron-linuxbridge-agent_status uint32 1
metric neutron-dhcp-agent_status uint32 1
metric neutron-linuxbridge-agent_status uint32 1
metric neutron-metadata-agent_status string Yes
metric neutron-linuxbridge-agent_status string No
metric neutron-dhcp-agent_status string neutron-dhcp-agent cannot reach API
metric neutron-linuxbridge-agent_status string Yes
...

***
Expand Down
16 changes: 16 additions & 0 deletions playbooks/files/rax-maas/plugins/maas_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@
TOKEN_FILE = '/root/.auth_ref.json'


# Neutron agent binaries known to the MaaS plugins.  neutron_service_check.py
# iterates over this list in its error path to emit a '<binary>_status'
# string metric reading '<binary> cannot reach API' for every agent type.
NEUTRON_AGENT_TYPE_LIST = [
    "neutron-linuxbridge-agent",
    "neutron-dhcp-agent",
    "neutron-l3-agent",
    "neutron-metadata-agent",
    "neutron-metering-agent",
]
# Nova service binaries known to the MaaS plugins.  nova_service_check.py
# iterates over this list in its error path to emit a '<binary>_status'
# string metric reading '<binary> cannot reach API' for every service type.
NOVA_SERVICE_TYPE_LIST = [
    "nova-cert",
    "nova-compute",
    "nova-conductor",
    "nova-consoleauth",
    "nova-scheduler",
]


try:
from cinderclient import client as c_client
from cinderclient import exceptions as c_exc
Expand Down
16 changes: 11 additions & 5 deletions playbooks/files/rax-maas/plugins/neutron_service_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@
# limitations under the License.

import argparse
import sys

from maas_common import get_neutron_client
from maas_common import metric
from maas_common import metric_bool
from maas_common import print_output
from maas_common import status_err
from maas_common import status_err_no_exit
from maas_common import status_ok
from maas_common import NEUTRON_AGENT_TYPE_LIST


def check(args):
Expand All @@ -35,8 +36,13 @@ def check(args):
# not gathering api status metric here so catch any exception
except Exception as e:
metric_bool('client_success', False, m_name='maas_neutron')
for neutron_agent_type in NEUTRON_AGENT_TYPE_LIST:
metric('%s_status' % neutron_agent_type,
'string',
'%s cannot reach API' % neutron_agent_type,
m_name='maas_neutron')
status_err_no_exit(str(e), m_name='maas_neutron')
sys.exit(0)
return
else:
metric_bool('client_success', True, m_name='maas_neutron')

Expand All @@ -58,9 +64,9 @@ def check(args):
# return all the things
status_ok(m_name='maas_neutron')
for agent in agents:
agent_is_up = True
agent_is_up = "Yes"
if agent['admin_state_up'] and not agent['alive']:
agent_is_up = False
agent_is_up = "No"

if args.host:
name = '%s_status' % agent['binary']
Expand All @@ -71,7 +77,7 @@ def check(args):
agent['id'],
agent['host'])

metric_bool(name, agent_is_up, m_name='maas_neutron')
metric(name, 'string', agent_is_up, m_name='maas_neutron')


def main(args):
Expand Down
16 changes: 11 additions & 5 deletions playbooks/files/rax-maas/plugins/nova_service_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,17 @@
# limitations under the License.

import argparse
import sys

from maas_common import get_auth_ref
from maas_common import get_keystone_client
from maas_common import get_nova_client
from maas_common import metric
from maas_common import metric_bool
from maas_common import print_output
from maas_common import status_err
from maas_common import status_err_no_exit
from maas_common import status_ok
from maas_common import NOVA_SERVICE_TYPE_LIST


def check(auth_ref, args):
Expand All @@ -44,8 +45,13 @@ def check(auth_ref, args):
# not gathering api status metric here so catch any exception
except Exception as e:
metric_bool('client_success', False, m_name='maas_nova')
for nova_service_type in NOVA_SERVICE_TYPE_LIST:
metric('%s_status' % nova_service_type,
'string',
'%s cannot reach API' % nova_service_type,
m_name='maas_nova')
status_err_no_exit(str(e), m_name='maas_nova')
sys.exit(0)
return
else:
metric_bool('client_success', True, m_name='maas_nova')

Expand All @@ -61,17 +67,17 @@ def check(auth_ref, args):
# return all the things
status_ok(m_name='maas_nova')
for service in services:
service_is_up = True
service_is_up = "Yes"

if service.status == 'enabled' and service.state == 'down':
service_is_up = False
service_is_up = "No"

if args.host:
name = '%s_status' % service.binary
else:
name = '%s_on_host_%s_status' % (service.binary, service.host)

metric_bool(name, service_is_up, m_name='maas_nova')
metric(name, 'string', service_is_up, m_name='maas_nova')


def main(args):
Expand Down
4 changes: 2 additions & 2 deletions playbooks/templates/rax-maas/neutron_dhcp_agent_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('neutron_dhcp_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
if (metric["neutron-dhcp-agent_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "neutron dhcp agent can't reach API");
}
if (metric["neutron-dhcp-agent_status"] != 1) {
if (metric["neutron-dhcp-agent_status"] == 'No') {
return new AlarmStatus(CRITICAL, "neutron-dhcp-agent down");
}
4 changes: 2 additions & 2 deletions playbooks/templates/rax-maas/neutron_l3_agent_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('neutron_l3_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
if (metric["neutron-l3-agent_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "neutron l3 agent can't reach API");
}
if (metric["neutron-l3-agent_status"] != 1) {
if (metric["neutron-l3-agent_status"] == 'No') {
return new AlarmStatus(CRITICAL, "neutron-l3-agent down");
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('neutron_linuxbridge_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
if (metric["neutron-linuxbridge-agent_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "neutron linux-agent can't reach API");
}
if (metric["neutron-linuxbridge-agent_status"] != 1) {
if (metric["neutron-linuxbridge-agent_status"] == "No") {
return new AlarmStatus(CRITICAL, "neutron-linuxbridge-agent down");
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('neutron_metadata_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
if (metric["neutron-metadata-agent_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "neutron metadata agent can't reach API");
}
if (metric["neutron-metadata-agent_status"] != 1) {
if (metric["neutron-metadata-agent_status"] == 'No') {
return new AlarmStatus(CRITICAL, "neutron-metadata-agent down");
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('neutron_metering_agent_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
if (metric["neutron-metering-agent_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "neutron metering agent can't reach API");
}
if (metric["neutron-metering-agent_status"] != 1) {
if (metric["neutron-metering-agent_status"] == 'No') {
return new AlarmStatus(CRITICAL, "neutron-metering-agent down");
}
6 changes: 3 additions & 3 deletions playbooks/templates/rax-maas/nova_cert_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('nova_cert_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
return new AlarmStatus(WARNING, "Nova cert service can't reach API");
if (metric["nova-cert_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "nova cert service can't reach API");
}
if (metric["nova-cert_status"] != 1) {
if (metric["nova-cert_status"] == "No") {
return new AlarmStatus(CRITICAL, "nova-cert down");
}
6 changes: 3 additions & 3 deletions playbooks/templates/rax-maas/nova_compute_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('nova_compute_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
return new AlarmStatus(WARNING, "Nova compute service can't reach API");
if (metric["nova-compute_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "nova compute service can't reach API");
}
if (metric["nova-compute_status"] != 1) {
if (metric["nova-compute_status"] == 'No') {
return new AlarmStatus(CRITICAL, "nova-compute down");
}
6 changes: 3 additions & 3 deletions playbooks/templates/rax-maas/nova_conductor_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('nova_conductor_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
return new AlarmStatus(WARNING, "Nova conductor can't reach API");
if (metric["nova-conductor_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "nova conductor can't reach API");
}
if (metric["nova-conductor_status"] != 1) {
if (metric["nova-conductor_status"] == 'No') {
return new AlarmStatus(CRITICAL, "nova-conductor down");
}
6 changes: 3 additions & 3 deletions playbooks/templates/rax-maas/nova_consoleauth_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('nova_consoleauth_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
return new AlarmStatus(WARNING, "Nova consoleauth service can't reach API");
if (metric["nova-consoleauth_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "nova consoleauth service can't reach API");
}
if (metric["nova-consoleauth_status"] != 1) {
if (metric["nova-consoleauth_status"] == "No") {
return new AlarmStatus(CRITICAL, "nova-consoleauth down");
}
6 changes: 3 additions & 3 deletions playbooks/templates/rax-maas/nova_scheduler_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ alarms :
disabled : {{ (('nova_scheduler_status--'+inventory_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["client_success"] != 1) {
return new AlarmStatus(WARNING, "Nova scheduler service can't reach API");
if (metric["nova-scheduler_status"] regex ".*cannot reach API.*") {
return new AlarmStatus(WARNING, "nova scheduler service can't reach API");
}
if (metric["nova-scheduler_status"] != 1) {
if (metric["nova-scheduler_status"] == 'No') {
return new AlarmStatus(CRITICAL, "nova-scheduler down");
}

0 comments on commit ab81b07

Please sign in to comment.