Improvements to further stabilise the manila-ganesha tests #1246

Merged: 4 commits, Jul 18, 2024
11 changes: 9 additions & 2 deletions zaza/openstack/charm_tests/ceilometer_agent/tests.py
@@ -69,24 +69,31 @@ def test_400_gnocchi_metrics(self):

expected_metric_names = self.__get_expected_metric_names(
current_os_release)
logging.info("Expected metric names: %s",
', '.join(sorted(expected_metric_names)))

min_timeout_seconds = 500
- polling_interval_seconds = (
+ polling_interval_seconds = int(
openstack_utils.get_application_config_option(
- self.application_name, 'polling-interval'))
+ self.application_name, 'polling-interval') or 30)
timeout_seconds = max(10 * polling_interval_seconds,
min_timeout_seconds)
logging.info('Giving ceilometer-agent {}s to publish all metrics to '
'gnocchi...'.format(timeout_seconds))

max_time = time.time() + timeout_seconds
while time.time() < max_time:
logging.info("... Looking:")
found_metric_names = {metric['name']
for metric in gnocchi.metric.list()}
logging.info("... found metric names: %s",
', '.join(sorted(found_metric_names)))
missing_metric_names = expected_metric_names - found_metric_names
if len(missing_metric_names) == 0:
logging.info('All expected metrics found.')
break
logging.info("... still missing: %s",
', '.join(sorted(missing_metric_names)))
time.sleep(polling_interval_seconds)

unexpected_found_metric_names = (
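A note on the polling-interval change above: the charm config option can come back unset, and int(None) raises TypeError, which is presumably what the "or 30" fallback guards against. A minimal sketch of the resulting behaviour (the helper name below is illustrative, not part of the change):

def resolve_polling_interval(raw_value, default=30):
    """Return a usable polling interval in seconds."""
    # raw_value may be None (option unset) or a string from charm config.
    return int(raw_value or default)

assert resolve_polling_interval(None) == 30      # unset option -> default
assert resolve_polling_interval('300') == 300    # configured value wins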
3 changes: 1 addition & 2 deletions zaza/openstack/charm_tests/ceph/tests.py
@@ -272,8 +272,7 @@ def test_ceph_encryption(self):
source = '/tmp/dmcrypt-keys/*'
zaza_model.scp_from_unit(unit_name=unit_name,
source=source,
- destination=tempdir,
- scp_opts='-p')
+ destination=tempdir)
for elt in listdir(tempdir):
file_path = '/'.join([tempdir, elt])
if path.isfile(file_path):
35 changes: 35 additions & 0 deletions zaza/openstack/charm_tests/manila/tests.py
@@ -231,6 +231,35 @@ def _mount_share_on_instance(self, instance_ip, ssh_user_name,
command=ssh_cmd,
verify=verify_status)

def _umount_share_on_instance(self, instance_ip, ssh_user_name,
ssh_private_key, share_path):
"""Umount a share from a Nova instance.

The mount command is executed via SSH.

:param instance_ip: IP of the Nova instance.
:type instance_ip: string
:param ssh_user_name: SSH user name.
:type ssh_user_name: string
:param ssh_private_key: SSH private key.
:type ssh_private_key: string
:param share_path: share network path.
:type share_path: string
"""
ssh_cmd = 'sudo umount {mount_dir}'.format(mount_dir=self.mount_dir)

for attempt in tenacity.Retrying(
stop=tenacity.stop_after_attempt(5),
wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
with attempt:
openstack_utils.ssh_command(
vm_name="instance-{}".format(instance_ip),
ip=instance_ip,
username=ssh_user_name,
privkey=ssh_private_key,
command=ssh_cmd,
verify=verify_status)

@tenacity.retry(
stop=tenacity.stop_after_attempt(5),
wait=tenacity.wait_exponential(multiplier=3, min=2, max=10))
Expand Down Expand Up @@ -403,3 +432,9 @@ def test_manila_share(self):
fip_2, ssh_user_name, privkey, share_path)
self._validate_testing_file_from_instance(
fip_2, ssh_user_name, privkey)

# now unmount the share on each instance so it can be cleaned up.
self._umount_share_on_instance(
fip_1, ssh_user_name, privkey, share_path)
self._umount_share_on_instance(
fip_2, ssh_user_name, privkey, share_path)
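The new unmount helper above retries its SSH command, presumably because the mount can still be busy right after the test uses it. A minimal sketch of the same tenacity per-attempt pattern in isolation (run_cmd is a stand-in for the real openstack_utils.ssh_command call):

import tenacity

def umount_with_retries(run_cmd, mount_dir):
    # Each attempt re-runs the command; a failure inside the "with attempt"
    # block triggers the next, exponentially backed-off, attempt.
    for attempt in tenacity.Retrying(
            stop=tenacity.stop_after_attempt(5),
            wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
        with attempt:
            run_cmd('sudo umount {}'.format(mount_dir))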
173 changes: 170 additions & 3 deletions zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -16,13 +16,15 @@

"""Encapsulate Manila Ganesha testing."""

import json
import logging
import tenacity

from zaza.openstack.charm_tests.manila_ganesha.setup import (
MANILA_GANESHA_TYPE_NAME,
)

from zaza import sync_wrapper
import zaza.openstack.utilities.generic as generic_utils
import zaza.openstack.charm_tests.manila.tests as manila_tests
import zaza.model
@@ -54,29 +56,103 @@ def _restart_share_instance(self):
self.model_name,
ganeshas))
for ganesha in ganeshas:
- ganesha_unit = zaza.model.get_units(ganesha)[0]
+ units = zaza.model.get_units(ganesha)
+ ganesha_unit = units[0]
hacluster_unit = zaza_utils_juju.get_subordinate_units(
[ganesha_unit.entity_id],
charm_name='hacluster')
logging.info('Ganesha in hacluster mode: {}'.format(
bool(hacluster_unit)))

- for unit in zaza.model.get_units(ganesha):
+ for unit in units:
if hacluster_unit:
# While we really only need to run this on the machine
# hosting nfs-ganesha and manila-share, running it
# everywhere isn't harmful. Pacemaker handles restarting
# the services.
logging.info(
"For %s, running systemctl stop manila-share, "
"kill -HUP pidof ganesha.nfsd", unit.entity_id)
zaza.model.run_on_unit(
unit.entity_id,
"systemctl stop manila-share nfs-ganesha")
"systemctl stop manila-share")
zaza.model.run_on_unit(
unit.entity_id,
'pidof ganesha.nfsd && '
'kill -HUP $(pidof ganesha.nfsd)')
else:
logging.info(
"For %s, running systemctl restart manila-share "
"nfs-ganesha", unit.entity_id)
zaza.model.run_on_unit(
unit.entity_id,
"systemctl restart manila-share nfs-ganesha")

if hacluster_unit:
# now ensure that manila-share is running on at least
# one unit.
unit_names = [unit.entity_id for unit in units]
logging.info(
"Blocking until at least one manila-share is running")
self._block_until_at_least_one_unit_running_services(
unit_names, ['manila-share'])
else:
# block until they are all running.
for unit in units:
zaza.model.block_until_service_status(
unit_name=unit.entity_id,
services=['manila-share'],
target_status='running'
)

return True

@staticmethod
def _block_until_at_least_one_unit_running_services(
units, services, model_name=None, timeout=None):
"""Block until at least one unit is running the provided services.

:param units: List of unit names to check
:type units: List[str]
:param services: List of services that must all be running on a unit
:type services: List[str]
:param model_name: Name of the model to query (optional)
:type model_name: str
:param timeout: Seconds to wait before giving up (optional)
:type timeout: float
"""
async def _check_services():
for unit_name in units:
running_services = {}
for service in services:
command = r"pidof -x '{}'".format(service)
out = await zaza.model.async_run_on_unit(
unit_name,
command,
model_name=model_name,
timeout=timeout)
response_size = len(out['Stdout'].strip())
# response_size == 0 means NOT running.
running_services[service] = (response_size > 0)
states = ', '.join('{}: {}'.format(k, v)
for k, v in
running_services.items())
# Note this blocks the async call, but we don't really care as
# it should only be a short time.
logging.info('For unit {unit}, services: {states}'
.format(unit=unit_name, states=states))
active_services = [
service
for service, running in running_services.items()
if running]
if len(active_services) == len(services):
# all services are running
return True
# No unit has all services running
return False

async def _await_block():
await zaza.model.async_block_until(
_check_services, timeout=timeout)

sync_wrapper(_await_block)()
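A hypothetical call of the helper above, mirroring how the hacluster branch of _restart_share_instance uses it (unit names and timeout are illustrative only):

# Pacemaker may keep manila-share running on only one unit at a time, so the
# check passes as soon as any single unit reports the service.
unit_names = ['manila-ganesha/0', 'manila-ganesha/1', 'manila-ganesha/2']
self._block_until_at_least_one_unit_running_services(
    unit_names, ['manila-share'], timeout=900)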

def _run_nrpe_check_command(self, commands):
try:
zaza.model.get_application("nrpe")
@@ -144,3 +220,94 @@ def test_905_nrpe_custom_service_checks(self):
]

self._run_nrpe_check_command(commands)

def _make_ceph_healthy(self, model_name=None):
"""Force ceph into a healthy status."""
# wait up to 30 seconds (6 x 5s) for ceph to become healthy on its own
healthy, ceph_status = self._wait_for_ceph_fs_healthy(
repeat=6, interval=5, model_name=model_name)
if healthy:
return
logging.info("Ceph is not healthy: %s", ceph_status)
# evict any clients.
self._evict_ceph_mds_clients(model_name)
self._restart_share_instance()
healthy, ceph_status = self._wait_for_ceph_fs_healthy(
repeat=10, interval=15, model_name=model_name)

def _wait_for_ceph_fs_healthy(
self, repeat=30, interval=20, model_name=None):
"""Wait until the ceph health is healthy."""
logging.info("Waiting for ceph to be healthy ...")
try:
for attempt in tenacity.Retrying(
wait=tenacity.wait_fixed(interval),
stop=tenacity.stop_after_attempt(repeat),
reraise=True,
):
logging.info("... checking Ceph")
with attempt:
healthy, ceph_status = self._check_ceph_fs_health(
model_name)
if not healthy:
raise RuntimeError("Ceph was unhealthy: {}"
.format(ceph_status))
except RuntimeError:
# the exception only drives the retry loop; the final state is
# reported below rather than raised.
pass
if healthy:
logging.info("...Ceph is healthy")
else:
logging.info("...Ceph is not healthy %s", ceph_status)
return healthy, ceph_status

@staticmethod
def _check_ceph_fs_health(model_name=None):
"""Check to see if the ceph fs system is healthy."""
cmd_result = zaza.model.run_on_leader(
"ceph-mon",
"sudo ceph status --format=json",
model_name=model_name)
status = json.loads(cmd_result['Stdout'])
ceph_status = status['health']['status']
return (ceph_status == "HEALTH_OK"), ceph_status
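_check_ceph_fs_health only consumes the health.status field of the JSON that `ceph status --format=json` prints. A trimmed, illustrative payload (not captured from a real cluster) showing how it is read:

import json

# Trimmed sample; only health.status is consumed by the helper above.
sample_stdout = '{"fsid": "...", "health": {"status": "HEALTH_WARN", "checks": {}}}'
status = json.loads(sample_stdout)
ceph_status = status['health']['status']
print(ceph_status == "HEALTH_OK", ceph_status)   # -> False HEALTH_WARN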

@staticmethod
def _evict_ceph_mds_clients(model_name=None):
"""Evict and ceph mds clients present.

Essentially work around a manila-ganesha deployment bug:
https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
"""
# NOTE: evicting a client adds it to the mds blocklist; this
# shouldn't matter for the ephemeral nature of the test.
# get the list of clients.
cmd_results = zaza.model.run_on_leader(
"ceph-mon", "sudo ceph tell mds.0 client ls",
model_name=model_name)
result = json.loads(cmd_results['Stdout'])
client_ids = [client['id'] for client in result]
logging.info("Evicting clients %s", ", ".join(
str(c) for c in client_ids))
# now evict the clients.
for client in client_ids:
logging.info("Evicting client %s", client)
zaza.model.run_on_leader(
"ceph-mon",
"sudo ceph tell mds.0 client evict id={}".format(client),
model_name=model_name)

def test_manila_share(self):
"""Test that a manila-ganesha share can be accessed on two instances.

This overrides the base manila test by first forcing Ceph into a
healthy state.
"""
# force a restart to clear out any clients that may be hanging around
# due to restarts on manila-ganesha during deployment; this also forces
# an HA manila into a stable state.
self._restart_share_instance()
# Clean out any old clients caused by restarting manila-ganesha shares
# and ganesha.nfsd daemons.
self._make_ceph_healthy()
super().test_manila_share()
1 change: 0 additions & 1 deletion zaza/openstack/utilities/__init__.py
@@ -118,7 +118,6 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0,
'retry_exceptions': retry_exceptions,
'log': _log,
}
_log(f"ObjectRetrierWraps: wrapping {self.__obj}")

def __getattr__(self, name):
"""Get attribute; delegates to wrapped object."""
6 changes: 3 additions & 3 deletions zaza/openstack/utilities/openstack.py
@@ -86,7 +86,7 @@
from zaza.openstack.utilities import (
exceptions,
generic as generic_utils,
- ObjectRetrierWraps,
+ retry_on_connect_failure,
)
import zaza.utilities.networking as network_utils

@@ -385,7 +385,7 @@ def get_nova_session_client(session, version=None):
"""
if not version:
version = 2
- return ObjectRetrierWraps(
+ return retry_on_connect_failure(
novaclient_client.Client(version, session=session))
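The nova client is now wrapped with retry_on_connect_failure rather than the broader ObjectRetrierWraps. Its definition is not shown in this diff; a hypothetical sketch, assuming it builds on ObjectRetrierWraps with a restricted exception list:

from keystoneauth1.exceptions.connection import ConnectFailure
from zaza.openstack.utilities import ObjectRetrierWraps

def retry_on_connect_failure_sketch(client, **kwargs):
    # Hypothetical: retry only keystoneauth connection failures, rather than
    # every exception the broader wrapper would retry.
    kwargs['retry_exceptions'] = [ConnectFailure]
    return ObjectRetrierWraps(client, **kwargs)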


@@ -2565,7 +2565,7 @@ def resource_removed(resource,
msg='resource',
wait_exponential_multiplier=1,
wait_iteration_max_time=60,
- stop_after_attempt=8):
+ stop_after_attempt=30):
"""Wait for an openstack resource to no longer be present.

:param resource: pointer to os resource type, ex: heat_client.stacks
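For scale on the stop_after_attempt bump above: assuming the resource_removed parameters map onto a tenacity wait_exponential(multiplier=1, max=60) back-off between attempts (an assumption about the helper's internals, not shown in this diff), the worst-case wait grows from roughly two minutes to roughly 24 minutes:

def max_wait_seconds(attempts, multiplier=1, max_wait=60):
    # Rough upper bound: sum of capped exponential back-offs between attempts.
    return sum(min(multiplier * 2 ** n, max_wait) for n in range(attempts - 1))

print(max_wait_seconds(8))    # 123  -> old stop_after_attempt=8
print(max_wait_seconds(30))   # 1443 -> new stop_after_attempt=30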