From 3873dd56266f006ae928b2e4d4c129aad27270ae Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Tue, 16 Jul 2024 13:07:23 +0100
Subject: [PATCH 1/4] Improvements to further stabilise the manila-ganesha tests

This improves the manila-ganesha tests by a) checking that ceph is
stable/healthy, and b) ensuring that, after the restart of
manila-ganesha, it is stable/healthy.
---
 .../charm_tests/ceilometer_agent/tests.py     | 11 ++-
 zaza/openstack/charm_tests/manila/tests.py    | 56 ++++++++++++++
 .../charm_tests/manila_ganesha/tests.py       | 75 ++++++++++++++++++-
 zaza/openstack/utilities/__init__.py          |  1 -
 zaza/openstack/utilities/openstack.py         | 10 ++-
 5 files changed, 144 insertions(+), 9 deletions(-)

diff --git a/zaza/openstack/charm_tests/ceilometer_agent/tests.py b/zaza/openstack/charm_tests/ceilometer_agent/tests.py
index cdb196751..572d92032 100644
--- a/zaza/openstack/charm_tests/ceilometer_agent/tests.py
+++ b/zaza/openstack/charm_tests/ceilometer_agent/tests.py
@@ -69,11 +69,13 @@ def test_400_gnocchi_metrics(self):
         expected_metric_names = self.__get_expected_metric_names(
             current_os_release)
+        logging.info("Expected metric names: %s",
+                     ', '.join(sorted(expected_metric_names)))
 
         min_timeout_seconds = 500
-        polling_interval_seconds = (
+        polling_interval_seconds = int(
             openstack_utils.get_application_config_option(
-                self.application_name, 'polling-interval'))
+                self.application_name, 'polling-interval') or 30)
         timeout_seconds = max(10 * polling_interval_seconds,
                               min_timeout_seconds)
         logging.info('Giving ceilometer-agent {}s to publish all metrics to '
@@ -81,12 +83,17 @@ def test_400_gnocchi_metrics(self):
 
         max_time = time.time() + timeout_seconds
         while time.time() < max_time:
+            logging.info("... Looking:")
             found_metric_names = {metric['name']
                                   for metric in gnocchi.metric.list()}
+            logging.info("... found metric names: %s",
+                         ', '.join(sorted(found_metric_names)))
             missing_metric_names = expected_metric_names - found_metric_names
             if len(missing_metric_names) == 0:
                 logging.info('All expected metrics found.')
                 break
+            logging.info("... still missing: %s",
+                         ', '.join(sorted(missing_metric_names)))
             time.sleep(polling_interval_seconds)
 
         unexpected_found_metric_names = (
diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 3a3f54c00..d721ba5b9 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -231,6 +231,35 @@ def _mount_share_on_instance(self, instance_ip, ssh_user_name,
                 command=ssh_cmd,
                 verify=verify_status)
 
+    def _umount_share_on_instance(self, instance_ip, ssh_user_name,
+                                  ssh_private_key, share_path):
+        """Unmount a share from a Nova instance.
+
+        The umount command is executed via SSH.
+
+        :param instance_ip: IP of the Nova instance.
+        :type instance_ip: string
+        :param ssh_user_name: SSH user name.
+        :type ssh_user_name: string
+        :param ssh_private_key: SSH private key.
+        :type ssh_private_key: string
+        :param share_path: share network path.
+        :type share_path: string
+        """
+        ssh_cmd = 'sudo umount {mount_dir}'.format(mount_dir=self.mount_dir)
+
+        for attempt in tenacity.Retrying(
+                stop=tenacity.stop_after_attempt(5),
+                wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
+            with attempt:
+                openstack_utils.ssh_command(
+                    vm_name="instance-{}".format(instance_ip),
+                    ip=instance_ip,
+                    username=ssh_user_name,
+                    privkey=ssh_private_key,
+                    command=ssh_cmd,
+                    verify=verify_status)
+
     @tenacity.retry(
         stop=tenacity.stop_after_attempt(5),
         wait=tenacity.wait_exponential(multiplier=3, min=2, max=10))
@@ -323,6 +352,23 @@ def _restart_share_instance(self):
         """
         return False
 
+    def _wait_for_ceph_healthy(self):
+        """Wait until the ceph health is healthy"""
+        logging.info("Waiting for ceph to be healthy")
+        for attempt in tenacity.Retrying(
+            wait=tenacity.wait_fixed(5),
+            stop=tenacity.stop_after_attempt(10),
+            reraise=True
+        ):
+            logging.info("... testing Ceph")
+            with attempt:
+                self.assertEqual(
+                    zaza.model.run_on_leader(
+                        "ceph-mon", "sudo ceph health")["Code"],
+                    "0"
+                )
+        logging.info("...Ceph is healthy")
+
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -346,6 +392,10 @@ def test_manila_share(self):
         fip_1 = neutron_tests.floating_ips_from_instance(instance_1)[0]
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
+        # force a restart to clear out any clients that may be hanging around
+        # due to restarts on manila-ganesha during deployment.
+        self._restart_share_instance()
+        self._wait_for_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
@@ -403,3 +453,9 @@ def test_manila_share(self):
             fip_2, ssh_user_name, privkey, share_path)
         self._validate_testing_file_from_instance(
             fip_2, ssh_user_name, privkey)
+
+        # now umount the share on each instance to allow cleaning up.
+        self._umount_share_on_instance(
+            fip_1, ssh_user_name, privkey, share_path)
+        self._umount_share_on_instance(
+            fip_2, ssh_user_name, privkey, share_path)
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index 158fb1ab6..d5c934a6b 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -23,6 +23,7 @@
     MANILA_GANESHA_TYPE_NAME,
 )
 
+from zaza import sync_wrapper
 import zaza.openstack.utilities.generic as generic_utils
 import zaza.openstack.charm_tests.manila.tests as manila_tests
 import zaza.model
@@ -54,29 +55,99 @@ def _restart_share_instance(self):
             self.model_name, ganeshas))
 
         for ganesha in ganeshas:
-            ganesha_unit = zaza.model.get_units(ganesha)[0]
+            units = zaza.model.get_units(ganesha)
+            ganesha_unit = units[0]
 
             hacluster_unit = zaza_utils_juju.get_subordinate_units(
                 [ganesha_unit.entity_id], charm_name='hacluster')
             logging.info('Ganesha in hacluster mode: {}'.format(
                 bool(hacluster_unit)))
 
-            for unit in zaza.model.get_units(ganesha):
+            for unit in units:
                 if hacluster_unit:
                     # While we really only need to run this on the machine
                     # hosting nfs-ganesha and manila-share, running it
                     # everywhere isn't harmful. Pacemaker handles restarting
                    # the services.
+                    logging.info(
+                        "For %s, running systemctl stop manila-share "
+                        "nfs-ganesha", unit.entity_id)
                     zaza.model.run_on_unit(
                         unit.entity_id,
                         "systemctl stop manila-share nfs-ganesha")
                 else:
+                    logging.info(
+                        "For %s, running systemctl restart manila-share "
+                        "nfs-ganesha", unit.entity_id)
                     zaza.model.run_on_unit(
                         unit.entity_id,
                         "systemctl restart manila-share nfs-ganesha")
 
+            if hacluster_unit:
+                # now ensure that manila-share is running on at least one
+                # unit.
+                unit_names = [unit.entity_id for unit in units]
+                logging.info(
+                    "Blocking until at least one manila-share is running")
+                self._block_until_at_least_one_unit_running_services(
+                    unit_names, ['manila-share'])
+            else:
+                # block until they are all running.
+                for unit in units:
+                    zaza.model.block_until_service_status(
+                        unit_name=unit.entity_id,
+                        services=['manila-share'],
+                        target_status='running'
+                    )
+
         return True
 
+    @staticmethod
+    def _block_until_at_least_one_unit_running_services(
+            units, services, model_name=None, timeout=None):
+        """Block until at least one unit is running the provided services.
+
+        :param units: List of names of the units to check
+        :type units: List[str]
+        :param services: List of services to check for
+        :type services: List[str]
+        """
+        async def _check_services():
+            for unit_name in units:
+                running_services = {}
+                for service in services:
+                    command = r"pidof -x '{}'".format(service)
+                    out = await zaza.model.async_run_on_unit(
+                        unit_name,
+                        command,
+                        model_name=model_name,
+                        timeout=timeout)
+                    response_size = len(out['Stdout'].strip())
+                    # response_size == 0 means the service is NOT running.
+                    running_services[service] = (response_size > 0)
+                states = ', '.join('{}: {}'.format(k, v)
+                                   for k, v in
+                                   running_services.items())
+                # Note this blocks the async call, but we don't really care as
+                # it should only be a short time.
+ logging.info('For unit {unit}, services: {states}' + .format(unit=unit_name, states=states)) + active_services = [ + service + for service, running in running_services.items() + if running] + if len(active_services) == len(services): + # all services are running + return True + # No unit has all services running + return False + + async def _await_block(): + await zaza.model.async_block_until( + _check_services, timeout=timeout) + + sync_wrapper(_await_block)() + def _run_nrpe_check_command(self, commands): try: zaza.model.get_application("nrpe") diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py index 02826ff7f..5119e0b7a 100644 --- a/zaza/openstack/utilities/__init__.py +++ b/zaza/openstack/utilities/__init__.py @@ -118,7 +118,6 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0, 'retry_exceptions': retry_exceptions, 'log': _log, } - _log(f"ObjectRetrierWraps: wrapping {self.__obj}") def __getattr__(self, name): """Get attribute; delegates to wrapped object.""" diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py index e87f8547d..44df90208 100644 --- a/zaza/openstack/utilities/openstack.py +++ b/zaza/openstack/utilities/openstack.py @@ -86,7 +86,7 @@ from zaza.openstack.utilities import ( exceptions, generic as generic_utils, - ObjectRetrierWraps, + retry_on_connect_failure, ) import zaza.utilities.networking as network_utils @@ -385,7 +385,7 @@ def get_nova_session_client(session, version=None): """ if not version: version = 2 - return ObjectRetrierWraps( + return retry_on_connect_failure( novaclient_client.Client(version, session=session)) @@ -2323,7 +2323,9 @@ def get_remote_ca_cert_file(application, model_name=None): model.scp_from_unit( unit, cert_file, - _tmp_ca.name) + _tmp_ca.name, + scp_opts='-q', + ) except JujuError: continue # ensure that the path to put the local cacert in actually exists. @@ -2565,7 +2567,7 @@ def resource_removed(resource, msg='resource', wait_exponential_multiplier=1, wait_iteration_max_time=60, - stop_after_attempt=8): + stop_after_attempt=30): """Wait for an openstack resource to no longer be present. :param resource: pointer to os resource type, ex: heat_client.stacks From 391136151c062c741545419d2be744b87ef24f40 Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Tue, 16 Jul 2024 14:11:06 +0100 Subject: [PATCH 2/4] Fix pep8 and remove non-working scp_opts The scp_opts don't seem to be able to quiet the scp: error for the certificate. 
---
 zaza/openstack/charm_tests/ceph/tests.py   | 3 +--
 zaza/openstack/charm_tests/manila/tests.py | 5 ++---
 zaza/openstack/utilities/openstack.py      | 4 +---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/zaza/openstack/charm_tests/ceph/tests.py b/zaza/openstack/charm_tests/ceph/tests.py
index 85de53dcf..d485fe871 100644
--- a/zaza/openstack/charm_tests/ceph/tests.py
+++ b/zaza/openstack/charm_tests/ceph/tests.py
@@ -272,8 +272,7 @@ def test_ceph_encryption(self):
         source = '/tmp/dmcrypt-keys/*'
         zaza_model.scp_from_unit(unit_name=unit_name,
                                  source=source,
-                                 destination=tempdir,
-                                 scp_opts='-p')
+                                 destination=tempdir)
         for elt in listdir(tempdir):
             file_path = '/'.join([tempdir, elt])
             if path.isfile(file_path):
diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index d721ba5b9..262673220 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -353,7 +353,7 @@ def _restart_share_instance(self):
         return False
 
     def _wait_for_ceph_healthy(self):
-        """Wait until the ceph health is healthy"""
+        """Wait until the ceph health is healthy."""
         logging.info("Waiting for ceph to be healthy")
         for attempt in tenacity.Retrying(
             wait=tenacity.wait_fixed(5),
@@ -365,8 +365,7 @@ def _wait_for_ceph_healthy(self):
                 self.assertEqual(
                     zaza.model.run_on_leader(
                         "ceph-mon", "sudo ceph health")["Code"],
-                    "0"
-                )
+                    "0")
         logging.info("...Ceph is healthy")
 
     def test_manila_share(self):
diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py
index 44df90208..5babdb9f0 100644
--- a/zaza/openstack/utilities/openstack.py
+++ b/zaza/openstack/utilities/openstack.py
@@ -2323,9 +2323,7 @@ def get_remote_ca_cert_file(application, model_name=None):
             model.scp_from_unit(
                 unit,
                 cert_file,
-                _tmp_ca.name,
-                scp_opts='-q',
-            )
+                _tmp_ca.name)
         except JujuError:
             continue
         # ensure that the path to put the local cacert in actually exists.

From 363ff1db0faa3be360154a17b9fdc33da6baf535 Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Thu, 18 Jul 2024 11:40:17 +0100
Subject: [PATCH 3/4] Modify manila tests to evict mds clients when ceph is
 unhealthy

During manila-ganesha charm deployment, zombie ceph mds clients are
sometimes left attached to ceph fs. This locks the ceph fs service and
breaks the test (a manila share can't be created). The bug is annoying
from a test-gate perspective, but it clears itself after 5-10 minutes
and so is not (as currently understood) a production issue.

This patch detects the unhealthy ceph condition and then forcibly
evicts the ceph MDS clients, which allows the test to proceed.
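
For illustration only (not part of the change itself), the eviction
boils down to two ceph-mon commands driven through zaza; a minimal
sketch, assuming a ceph-mon leader unit and the zaza.model API used
elsewhere in this series:

    import json
    import zaza.model

    def evict_mds_clients(model_name=None):
        # ask mds rank 0 for its current client sessions
        out = zaza.model.run_on_leader(
            "ceph-mon", "sudo ceph tell mds.0 client ls",
            model_name=model_name)
        for client in json.loads(out['Stdout']):
            # evicting adds the client to the mds blocklist, which is
            # acceptable for an ephemeral test deployment
            zaza.model.run_on_leader(
                "ceph-mon",
                "sudo ceph tell mds.0 client evict id={}".format(
                    client['id']),
                model_name=model_name)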
Related-Bug: https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
---
 zaza/openstack/charm_tests/manila/tests.py    | 96 +++++++++++++++----
 .../charm_tests/manila_ganesha/tests.py       | 10 +-
 2 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 262673220..99e45eba6 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -16,6 +16,7 @@
 
 """Encapsulate Manila testing."""
 
+import json
 import logging
 
 import tenacity
@@ -352,21 +353,81 @@ def _restart_share_instance(self):
         """
         return False
 
-    def _wait_for_ceph_healthy(self):
+    def _make_ceph_healthy(self, model_name=None):
+        """Force ceph into a healthy status."""
+        # wait for 30 seconds for ceph to get healthy
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=6, interval=5, model_name=model_name)
+        if healthy:
+            return
+        logging.info("Ceph is not healthy: %s", ceph_status)
+        # evict any clients.
+        self._evict_ceph_mds_clients(model_name)
+        self._restart_share_instance()
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=10, interval=15, model_name=model_name)
+
+    def _wait_for_ceph_fs_healthy(
+            self, repeat=30, interval=20, model_name=None):
         """Wait until the ceph health is healthy."""
-        logging.info("Waiting for ceph to be healthy")
-        for attempt in tenacity.Retrying(
-            wait=tenacity.wait_fixed(5),
-            stop=tenacity.stop_after_attempt(10),
-            reraise=True
-        ):
-            logging.info("... testing Ceph")
-            with attempt:
-                self.assertEqual(
-                    zaza.model.run_on_leader(
-                        "ceph-mon", "sudo ceph health")["Code"],
-                    "0")
-        logging.info("...Ceph is healthy")
+        logging.info("Waiting for ceph to be healthy - up to 10 minutes")
+        try:
+            for attempt in tenacity.Retrying(
+                wait=tenacity.wait_fixed(interval),
+                stop=tenacity.stop_after_attempt(repeat),
+                reraise=True,
+            ):
+                logging.info("... checking Ceph")
+                with attempt:
+                    healthy, ceph_status = self._check_ceph_fs_health(
+                        model_name)
+                    if not healthy:
+                        raise RuntimeError("Ceph was unhealthy: {}"
+                                           .format(ceph_status))
+        except RuntimeError:
+            # the RuntimeError just drives the retries; don't re-raise it.
+            pass
+        if healthy:
+            logging.info("...Ceph is healthy")
+        else:
+            logging.info("...Ceph is not healthy %s", ceph_status)
+        return healthy, ceph_status
+
+    @staticmethod
+    def _check_ceph_fs_health(model_name=None):
+        """Check to see if the ceph fs system is healthy."""
+        cmd_result = zaza.model.run_on_leader(
+            "ceph-mon",
+            "sudo ceph status --format=json",
+            model_name=model_name)
+        status = json.loads(cmd_result['Stdout'])
+        ceph_status = status['health']['status']
+        return (ceph_status == "HEALTH_OK"), ceph_status
+
+    @staticmethod
+    def _evict_ceph_mds_clients(model_name=None):
+        """Evict any ceph mds clients present.
+
+        Essentially work around a manila-ganesha deployment bug:
+        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
+        """
+        # NOTE: evicting a client adds it to the mds blocklist; this shouldn't
+        # matter given the ephemeral nature of the test.
+        # get the list of clients.
+        cmd_results = zaza.model.run_on_leader(
+            "ceph-mon", "sudo ceph tell mds.0 client ls",
+            model_name=model_name)
+        result = json.loads(cmd_results['Stdout'])
+        client_ids = [client['id'] for client in result]
+        logging.info("Evicting clients %s", ", ".join(
+            str(c) for c in client_ids))
+        # now evict the clients.
+        for client in client_ids:
+            logging.info("Evicting client %s", client)
+            zaza.model.run_on_leader(
+                "ceph-mon",
+                "sudo ceph tell mds.0 client evict id={}".format(client),
+                model_name=model_name)
 
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -392,9 +453,12 @@ def test_manila_share(self):
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
         # force a restart to clear out any clients that may be hanging around
-        # due to restarts on manila-ganesha during deployment.
+        # due to restarts on manila-ganesha during deployment; this also forces
+        # an HA manila into a stable state.
         self._restart_share_instance()
-        self._wait_for_ceph_healthy()
+        # Clean out any old clients caused by restarting manila-ganesha shares
+        # and ganesha.nfsd daemons.
+        self._make_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index d5c934a6b..6a028173c 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -70,11 +70,15 @@ def _restart_share_instance(self):
                     # everywhere isn't harmful. Pacemaker handles restarting
                     # the services.
                     logging.info(
-                        "For %s, running systemctl stop manila-share "
-                        "nfs-ganesha", unit.entity_id)
+                        "For %s, running systemctl stop manila-share, "
+                        "kill -HUP $(pidof ganesha.nfsd)", unit.entity_id)
+                    zaza.model.run_on_unit(
+                        unit.entity_id,
+                        "systemctl stop manila-share")
                     zaza.model.run_on_unit(
                         unit.entity_id,
-                        "systemctl stop manila-share nfs-ganesha")
+                        'pidof ganesha.nfsd && '
+                        'kill -HUP $(pidof ganesha.nfsd)')
                 else:
                     logging.info(
                         "For %s, running systemctl restart manila-share "

From af56e9e5be1960c89df993268e047bac77df2fe6 Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Thu, 18 Jul 2024 13:39:21 +0100
Subject: [PATCH 4/4] Move ceph eviction code from manila to manila-ganesha

This is so that manila-only tests (i.e. without ceph) will not try to
clean up non-existent ceph units.
---
 zaza/openstack/charm_tests/manila/tests.py    | 84 -----------------
 .../charm_tests/manila_ganesha/tests.py       | 92 +++++++++++++++++++
 2 files changed, 92 insertions(+), 84 deletions(-)

diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 99e45eba6..87e06a7ce 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -16,7 +16,6 @@
 
 """Encapsulate Manila testing."""
 
-import json
 import logging
 
 import tenacity
@@ -353,82 +352,6 @@ def _restart_share_instance(self):
         """
         return False
 
-    def _make_ceph_healthy(self, model_name=None):
-        """Force ceph into a healthy status."""
-        # wait for 30 seconds for ceph to get healthy
-        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
-            repeat=6, interval=5, model_name=model_name)
-        if healthy:
-            return
-        logging.info("Ceph is not healthy: %s", ceph_status)
-        # evict any clients.
-        self._evict_ceph_mds_clients(model_name)
-        self._restart_share_instance()
-        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
-            repeat=10, interval=15, model_name=model_name)
-
-    def _wait_for_ceph_fs_healthy(
-            self, repeat=30, interval=20, model_name=None):
-        """Wait until the ceph health is healthy."""
-        logging.info("Waiting for ceph to be healthy - up to 10 minutes")
-        try:
-            for attempt in tenacity.Retrying(
-                wait=tenacity.wait_fixed(interval),
-                stop=tenacity.stop_after_attempt(repeat),
-                reraise=True,
-            ):
-                logging.info("... checking Ceph")
-                with attempt:
-                    healthy, ceph_status = self._check_ceph_fs_health(
-                        model_name)
-                    if not healthy:
-                        raise RuntimeError("Ceph was unhealthy: {}"
-                                           .format(ceph_status))
-        except RuntimeError:
-            # the RuntimeError just drives the retries; don't re-raise it.
-            pass
-        if healthy:
-            logging.info("...Ceph is healthy")
-        else:
-            logging.info("...Ceph is not healthy %s", ceph_status)
-        return healthy, ceph_status
-
-    @staticmethod
-    def _check_ceph_fs_health(model_name=None):
-        """Check to see if the ceph fs system is healthy."""
-        cmd_result = zaza.model.run_on_leader(
-            "ceph-mon",
-            "sudo ceph status --format=json",
-            model_name=model_name)
-        status = json.loads(cmd_result['Stdout'])
-        ceph_status = status['health']['status']
-        return (ceph_status == "HEALTH_OK"), ceph_status
-
-    @staticmethod
-    def _evict_ceph_mds_clients(model_name=None):
-        """Evict any ceph mds clients present.
-
-        Essentially work around a manila-ganesha deployment bug:
-        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
-        """
-        # NOTE: evicting a client adds it to the mds blocklist; this shouldn't
-        # matter given the ephemeral nature of the test.
-        # get the list of clients.
-        cmd_results = zaza.model.run_on_leader(
-            "ceph-mon", "sudo ceph tell mds.0 client ls",
-            model_name=model_name)
-        result = json.loads(cmd_results['Stdout'])
-        client_ids = [client['id'] for client in result]
-        logging.info("Evicting clients %s", ", ".join(
-            str(c) for c in client_ids))
-        # now evict the clients.
-        for client in client_ids:
-            logging.info("Evicting client %s", client)
-            zaza.model.run_on_leader(
-                "ceph-mon",
-                "sudo ceph tell mds.0 client evict id={}".format(client),
-                model_name=model_name)
-
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -452,13 +375,6 @@ def test_manila_share(self):
         fip_1 = neutron_tests.floating_ips_from_instance(instance_1)[0]
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
-        # force a restart to clear out any clients that may be hanging around
-        # due to restarts on manila-ganesha during deployment; this also forces
-        # an HA manila into a stable state.
-        self._restart_share_instance()
-        # Clean out any old clients caused by restarting manila-ganesha shares
-        # and ganesha.nfsd daemons.
-        self._make_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index 6a028173c..0798a6f70 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -16,6 +16,7 @@
 
 """Encapsulate Manila Ganesha testing."""
 
+import json
 import logging
 
 import tenacity
@@ -219,3 +220,94 @@ def test_905_nrpe_custom_service_checks(self):
         ]
 
         self._run_nrpe_check_command(commands)
+
+    def _make_ceph_healthy(self, model_name=None):
+        """Force ceph into a healthy status."""
+        # wait for 30 seconds for ceph to get healthy
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=6, interval=5, model_name=model_name)
+        if healthy:
+            return
+        logging.info("Ceph is not healthy: %s", ceph_status)
+        # evict any clients.
+        self._evict_ceph_mds_clients(model_name)
+        self._restart_share_instance()
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=10, interval=15, model_name=model_name)
+
+    def _wait_for_ceph_fs_healthy(
+            self, repeat=30, interval=20, model_name=None):
+        """Wait until the ceph health is healthy."""
+        logging.info("Waiting for ceph to be healthy ...")
+        try:
+            for attempt in tenacity.Retrying(
+                wait=tenacity.wait_fixed(interval),
+                stop=tenacity.stop_after_attempt(repeat),
+                reraise=True,
+            ):
+                logging.info("... checking Ceph")
+                with attempt:
+                    healthy, ceph_status = self._check_ceph_fs_health(
+                        model_name)
+                    if not healthy:
+                        raise RuntimeError("Ceph was unhealthy: {}"
+                                           .format(ceph_status))
+        except RuntimeError:
+            # the RuntimeError just drives the retries; don't re-raise it.
+            pass
+        if healthy:
+            logging.info("...Ceph is healthy")
+        else:
+            logging.info("...Ceph is not healthy %s", ceph_status)
+        return healthy, ceph_status
+
+    @staticmethod
+    def _check_ceph_fs_health(model_name=None):
+        """Check to see if the ceph fs system is healthy."""
+        cmd_result = zaza.model.run_on_leader(
+            "ceph-mon",
+            "sudo ceph status --format=json",
+            model_name=model_name)
+        status = json.loads(cmd_result['Stdout'])
+        ceph_status = status['health']['status']
+        return (ceph_status == "HEALTH_OK"), ceph_status
+
+    @staticmethod
+    def _evict_ceph_mds_clients(model_name=None):
+        """Evict any ceph mds clients present.
+
+        Essentially work around a manila-ganesha deployment bug:
+        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
+        """
+        # NOTE: evicting a client adds it to the mds blocklist; this shouldn't
+        # matter given the ephemeral nature of the test.
+        # get the list of clients.
+        cmd_results = zaza.model.run_on_leader(
+            "ceph-mon", "sudo ceph tell mds.0 client ls",
+            model_name=model_name)
+        result = json.loads(cmd_results['Stdout'])
+        client_ids = [client['id'] for client in result]
+        logging.info("Evicting clients %s", ", ".join(
+            str(c) for c in client_ids))
+        # now evict the clients.
+        for client in client_ids:
+            logging.info("Evicting client %s", client)
+            zaza.model.run_on_leader(
+                "ceph-mon",
+                "sudo ceph tell mds.0 client evict id={}".format(client),
+                model_name=model_name)
+
+    def test_manila_share(self):
+        """Test that a manila-ganesha share can be accessed on two instances.
+
+        This overrides the base manila test by prefixing a make-ceph-healthy
+        stage.
+        """
+        # force a restart to clear out any clients that may be hanging around
+        # due to restarts on manila-ganesha during deployment; this also forces
+        # an HA manila into a stable state.
+        self._restart_share_instance()
+        # Clean out any old clients caused by restarting manila-ganesha shares
+        # and ganesha.nfsd daemons.
+        self._make_ceph_healthy()
+        super().test_manila_share()
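
A closing note on the retry idiom used throughout this series: tenacity's
Retrying iterator yields per-attempt context managers, an exception raised
inside `with attempt:` marks that attempt as failed and schedules the next
retry, and reraise=True re-raises the final attempt's exception instead of
wrapping it in tenacity.RetryError. A minimal self-contained sketch of the
wait-until-healthy pattern (the `check` callable here is hypothetical):

    import tenacity

    def wait_until(check, repeat=30, interval=20):
        # Poll `check` until it returns True, or until `repeat` attempts
        # spaced `interval` seconds apart have been made.
        try:
            for attempt in tenacity.Retrying(
                    wait=tenacity.wait_fixed(interval),
                    stop=tenacity.stop_after_attempt(repeat),
                    reraise=True):
                with attempt:
                    if not check():
                        # raising marks this attempt as failed -> retry
                        raise RuntimeError("not ready yet")
        except RuntimeError:
            return False
        return True

For example, wait_until(lambda: False, repeat=2, interval=1) returns False
after roughly one second, mirroring how _wait_for_ceph_fs_healthy swallows
the final RuntimeError and reports the last observed status instead.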