From 3873dd56266f006ae928b2e4d4c129aad27270ae Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Tue, 16 Jul 2024 13:07:23 +0100 Subject: [PATCH] Improvements to further stabilise the manila-ganesha tests This improves the manila-ganesha tests by a) checking that ceph is stable/healthy, and b) ensuring that, after the restart of manila-ganesha, it is stable/healthy. --- .../charm_tests/ceilometer_agent/tests.py | 11 ++- zaza/openstack/charm_tests/manila/tests.py | 56 ++++++++++++++ .../charm_tests/manila_ganesha/tests.py | 75 ++++++++++++++++++- zaza/openstack/utilities/__init__.py | 1 - zaza/openstack/utilities/openstack.py | 10 ++- 5 files changed, 144 insertions(+), 9 deletions(-) diff --git a/zaza/openstack/charm_tests/ceilometer_agent/tests.py b/zaza/openstack/charm_tests/ceilometer_agent/tests.py index cdb196751..572d92032 100644 --- a/zaza/openstack/charm_tests/ceilometer_agent/tests.py +++ b/zaza/openstack/charm_tests/ceilometer_agent/tests.py @@ -69,11 +69,13 @@ def test_400_gnocchi_metrics(self): expected_metric_names = self.__get_expected_metric_names( current_os_release) + logging.info("Expected metric names: %s", + ', '.join(sorted(expected_metric_names))) min_timeout_seconds = 500 - polling_interval_seconds = ( + polling_interval_seconds = int( openstack_utils.get_application_config_option( - self.application_name, 'polling-interval')) + self.application_name, 'polling-interval') or 30) timeout_seconds = max(10 * polling_interval_seconds, min_timeout_seconds) logging.info('Giving ceilometer-agent {}s to publish all metrics to ' @@ -81,12 +83,17 @@ def test_400_gnocchi_metrics(self): max_time = time.time() + timeout_seconds while time.time() < max_time: + logging.info("... Looking:") found_metric_names = {metric['name'] for metric in gnocchi.metric.list()} + logging.info("... 
found metric names: %s", + ', '.join(sorted(found_metric_names))) missing_metric_names = expected_metric_names - found_metric_names if len(missing_metric_names) == 0: logging.info('All expected metrics found.') break + logging.info("... still missing: %s", + ', '.join(sorted(missing_metric_names))) time.sleep(polling_interval_seconds) unexpected_found_metric_names = ( diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py index 3a3f54c00..d721ba5b9 100644 --- a/zaza/openstack/charm_tests/manila/tests.py +++ b/zaza/openstack/charm_tests/manila/tests.py @@ -231,6 +231,35 @@ def _mount_share_on_instance(self, instance_ip, ssh_user_name, command=ssh_cmd, verify=verify_status) + def _umount_share_on_instance(self, instance_ip, ssh_user_name, + ssh_private_key, share_path): + """Umount a share from a Nova instance. + + The umount command is executed via SSH. + + :param instance_ip: IP of the Nova instance. + :type instance_ip: string + :param ssh_user_name: SSH user name. + :type ssh_user_name: string + :param ssh_private_key: SSH private key. + :type ssh_private_key: string + :param share_path: share network path. 
+ :type share_path: string + """ + ssh_cmd = 'sudo umount {mount_dir}'.format(mount_dir=self.mount_dir) + + for attempt in tenacity.Retrying( + stop=tenacity.stop_after_attempt(5), + wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)): + with attempt: + openstack_utils.ssh_command( + vm_name="instance-{}".format(instance_ip), + ip=instance_ip, + username=ssh_user_name, + privkey=ssh_private_key, + command=ssh_cmd, + verify=verify_status) + @tenacity.retry( stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)) @@ -323,6 +352,23 @@ def _restart_share_instance(self): """ return False + def _wait_for_ceph_healthy(self): + """Wait until the ceph health is healthy""" + logging.info("Waiting for ceph to be healthy") + for attempt in tenacity.Retrying( + wait=tenacity.wait_fixed(5), + stop=tenacity.stop_after_attempt(10), + reraise=True + ): + logging.info("... testing Ceph") + with attempt: + self.assertEqual( + zaza.model.run_on_leader( + "ceph-mon", "sudo ceph health")["Code"], + "0" + ) + logging.info("...Ceph is healthy") + def test_manila_share(self): """Test that a Manila share can be accessed on two instances. @@ -346,6 +392,10 @@ def test_manila_share(self): fip_1 = neutron_tests.floating_ips_from_instance(instance_1)[0] fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0] + # force a restart to clear out any clients that may be hanging around + # due to restarts on manila-ganesha during deployment. + self._restart_share_instance() + self._wait_for_ceph_healthy() # Create a share share = self.manila_client.shares.create( share_type=self.share_type_name, @@ -403,3 +453,9 @@ def test_manila_share(self): fip_2, ssh_user_name, privkey, share_path) self._validate_testing_file_from_instance( fip_2, ssh_user_name, privkey) + + # now umount the share on each instance to allow cleaning up. 
+ self._umount_share_on_instance( + fip_1, ssh_user_name, privkey, share_path) + self._umount_share_on_instance( + fip_2, ssh_user_name, privkey, share_path) diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py index 158fb1ab6..d5c934a6b 100644 --- a/zaza/openstack/charm_tests/manila_ganesha/tests.py +++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py @@ -23,6 +23,7 @@ MANILA_GANESHA_TYPE_NAME, ) +from zaza import sync_wrapper import zaza.openstack.utilities.generic as generic_utils import zaza.openstack.charm_tests.manila.tests as manila_tests import zaza.model @@ -54,29 +55,99 @@ def _restart_share_instance(self): self.model_name, ganeshas)) for ganesha in ganeshas: - ganesha_unit = zaza.model.get_units(ganesha)[0] + units = zaza.model.get_units(ganesha) + ganesha_unit = units[0] hacluster_unit = zaza_utils_juju.get_subordinate_units( [ganesha_unit.entity_id], charm_name='hacluster') logging.info('Ganesha in hacluster mode: {}'.format( bool(hacluster_unit))) - for unit in zaza.model.get_units(ganesha): + for unit in units: if hacluster_unit: # While we really only need to run this on the machine # hosting # nfs-ganesha and manila-share, running it # everywhere isn't harmful. Pacemaker handles restarting # the services + logging.info( + "For %s, running systemctl stop manila-share " + "nfs-ganesha", unit.entity_id) zaza.model.run_on_unit( unit.entity_id, "systemctl stop manila-share nfs-ganesha") else: + logging.info( + "For %s, running systemctl restart manila-share " + "nfs-ganesha", unit.entity_id) zaza.model.run_on_unit( unit.entity_id, "systemctl restart manila-share nfs-ganesha") + if hacluster_unit: + # now ensure that at least one manila-share and nfs-ganesha is + # at least running. 
+ unit_names = [unit.entity_id for unit in units] + logging.info( + "Blocking until at least one manila-share is running") + self._block_until_at_least_one_unit_running_services( + unit_names, ['manila-share']) + else: + # block until they are all running. + for unit in units: + zaza.model.block_until_service_status( + unit_name=unit.entity_id, + services=['manila-share'], + target_status='running' + ) + return True + @staticmethod + def _block_until_at_least_one_unit_running_services( + units, services, model_name=None, timeout=None): + """Block until at least one unit is running the provided services. + + :param units: List of names of the units to check + :type units: List[str] + :param services: List of services to check + :type services: List[str] + """ + async def _check_services(): + for unit_name in units: + running_services = {} + for service in services: + command = r"pidof -x '{}'".format(service) + out = await zaza.model.async_run_on_unit( + unit_name, + command, + model_name=model_name, + timeout=timeout) + response_size = len(out['Stdout'].strip()) + # response_size == 0 means NOT running. + running_services[service] = (response_size > 0) + states = ', '.join('{}: {}'.format(k, v) + for k, v in + running_services.items()) + # Note this blocks the async call, but we don't really care as + # it should only be a short time. 
+ logging.info('For unit {unit}, services: {states}' + .format(unit=unit_name, states=states)) + active_services = [ + service + for service, running in running_services.items() + if running] + if len(active_services) == len(services): + # all services are running + return True + # No unit has all services running + return False + + async def _await_block(): + await zaza.model.async_block_until( + _check_services, timeout=timeout) + + sync_wrapper(_await_block)() + def _run_nrpe_check_command(self, commands): try: zaza.model.get_application("nrpe") diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py index 02826ff7f..5119e0b7a 100644 --- a/zaza/openstack/utilities/__init__.py +++ b/zaza/openstack/utilities/__init__.py @@ -118,7 +118,6 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0, 'retry_exceptions': retry_exceptions, 'log': _log, } - _log(f"ObjectRetrierWraps: wrapping {self.__obj}") def __getattr__(self, name): """Get attribute; delegates to wrapped object.""" diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py index e87f8547d..44df90208 100644 --- a/zaza/openstack/utilities/openstack.py +++ b/zaza/openstack/utilities/openstack.py @@ -86,7 +86,7 @@ from zaza.openstack.utilities import ( exceptions, generic as generic_utils, - ObjectRetrierWraps, + retry_on_connect_failure, ) import zaza.utilities.networking as network_utils @@ -385,7 +385,7 @@ def get_nova_session_client(session, version=None): """ if not version: version = 2 - return ObjectRetrierWraps( + return retry_on_connect_failure( novaclient_client.Client(version, session=session)) @@ -2323,7 +2323,9 @@ def get_remote_ca_cert_file(application, model_name=None): model.scp_from_unit( unit, cert_file, - _tmp_ca.name) + _tmp_ca.name, + scp_opts='-q', + ) except JujuError: continue # ensure that the path to put the local cacert in actually exists. 
@@ -2565,7 +2567,7 @@ def resource_removed(resource, msg='resource', wait_exponential_multiplier=1, wait_iteration_max_time=60, - stop_after_attempt=8): + stop_after_attempt=30): """Wait for an openstack resource to no longer be present. :param resource: pointer to os resource type, ex: heat_client.stacks