Improvements to further stabilise the manila-ganesha tests #1246

Merged: 4 commits, Jul 18, 2024
11 changes: 9 additions & 2 deletions zaza/openstack/charm_tests/ceilometer_agent/tests.py
@@ -69,24 +69,31 @@ def test_400_gnocchi_metrics(self):

expected_metric_names = self.__get_expected_metric_names(
current_os_release)
logging.info("Expected metric names: %s",
', '.join(sorted(expected_metric_names)))

min_timeout_seconds = 500
- polling_interval_seconds = (
+ polling_interval_seconds = int(
openstack_utils.get_application_config_option(
- self.application_name, 'polling-interval'))
+ self.application_name, 'polling-interval') or 30)
timeout_seconds = max(10 * polling_interval_seconds,
min_timeout_seconds)
logging.info('Giving ceilometer-agent {}s to publish all metrics to '
'gnocchi...'.format(timeout_seconds))

max_time = time.time() + timeout_seconds
while time.time() < max_time:
logging.info("... Looking:")
found_metric_names = {metric['name']
for metric in gnocchi.metric.list()}
logging.info("... found metric names: %s",
', '.join(sorted(found_metric_names)))
missing_metric_names = expected_metric_names - found_metric_names
if len(missing_metric_names) == 0:
logging.info('All expected metrics found.')
break
logging.info("... still missing: %s",
', '.join(sorted(missing_metric_names)))
time.sleep(polling_interval_seconds)

unexpected_found_metric_names = (
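A note on the polling-interval change above: the charm config option can come back unset, and int(None) raises TypeError, which is presumably what the "or 30" fallback guards against. A minimal sketch of the resulting behaviour (the helper name below is illustrative, not part of the change):

def resolve_polling_interval(raw_value, default=30):
    """Return a usable polling interval in seconds."""
    # raw_value may be None (option unset) or a string from charm config.
    return int(raw_value or default)

assert resolve_polling_interval(None) == 30      # unset option -> default
assert resolve_polling_interval('300') == 300    # configured value wins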
3 changes: 1 addition & 2 deletions zaza/openstack/charm_tests/ceph/tests.py
@@ -272,8 +272,7 @@ def test_ceph_encryption(self):
source = '/tmp/dmcrypt-keys/*'
zaza_model.scp_from_unit(unit_name=unit_name,
source=source,
- destination=tempdir,
- scp_opts='-p')
+ destination=tempdir)
for elt in listdir(tempdir):
file_path = '/'.join([tempdir, elt])
if path.isfile(file_path):
35 changes: 35 additions & 0 deletions zaza/openstack/charm_tests/manila/tests.py
@@ -231,6 +231,35 @@ def _mount_share_on_instance(self, instance_ip, ssh_user_name,
command=ssh_cmd,
verify=verify_status)

def _umount_share_on_instance(self, instance_ip, ssh_user_name,
ssh_private_key, share_path):
"""Umount a share from a Nova instance.

The mount command is executed via SSH.

:param instance_ip: IP of the Nova instance.
:type instance_ip: string
:param ssh_user_name: SSH user name.
:type ssh_user_name: string
:param ssh_private_key: SSH private key.
:type ssh_private_key: string
:param share_path: share network path.
:type share_path: string
"""
ssh_cmd = 'sudo umount {mount_dir}'.format(mount_dir=self.mount_dir)

for attempt in tenacity.Retrying(
stop=tenacity.stop_after_attempt(5),
wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
with attempt:
openstack_utils.ssh_command(
vm_name="instance-{}".format(instance_ip),
ip=instance_ip,
username=ssh_user_name,
privkey=ssh_private_key,
command=ssh_cmd,
verify=verify_status)

@tenacity.retry(
stop=tenacity.stop_after_attempt(5),
wait=tenacity.wait_exponential(multiplier=3, min=2, max=10))
Expand Down Expand Up @@ -403,3 +432,9 @@ def test_manila_share(self):
fip_2, ssh_user_name, privkey, share_path)
self._validate_testing_file_from_instance(
fip_2, ssh_user_name, privkey)

# now unmount the share on each instance so it can be cleaned up.
self._umount_share_on_instance(
fip_1, ssh_user_name, privkey, share_path)
self._umount_share_on_instance(
fip_2, ssh_user_name, privkey, share_path)
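The new unmount helper above retries its SSH command, presumably because the mount can still be busy right after the test uses it. A minimal sketch of the same tenacity per-attempt pattern in isolation (run_cmd is a stand-in for the real openstack_utils.ssh_command call):

import tenacity

def umount_with_retries(run_cmd, mount_dir):
    # Each attempt re-runs the command; a failure inside the "with attempt"
    # block triggers the next, exponentially backed-off, attempt.
    for attempt in tenacity.Retrying(
            stop=tenacity.stop_after_attempt(5),
            wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
        with attempt:
            run_cmd('sudo umount {}'.format(mount_dir))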
173 changes: 170 additions & 3 deletions zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -16,13 +16,15 @@

"""Encapsulate Manila Ganesha testing."""

import json
import logging
import tenacity

from zaza.openstack.charm_tests.manila_ganesha.setup import (
MANILA_GANESHA_TYPE_NAME,
)

from zaza import sync_wrapper
import zaza.openstack.utilities.generic as generic_utils
import zaza.openstack.charm_tests.manila.tests as manila_tests
import zaza.model
@@ -54,29 +56,103 @@ def _restart_share_instance(self):
self.model_name,
ganeshas))
for ganesha in ganeshas:
- ganesha_unit = zaza.model.get_units(ganesha)[0]
+ units = zaza.model.get_units(ganesha)
+ ganesha_unit = units[0]
hacluster_unit = zaza_utils_juju.get_subordinate_units(
[ganesha_unit.entity_id],
charm_name='hacluster')
logging.info('Ganesha in hacluster mode: {}'.format(
bool(hacluster_unit)))

- for unit in zaza.model.get_units(ganesha):
+ for unit in units:
if hacluster_unit:
# While we really only need to run this on the machine
# hosting nfs-ganesha and manila-share, running it
# everywhere isn't harmful. Pacemaker handles restarting
# the services.
logging.info(
"For %s, running systemctl stop manila-share, "
"kill -HUP pidof ganesha.nfsd", unit.entity_id)
zaza.model.run_on_unit(
unit.entity_id,
"systemctl stop manila-share nfs-ganesha")
"systemctl stop manila-share")
zaza.model.run_on_unit(
unit.entity_id,
'pidof ganesha.nfsd && '
'kill -HUP $(pidof ganesha.nfsd)')
else:
logging.info(
"For %s, running systemctl restart manila-share "
"nfs-ganesha", unit.entity_id)
zaza.model.run_on_unit(
unit.entity_id,
"systemctl restart manila-share nfs-ganesha")

if hacluster_unit:
# now ensure that manila-share is running on at least
# one unit.
unit_names = [unit.entity_id for unit in units]
logging.info(
"Blocking until at least one manila-share is running")
self._block_until_at_least_one_unit_running_services(
unit_names, ['manila-share'])
else:
# block until they are all running.
for unit in units:
zaza.model.block_until_service_status(
unit_name=unit.entity_id,
services=['manila-share'],
target_status='running'
)

return True

@staticmethod
def _block_until_at_least_one_unit_running_services(
units, services, model_name=None, timeout=None):
"""Block until at least one unit is running the provided services.

:param units: List of unit names to check
:type units: List[str]
:param services: List of services that must all be running on a unit
:type services: List[str]
:param model_name: Name of the model to query (optional)
:type model_name: str
:param timeout: Seconds to wait before giving up (optional)
:type timeout: float
"""
async def _check_services():
for unit_name in units:
running_services = {}
for service in services:
command = r"pidof -x '{}'".format(service)
out = await zaza.model.async_run_on_unit(
unit_name,
command,
model_name=model_name,
timeout=timeout)
response_size = len(out['Stdout'].strip())
# response_size == 0 means NOT running.
running_services[service] = (response_size > 0)
states = ', '.join('{}: {}'.format(k, v)
for k, v in
running_services.items())
# Note this blocks the async call, but we don't really care as
# it should only be a short time.
logging.info('For unit {unit}, services: {states}'
.format(unit=unit_name, states=states))
active_services = [
service
for service, running in running_services.items()
if running]
if len(active_services) == len(services):
# all services are running
return True
# No unit has all services running
return False

async def _await_block():
await zaza.model.async_block_until(
_check_services, timeout=timeout)

sync_wrapper(_await_block)()
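A hypothetical call of the helper above, mirroring how the hacluster branch of _restart_share_instance uses it (unit names and timeout are illustrative only):

# Pacemaker may keep manila-share running on only one unit at a time, so the
# check passes as soon as any single unit reports the service.
unit_names = ['manila-ganesha/0', 'manila-ganesha/1', 'manila-ganesha/2']
self._block_until_at_least_one_unit_running_services(
    unit_names, ['manila-share'], timeout=900)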

def _run_nrpe_check_command(self, commands):
try:
zaza.model.get_application("nrpe")
@@ -144,3 +220,94 @@ def test_905_nrpe_custom_service_checks(self):
]

self._run_nrpe_check_command(commands)

def _make_ceph_healthy(self, model_name=None):
"""Force ceph into a healthy status."""
# wait up to 30 seconds (6 x 5s) for ceph to become healthy on its own
healthy, ceph_status = self._wait_for_ceph_fs_healthy(
repeat=6, interval=5, model_name=model_name)
if healthy:
return
logging.info("Ceph is not healthy: %s", ceph_status)
# evict any clients.
self._evict_ceph_mds_clients(model_name)
self._restart_share_instance()
healthy, ceph_status = self._wait_for_ceph_fs_healthy(
repeat=10, interval=15, model_name=model_name)

def _wait_for_ceph_fs_healthy(
self, repeat=30, interval=20, model_name=None):
"""Wait until the ceph health is healthy."""
logging.info("Waiting for ceph to be healthy ...")
try:
for attempt in tenacity.Retrying(
wait=tenacity.wait_fixed(interval),
stop=tenacity.stop_after_attempt(repeat),
reraise=True,
):
logging.info("... checking Ceph")
with attempt:
healthy, ceph_status = self._check_ceph_fs_health(
model_name)
if not healthy:
raise RuntimeError("Ceph was unhealthy: {}"
.format(ceph_status))
except RuntimeError:
# the exception only drives the retry loop; the final state is
# reported below rather than raised.
pass
if healthy:
logging.info("...Ceph is healthy")
else:
logging.info("...Ceph is not healthy %s", ceph_status)
return healthy, ceph_status

@staticmethod
def _check_ceph_fs_health(model_name=None):
"""Check to see if the ceph fs system is healthy."""
cmd_result = zaza.model.run_on_leader(
"ceph-mon",
"sudo ceph status --format=json",
model_name=model_name)
status = json.loads(cmd_result['Stdout'])
ceph_status = status['health']['status']
return (ceph_status == "HEALTH_OK"), ceph_status
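_check_ceph_fs_health only consumes the health.status field of the JSON that `ceph status --format=json` prints. A trimmed, illustrative payload (not captured from a real cluster) showing how it is read:

import json

# Trimmed sample; only health.status is consumed by the helper above.
sample_stdout = '{"fsid": "...", "health": {"status": "HEALTH_WARN", "checks": {}}}'
status = json.loads(sample_stdout)
ceph_status = status['health']['status']
print(ceph_status == "HEALTH_OK", ceph_status)   # -> False HEALTH_WARN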

@staticmethod
def _evict_ceph_mds_clients(model_name=None):
"""Evict and ceph mds clients present.

Essentially work around a manila-ganesha deployment bug:
https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
"""
# NOTE: evicting a client adds it to the mds blocklist; this
# shouldn't matter for the ephemeral nature of the test.
# get the list of clients.
cmd_results = zaza.model.run_on_leader(
"ceph-mon", "sudo ceph tell mds.0 client ls",
model_name=model_name)
result = json.loads(cmd_results['Stdout'])
client_ids = [client['id'] for client in result]
logging.info("Evicting clients %s", ", ".join(
str(c) for c in client_ids))
# now evict the clients.
for client in client_ids:
logging.info("Evicting client %s", client)
zaza.model.run_on_leader(
"ceph-mon",
"sudo ceph tell mds.0 client evict id={}".format(client),
model_name=model_name)

def test_manila_share(self):
"""Test that a manila-ganesha share can be accessed on two instances.

This overrides the base manila test by first forcing Ceph into a
healthy state.
"""
# force a restart to clear out any clients that may be hanging around
# due to restarts on manila-ganesha during deployment; this also forces
# an HA manila into a stable state.
self._restart_share_instance()
# Clean out any old clients caused by restarting manila-ganesha shares
# and ganesha.nfsd daemons.
self._make_ceph_healthy()
super().test_manila_share()
1 change: 0 additions & 1 deletion zaza/openstack/utilities/__init__.py
@@ -118,7 +118,6 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0,
'retry_exceptions': retry_exceptions,
'log': _log,
}
_log(f"ObjectRetrierWraps: wrapping {self.__obj}")

def __getattr__(self, name):
"""Get attribute; delegates to wrapped object."""
6 changes: 3 additions & 3 deletions zaza/openstack/utilities/openstack.py
@@ -86,7 +86,7 @@
from zaza.openstack.utilities import (
exceptions,
generic as generic_utils,
- ObjectRetrierWraps,
+ retry_on_connect_failure,
)
import zaza.utilities.networking as network_utils

@@ -385,7 +385,7 @@ def get_nova_session_client(session, version=None):
"""
if not version:
version = 2
- return ObjectRetrierWraps(
+ return retry_on_connect_failure(
novaclient_client.Client(version, session=session))
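The nova client is now wrapped with retry_on_connect_failure rather than the broader ObjectRetrierWraps. Its definition is not shown in this diff; a hypothetical sketch, assuming it builds on ObjectRetrierWraps with a restricted exception list:

from keystoneauth1.exceptions.connection import ConnectFailure
from zaza.openstack.utilities import ObjectRetrierWraps

def retry_on_connect_failure_sketch(client, **kwargs):
    # Hypothetical: retry only keystoneauth connection failures, rather than
    # every exception the broader wrapper would retry.
    kwargs['retry_exceptions'] = [ConnectFailure]
    return ObjectRetrierWraps(client, **kwargs)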


@@ -2565,7 +2565,7 @@ def resource_removed(resource,
msg='resource',
wait_exponential_multiplier=1,
wait_iteration_max_time=60,
- stop_after_attempt=8):
+ stop_after_attempt=30):
"""Wait for an openstack resource to no longer be present.

:param resource: pointer to os resource type, ex: heat_client.stacks
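For scale on the stop_after_attempt bump above: assuming the resource_removed parameters map onto a tenacity wait_exponential(multiplier=1, max=60) back-off between attempts (an assumption about the helper's internals, not shown in this diff), the worst-case wait grows from roughly two minutes to roughly 24 minutes:

def max_wait_seconds(attempts, multiplier=1, max_wait=60):
    # Rough upper bound: sum of capped exponential back-offs between attempts.
    return sum(min(multiplier * 2 ** n, max_wait) for n in range(attempts - 1))

print(max_wait_seconds(8))    # 123  -> old stop_after_attempt=8
print(max_wait_seconds(30))   # 1443 -> new stop_after_attempt=30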