From 3873dd56266f006ae928b2e4d4c129aad27270ae Mon Sep 17 00:00:00 2001
From: Alex Kavanagh <alex.kavanagh@canonical.com>
Date: Tue, 16 Jul 2024 13:07:23 +0100
Subject: [PATCH] Improvements to further stabilise the manila-ganesha tests

This improves the manila-ganesha tests by a) checking that ceph is
stable/healthy, and b) ensuring that manila-ganesha is stable/healthy
after it has been restarted.
---
 .../charm_tests/ceilometer_agent/tests.py     | 11 ++-
 zaza/openstack/charm_tests/manila/tests.py    | 56 ++++++++++++++
 .../charm_tests/manila_ganesha/tests.py       | 75 ++++++++++++++++++-
 zaza/openstack/utilities/__init__.py          |  1 -
 zaza/openstack/utilities/openstack.py         | 10 ++-
 5 files changed, 144 insertions(+), 9 deletions(-)

diff --git a/zaza/openstack/charm_tests/ceilometer_agent/tests.py b/zaza/openstack/charm_tests/ceilometer_agent/tests.py
index cdb196751..572d92032 100644
--- a/zaza/openstack/charm_tests/ceilometer_agent/tests.py
+++ b/zaza/openstack/charm_tests/ceilometer_agent/tests.py
@@ -69,11 +69,13 @@ def test_400_gnocchi_metrics(self):
 
         expected_metric_names = self.__get_expected_metric_names(
             current_os_release)
+        logging.info("Expected metric names: %s",
+                     ', '.join(sorted(expected_metric_names)))
 
         min_timeout_seconds = 500
-        polling_interval_seconds = (
+        polling_interval_seconds = int(
             openstack_utils.get_application_config_option(
-                self.application_name, 'polling-interval'))
+                self.application_name, 'polling-interval') or 30)
         timeout_seconds = max(10 * polling_interval_seconds,
                               min_timeout_seconds)
         logging.info('Giving ceilometer-agent {}s to publish all metrics to '
@@ -81,12 +83,17 @@ def test_400_gnocchi_metrics(self):
 
         max_time = time.time() + timeout_seconds
         while time.time() < max_time:
+            logging.info("... Looking:")
             found_metric_names = {metric['name']
                                   for metric in gnocchi.metric.list()}
+            logging.info("... found metric names: %s",
+                         ', '.join(sorted(found_metric_names)))
             missing_metric_names = expected_metric_names - found_metric_names
             if len(missing_metric_names) == 0:
                 logging.info('All expected metrics found.')
                 break
+            logging.info("... still missing: %s",
+                         ', '.join(sorted(missing_metric_names)))
             time.sleep(polling_interval_seconds)
 
         unexpected_found_metric_names = (
diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 3a3f54c00..d721ba5b9 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -231,6 +231,35 @@ def _mount_share_on_instance(self, instance_ip, ssh_user_name,
                     command=ssh_cmd,
                     verify=verify_status)
 
+    def _umount_share_on_instance(self, instance_ip, ssh_user_name,
+                                  ssh_private_key, share_path):
+        """Umount a share from a Nova instance.
+
+        The umount command is executed via SSH, with up to 5 attempts.
+
+        :param instance_ip: IP of the Nova instance.
+        :type instance_ip: string
+        :param ssh_user_name: SSH user name.
+        :type ssh_user_name: string
+        :param ssh_private_key: SSH private key.
+        :type ssh_private_key: string
+        :param share_path: share network path (not used by this method).
+        :type share_path: string
+        """
+        ssh_cmd = 'sudo umount {mount_dir}'.format(mount_dir=self.mount_dir)
+
+        for attempt in tenacity.Retrying(
+                stop=tenacity.stop_after_attempt(5),
+                wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
+            with attempt:
+                openstack_utils.ssh_command(
+                    vm_name="instance-{}".format(instance_ip),
+                    ip=instance_ip,
+                    username=ssh_user_name,
+                    privkey=ssh_private_key,
+                    command=ssh_cmd,
+                    verify=verify_status)
+
     @tenacity.retry(
         stop=tenacity.stop_after_attempt(5),
         wait=tenacity.wait_exponential(multiplier=3, min=2, max=10))
@@ -323,6 +352,23 @@ def _restart_share_instance(self):
         """
         return False
 
+    def _wait_for_ceph_healthy(self):
+        """Wait until 'sudo ceph health' exits with code 0 on ceph-mon."""
+        logging.info("Waiting for ceph to be healthy")
+        for attempt in tenacity.Retrying(
+            wait=tenacity.wait_fixed(5),
+            stop=tenacity.stop_after_attempt(10),
+            reraise=True
+        ):
+            logging.info("... testing Ceph")
+            with attempt:
+                self.assertEqual(
+                    zaza.model.run_on_leader(
+                        "ceph-mon", "sudo ceph health")["Code"],
+                    "0"
+                )
+        logging.info("...Ceph is healthy")
+
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -346,6 +392,10 @@ def test_manila_share(self):
         fip_1 = neutron_tests.floating_ips_from_instance(instance_1)[0]
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
+        # force a restart to clear out any clients that may be hanging around
+        # due to restarts on manila-ganesha during deployment.
+        self._restart_share_instance()
+        self._wait_for_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
@@ -403,3 +453,9 @@ def test_manila_share(self):
                 fip_2, ssh_user_name, privkey, share_path)
             self._validate_testing_file_from_instance(
                 fip_2, ssh_user_name, privkey)
+
+        # now umount the share on each instance to allow cleaning up.
+        self._umount_share_on_instance(
+            fip_1, ssh_user_name, privkey, share_path)
+        self._umount_share_on_instance(
+            fip_2, ssh_user_name, privkey, share_path)
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index 158fb1ab6..d5c934a6b 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -23,6 +23,7 @@
     MANILA_GANESHA_TYPE_NAME,
 )
 
+from zaza import sync_wrapper
 import zaza.openstack.utilities.generic as generic_utils
 import zaza.openstack.charm_tests.manila.tests as manila_tests
 import zaza.model
@@ -54,29 +55,99 @@ def _restart_share_instance(self):
             self.model_name,
             ganeshas))
         for ganesha in ganeshas:
-            ganesha_unit = zaza.model.get_units(ganesha)[0]
+            units = zaza.model.get_units(ganesha)
+            ganesha_unit = units[0]
             hacluster_unit = zaza_utils_juju.get_subordinate_units(
                 [ganesha_unit.entity_id],
                 charm_name='hacluster')
             logging.info('Ganesha in hacluster mode: {}'.format(
                 bool(hacluster_unit)))
 
-            for unit in zaza.model.get_units(ganesha):
+            for unit in units:
                 if hacluster_unit:
                     # While we really only need to run this on the machine
                     # hosting # nfs-ganesha and manila-share, running it
                     # everywhere isn't harmful. Pacemaker handles restarting
                     # the services
+                    logging.info(
+                        "For %s, running systemctl stop manila-share "
+                        "nfs-ganesha", unit.entity_id)
                     zaza.model.run_on_unit(
                         unit.entity_id,
                         "systemctl stop manila-share nfs-ganesha")
                 else:
+                    logging.info(
+                        "For %s, running systemctl restart manila-share "
+                        "nfs-ganesha", unit.entity_id)
                     zaza.model.run_on_unit(
                         unit.entity_id,
                         "systemctl restart manila-share nfs-ganesha")
 
+            if hacluster_unit:
+                # now ensure that manila-share is running again on at least
+                # one unit; nfs-ganesha itself is not checked here.
+                unit_names = [unit.entity_id for unit in units]
+                logging.info(
+                    "Blocking until at least one manila-share is running")
+                self._block_until_at_least_one_unit_running_services(
+                    unit_names, ['manila-share'])
+            else:
+                # block until they are all running.
+                for unit in units:
+                    zaza.model.block_until_service_status(
+                        unit_name=unit.entity_id,
+                        services=['manila-share'],
+                        target_status='running'
+                    )
+
         return True
 
+    @staticmethod
+    def _block_until_at_least_one_unit_running_services(
+            units, services, model_name=None, timeout=None):
+        """Block until at least one unit is running the provided services.
+
+        :param units: List of names of the units to check
+        :type units: List[str]
+        :param services: List of services to check
+        :type services: List[str]
+        :param model_name: Model name passed to each check command
+        :type model_name: Optional[str]
+        :param timeout: Timeout for each check command and the overall wait
+        :type timeout: Optional[float]
+        """
+        async def _check_services():
+            for unit_name in units:
+                running_services = {}
+                for service in services:
+                    # -x makes pidof also match scripts run by an interpreter.
+                    command = r"pidof -x '{}'".format(service)
+                    out = await zaza.model.async_run_on_unit(
+                        unit_name,
+                        command,
+                        model_name=model_name,
+                        timeout=timeout)
+                    # pidof prints nothing when the service is NOT running.
+                    running_services[service] = bool(out['Stdout'].strip())
+                states = ', '.join('{}: {}'.format(k, v)
+                                   for k, v in
+                                   running_services.items())
+                # Note this blocks the async call, but we don't really care as
+                # it should only be a short time.
+                logging.info('For unit {unit}, services: {states}'
+                             .format(unit=unit_name, states=states))
+                if all(running_services.values()):
+                    # all services are running on this unit
+                    return True
+            # No unit has all services running
+            return False
+
+        async def _await_block():
+            await zaza.model.async_block_until(
+                _check_services, timeout=timeout)
+
+        sync_wrapper(_await_block)()
+
     def _run_nrpe_check_command(self, commands):
         try:
             zaza.model.get_application("nrpe")
diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py
index 02826ff7f..5119e0b7a 100644
--- a/zaza/openstack/utilities/__init__.py
+++ b/zaza/openstack/utilities/__init__.py
@@ -118,7 +118,6 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0,
             'retry_exceptions': retry_exceptions,
             'log': _log,
         }
-        _log(f"ObjectRetrierWraps: wrapping {self.__obj}")
 
     def __getattr__(self, name):
         """Get attribute; delegates to wrapped object."""
diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py
index e87f8547d..44df90208 100644
--- a/zaza/openstack/utilities/openstack.py
+++ b/zaza/openstack/utilities/openstack.py
@@ -86,7 +86,7 @@
 from zaza.openstack.utilities import (
     exceptions,
     generic as generic_utils,
-    ObjectRetrierWraps,
+    retry_on_connect_failure,
 )
 import zaza.utilities.networking as network_utils
 
@@ -385,7 +385,7 @@ def get_nova_session_client(session, version=None):
     """
     if not version:
         version = 2
-    return ObjectRetrierWraps(
+    return retry_on_connect_failure(
         novaclient_client.Client(version, session=session))
 
 
@@ -2323,7 +2323,9 @@ def get_remote_ca_cert_file(application, model_name=None):
                 model.scp_from_unit(
                     unit,
                     cert_file,
-                    _tmp_ca.name)
+                    _tmp_ca.name,
+                    scp_opts='-q',
+                )
             except JujuError:
                 continue
             # ensure that the path to put the local cacert in actually exists.
@@ -2565,7 +2567,7 @@ def resource_removed(resource,
                      msg='resource',
                      wait_exponential_multiplier=1,
                      wait_iteration_max_time=60,
-                     stop_after_attempt=8):
+                     stop_after_attempt=30):
     """Wait for an openstack resource to no longer be present.
 
     :param resource: pointer to os resource type, ex: heat_client.stacks