From 3873dd56266f006ae928b2e4d4c129aad27270ae Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Tue, 16 Jul 2024 13:07:23 +0100
Subject: [PATCH 1/4] Improvements to further stabilise the manila-ganesha tests

This improves the manila-ganesha tests by a) checking that ceph is
stable/healthy, and b) ensuring that, after the restart of
manila-ganesha, it is stable/healthy.
---
 .../charm_tests/ceilometer_agent/tests.py     | 11 ++-
 zaza/openstack/charm_tests/manila/tests.py    | 56 ++++++++++++++
 .../charm_tests/manila_ganesha/tests.py       | 75 ++++++++++++++++++-
 zaza/openstack/utilities/__init__.py          |  1 -
 zaza/openstack/utilities/openstack.py         | 10 ++-
 5 files changed, 144 insertions(+), 9 deletions(-)

diff --git a/zaza/openstack/charm_tests/ceilometer_agent/tests.py b/zaza/openstack/charm_tests/ceilometer_agent/tests.py
index cdb196751..572d92032 100644
--- a/zaza/openstack/charm_tests/ceilometer_agent/tests.py
+++ b/zaza/openstack/charm_tests/ceilometer_agent/tests.py
@@ -69,11 +69,13 @@ def test_400_gnocchi_metrics(self):
         expected_metric_names = self.__get_expected_metric_names(
             current_os_release)
+        logging.info("Expected metric names: %s",
+                     ', '.join(sorted(expected_metric_names)))
 
         min_timeout_seconds = 500
-        polling_interval_seconds = (
+        polling_interval_seconds = int(
             openstack_utils.get_application_config_option(
-                self.application_name, 'polling-interval'))
+                self.application_name, 'polling-interval') or 30)
         timeout_seconds = max(10 * polling_interval_seconds,
                               min_timeout_seconds)
         logging.info('Giving ceilometer-agent {}s to publish all metrics to '
@@ -81,12 +83,17 @@ def test_400_gnocchi_metrics(self):
 
         max_time = time.time() + timeout_seconds
         while time.time() < max_time:
+            logging.info("... Looking:")
             found_metric_names = {metric['name']
                                   for metric in gnocchi.metric.list()}
+            logging.info("... found metric names: %s",
+                         ', '.join(sorted(found_metric_names)))
             missing_metric_names = expected_metric_names - found_metric_names
             if len(missing_metric_names) == 0:
                 logging.info('All expected metrics found.')
                 break
+            logging.info("... still missing: %s",
+                         ', '.join(sorted(missing_metric_names)))
             time.sleep(polling_interval_seconds)
 
         unexpected_found_metric_names = (
diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 3a3f54c00..d721ba5b9 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -231,6 +231,35 @@ def _mount_share_on_instance(self, instance_ip, ssh_user_name,
                 command=ssh_cmd,
                 verify=verify_status)
 
+    def _umount_share_on_instance(self, instance_ip, ssh_user_name,
+                                  ssh_private_key, share_path):
+        """Unmount a share from a Nova instance.
+
+        The umount command is executed via SSH.
+
+        :param instance_ip: IP of the Nova instance.
+        :type instance_ip: string
+        :param ssh_user_name: SSH user name.
+        :type ssh_user_name: string
+        :param ssh_private_key: SSH private key.
+        :type ssh_private_key: string
+        :param share_path: share network path.
+        :type share_path: string
+        """
+        ssh_cmd = 'sudo umount {mount_dir}'.format(mount_dir=self.mount_dir)
+
+        for attempt in tenacity.Retrying(
+                stop=tenacity.stop_after_attempt(5),
+                wait=tenacity.wait_exponential(multiplier=3, min=2, max=10)):
+            with attempt:
+                openstack_utils.ssh_command(
+                    vm_name="instance-{}".format(instance_ip),
+                    ip=instance_ip,
+                    username=ssh_user_name,
+                    privkey=ssh_private_key,
+                    command=ssh_cmd,
+                    verify=verify_status)
+
     @tenacity.retry(
         stop=tenacity.stop_after_attempt(5),
         wait=tenacity.wait_exponential(multiplier=3, min=2, max=10))
@@ -323,6 +352,23 @@ def _restart_share_instance(self):
         """
         return False
 
+    def _wait_for_ceph_healthy(self):
+        """Wait until the ceph health is healthy"""
+        logging.info("Waiting for ceph to be healthy")
+        for attempt in tenacity.Retrying(
+            wait=tenacity.wait_fixed(5),
+            stop=tenacity.stop_after_attempt(10),
+            reraise=True
+        ):
+            logging.info("... testing Ceph")
+            with attempt:
+                self.assertEqual(
+                    zaza.model.run_on_leader(
+                        "ceph-mon", "sudo ceph health")["Code"],
+                    "0"
+                )
+        logging.info("...Ceph is healthy")
+
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -346,6 +392,10 @@ def test_manila_share(self):
         fip_1 = neutron_tests.floating_ips_from_instance(instance_1)[0]
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
+        # force a restart to clear out any clients that may be hanging around
+        # due to restarts on manila-ganesha during deployment.
+        self._restart_share_instance()
+        self._wait_for_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
@@ -403,3 +453,9 @@ def test_manila_share(self):
             fip_2, ssh_user_name, privkey, share_path)
         self._validate_testing_file_from_instance(
             fip_2, ssh_user_name, privkey)
+
+        # now umount the share on each instance to allow cleaning up.
+        self._umount_share_on_instance(
+            fip_1, ssh_user_name, privkey, share_path)
+        self._umount_share_on_instance(
+            fip_2, ssh_user_name, privkey, share_path)
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index 158fb1ab6..d5c934a6b 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -23,6 +23,7 @@
     MANILA_GANESHA_TYPE_NAME,
 )
 
+from zaza import sync_wrapper
 import zaza.openstack.utilities.generic as generic_utils
 import zaza.openstack.charm_tests.manila.tests as manila_tests
 import zaza.model
@@ -54,29 +55,99 @@ def _restart_share_instance(self):
             self.model_name, ganeshas))
 
         for ganesha in ganeshas:
-            ganesha_unit = zaza.model.get_units(ganesha)[0]
+            units = zaza.model.get_units(ganesha)
+            ganesha_unit = units[0]
 
             hacluster_unit = zaza_utils_juju.get_subordinate_units(
                 [ganesha_unit.entity_id], charm_name='hacluster')
             logging.info('Ganesha in hacluster mode: {}'.format(
                 bool(hacluster_unit)))
 
-            for unit in zaza.model.get_units(ganesha):
+            for unit in units:
                 if hacluster_unit:
                     # While we really only need to run this on the machine
                     # hosting nfs-ganesha and manila-share, running it
                     # everywhere isn't harmful. Pacemaker handles restarting
                    # the services.
+                    logging.info(
+                        "For %s, running systemctl stop manila-share "
+                        "nfs-ganesha", unit.entity_id)
                     zaza.model.run_on_unit(
                         unit.entity_id,
                         "systemctl stop manila-share nfs-ganesha")
                 else:
+                    logging.info(
+                        "For %s, running systemctl restart manila-share "
+                        "nfs-ganesha", unit.entity_id)
                     zaza.model.run_on_unit(
                         unit.entity_id,
                         "systemctl restart manila-share nfs-ganesha")
 
+            if hacluster_unit:
+                # now ensure that manila-share is running on at least one
+                # unit.
+                unit_names = [unit.entity_id for unit in units]
+                logging.info(
+                    "Blocking until at least one manila-share is running")
+                self._block_until_at_least_one_unit_running_services(
+                    unit_names, ['manila-share'])
+            else:
+                # block until they are all running.
+                for unit in units:
+                    zaza.model.block_until_service_status(
+                        unit_name=unit.entity_id,
+                        services=['manila-share'],
+                        target_status='running'
+                    )
+
         return True
 
+    @staticmethod
+    def _block_until_at_least_one_unit_running_services(
+            units, services, model_name=None, timeout=None):
+        """Block until at least one unit is running the provided services.
+
+        :param units: List of names of the units to check
+        :type units: List[str]
+        :param services: List of services to check for
+        :type services: List[str]
+        """
+        async def _check_services():
+            for unit_name in units:
+                running_services = {}
+                for service in services:
+                    command = r"pidof -x '{}'".format(service)
+                    out = await zaza.model.async_run_on_unit(
+                        unit_name,
+                        command,
+                        model_name=model_name,
+                        timeout=timeout)
+                    response_size = len(out['Stdout'].strip())
+                    # response_size == 0 means the service is NOT running.
+                    running_services[service] = (response_size > 0)
+                states = ', '.join('{}: {}'.format(k, v)
+                                   for k, v in
+                                   running_services.items())
+                # Note this blocks the async call, but we don't really care as
+                # it should only be a short time.
+ logging.info('For unit {unit}, services: {states}' + .format(unit=unit_name, states=states)) + active_services = [ + service + for service, running in running_services.items() + if running] + if len(active_services) == len(services): + # all services are running + return True + # No unit has all services running + return False + + async def _await_block(): + await zaza.model.async_block_until( + _check_services, timeout=timeout) + + sync_wrapper(_await_block)() + def _run_nrpe_check_command(self, commands): try: zaza.model.get_application("nrpe") diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py index 02826ff7f..5119e0b7a 100644 --- a/zaza/openstack/utilities/__init__.py +++ b/zaza/openstack/utilities/__init__.py @@ -118,7 +118,6 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0, 'retry_exceptions': retry_exceptions, 'log': _log, } - _log(f"ObjectRetrierWraps: wrapping {self.__obj}") def __getattr__(self, name): """Get attribute; delegates to wrapped object.""" diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py index e87f8547d..44df90208 100644 --- a/zaza/openstack/utilities/openstack.py +++ b/zaza/openstack/utilities/openstack.py @@ -86,7 +86,7 @@ from zaza.openstack.utilities import ( exceptions, generic as generic_utils, - ObjectRetrierWraps, + retry_on_connect_failure, ) import zaza.utilities.networking as network_utils @@ -385,7 +385,7 @@ def get_nova_session_client(session, version=None): """ if not version: version = 2 - return ObjectRetrierWraps( + return retry_on_connect_failure( novaclient_client.Client(version, session=session)) @@ -2323,7 +2323,9 @@ def get_remote_ca_cert_file(application, model_name=None): model.scp_from_unit( unit, cert_file, - _tmp_ca.name) + _tmp_ca.name, + scp_opts='-q', + ) except JujuError: continue # ensure that the path to put the local cacert in actually exists. @@ -2565,7 +2567,7 @@ def resource_removed(resource, msg='resource', wait_exponential_multiplier=1, wait_iteration_max_time=60, - stop_after_attempt=8): + stop_after_attempt=30): """Wait for an openstack resource to no longer be present. :param resource: pointer to os resource type, ex: heat_client.stacks From 391136151c062c741545419d2be744b87ef24f40 Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Tue, 16 Jul 2024 14:11:06 +0100 Subject: [PATCH 2/4] Fix pep8 and remove non-working scp_opts The scp_opts don't seem to be able to quiet the scp: error for the certificate. 
---
 zaza/openstack/charm_tests/ceph/tests.py   | 3 +--
 zaza/openstack/charm_tests/manila/tests.py | 5 ++---
 zaza/openstack/utilities/openstack.py      | 4 +---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/zaza/openstack/charm_tests/ceph/tests.py b/zaza/openstack/charm_tests/ceph/tests.py
index 85de53dcf..d485fe871 100644
--- a/zaza/openstack/charm_tests/ceph/tests.py
+++ b/zaza/openstack/charm_tests/ceph/tests.py
@@ -272,8 +272,7 @@ def test_ceph_encryption(self):
         source = '/tmp/dmcrypt-keys/*'
         zaza_model.scp_from_unit(unit_name=unit_name,
                                  source=source,
-                                 destination=tempdir,
-                                 scp_opts='-p')
+                                 destination=tempdir)
         for elt in listdir(tempdir):
             file_path = '/'.join([tempdir, elt])
             if path.isfile(file_path):
diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index d721ba5b9..262673220 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -353,7 +353,7 @@ def _restart_share_instance(self):
         return False
 
     def _wait_for_ceph_healthy(self):
-        """Wait until the ceph health is healthy"""
+        """Wait until the ceph health is healthy."""
         logging.info("Waiting for ceph to be healthy")
         for attempt in tenacity.Retrying(
             wait=tenacity.wait_fixed(5),
@@ -365,8 +365,7 @@ def _wait_for_ceph_healthy(self):
                 self.assertEqual(
                     zaza.model.run_on_leader(
                         "ceph-mon", "sudo ceph health")["Code"],
-                    "0"
-                )
+                    "0")
         logging.info("...Ceph is healthy")
 
     def test_manila_share(self):
diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py
index 44df90208..5babdb9f0 100644
--- a/zaza/openstack/utilities/openstack.py
+++ b/zaza/openstack/utilities/openstack.py
@@ -2323,9 +2323,7 @@ def get_remote_ca_cert_file(application, model_name=None):
             model.scp_from_unit(
                 unit,
                 cert_file,
-                _tmp_ca.name,
-                scp_opts='-q',
-            )
+                _tmp_ca.name)
         except JujuError:
             continue
         # ensure that the path to put the local cacert in actually exists.

From 363ff1db0faa3be360154a17b9fdc33da6baf535 Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Thu, 18 Jul 2024 11:40:17 +0100
Subject: [PATCH 3/4] Modify manila tests to evict mds clients when ceph is
 unhealthy

During manila-ganesha charm deployment, zombie ceph mds clients are
sometimes left attached to ceph fs. This locks the ceph fs service and
breaks the test (a manila share can't be created). The bug is annoying
from a test-gate perspective, but it clears itself after 5-10 minutes
and so is not (as currently understood) a production issue.

This patch detects the unhealthy ceph condition and then forcibly
evicts the ceph MDS clients, which allows the test to proceed.
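
For illustration only (not part of the change itself), the eviction
boils down to two ceph-mon commands driven through zaza; a minimal
sketch, assuming a ceph-mon leader unit and the zaza.model API used
elsewhere in this series:

    import json
    import zaza.model

    def evict_mds_clients(model_name=None):
        # ask mds rank 0 for its current client sessions
        out = zaza.model.run_on_leader(
            "ceph-mon", "sudo ceph tell mds.0 client ls",
            model_name=model_name)
        for client in json.loads(out['Stdout']):
            # evicting adds the client to the mds blocklist, which is
            # acceptable for an ephemeral test deployment
            zaza.model.run_on_leader(
                "ceph-mon",
                "sudo ceph tell mds.0 client evict id={}".format(
                    client['id']),
                model_name=model_name)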
Related-Bug: https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
---
 zaza/openstack/charm_tests/manila/tests.py    | 96 +++++++++++++++----
 .../charm_tests/manila_ganesha/tests.py       | 10 +-
 2 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 262673220..99e45eba6 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -16,6 +16,7 @@
 
 """Encapsulate Manila testing."""
 
+import json
 import logging
 
 import tenacity
@@ -352,21 +353,81 @@ def _restart_share_instance(self):
         """
         return False
 
-    def _wait_for_ceph_healthy(self):
+    def _make_ceph_healthy(self, model_name=None):
+        """Force ceph into a healthy status."""
+        # wait for 30 seconds for ceph to get healthy
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=6, interval=5, model_name=model_name)
+        if healthy:
+            return
+        logging.info("Ceph is not healthy: %s", ceph_status)
+        # evict any clients.
+        self._evict_ceph_mds_clients(model_name)
+        self._restart_share_instance()
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=10, interval=15, model_name=model_name)
+
+    def _wait_for_ceph_fs_healthy(
+            self, repeat=30, interval=20, model_name=None):
         """Wait until the ceph health is healthy."""
-        logging.info("Waiting for ceph to be healthy")
-        for attempt in tenacity.Retrying(
-            wait=tenacity.wait_fixed(5),
-            stop=tenacity.stop_after_attempt(10),
-            reraise=True
-        ):
-            logging.info("... testing Ceph")
-            with attempt:
-                self.assertEqual(
-                    zaza.model.run_on_leader(
-                        "ceph-mon", "sudo ceph health")["Code"],
-                    "0")
-        logging.info("...Ceph is healthy")
+        logging.info("Waiting for ceph to be healthy - up to 10 minutes")
+        try:
+            for attempt in tenacity.Retrying(
+                wait=tenacity.wait_fixed(interval),
+                stop=tenacity.stop_after_attempt(repeat),
+                reraise=True,
+            ):
+                logging.info("... checking Ceph")
+                with attempt:
+                    healthy, ceph_status = self._check_ceph_fs_health(
+                        model_name)
+                    if not healthy:
+                        raise RuntimeError("Ceph was unhealthy: {}"
+                                           .format(ceph_status))
+        except RuntimeError:
+            # the RuntimeError just drives the retries; don't re-raise it.
+            pass
+        if healthy:
+            logging.info("...Ceph is healthy")
+        else:
+            logging.info("...Ceph is not healthy %s", ceph_status)
+        return healthy, ceph_status
+
+    @staticmethod
+    def _check_ceph_fs_health(model_name=None):
+        """Check to see if the ceph fs system is healthy."""
+        cmd_result = zaza.model.run_on_leader(
+            "ceph-mon",
+            "sudo ceph status --format=json",
+            model_name=model_name)
+        status = json.loads(cmd_result['Stdout'])
+        ceph_status = status['health']['status']
+        return (ceph_status == "HEALTH_OK"), ceph_status
+
+    @staticmethod
+    def _evict_ceph_mds_clients(model_name=None):
+        """Evict any ceph mds clients present.
+
+        Essentially work around a manila-ganesha deployment bug:
+        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
+        """
+        # NOTE: evicting a client adds it to the mds blocklist; this shouldn't
+        # matter given the ephemeral nature of the test.
+        # get the list of clients.
+        cmd_results = zaza.model.run_on_leader(
+            "ceph-mon", "sudo ceph tell mds.0 client ls",
+            model_name=model_name)
+        result = json.loads(cmd_results['Stdout'])
+        client_ids = [client['id'] for client in result]
+        logging.info("Evicting clients %s", ", ".join(
+            str(c) for c in client_ids))
+        # now evict the clients.
+        for client in client_ids:
+            logging.info("Evicting client %s", client)
+            zaza.model.run_on_leader(
+                "ceph-mon",
+                "sudo ceph tell mds.0 client evict id={}".format(client),
+                model_name=model_name)
 
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -392,9 +453,12 @@ def test_manila_share(self):
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
         # force a restart to clear out any clients that may be hanging around
-        # due to restarts on manila-ganesha during deployment.
+        # due to restarts on manila-ganesha during deployment; this also forces
+        # an HA manila into a stable state.
         self._restart_share_instance()
-        self._wait_for_ceph_healthy()
+        # Clean out any old clients caused by restarting manila-ganesha shares
+        # and ganesha.nfsd daemons.
+        self._make_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index d5c934a6b..6a028173c 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -70,11 +70,15 @@ def _restart_share_instance(self):
                     # everywhere isn't harmful. Pacemaker handles restarting
                     # the services.
                     logging.info(
-                        "For %s, running systemctl stop manila-share "
-                        "nfs-ganesha", unit.entity_id)
+                        "For %s, running systemctl stop manila-share, "
+                        "kill -HUP $(pidof ganesha.nfsd)", unit.entity_id)
+                    zaza.model.run_on_unit(
+                        unit.entity_id,
+                        "systemctl stop manila-share")
                     zaza.model.run_on_unit(
                         unit.entity_id,
-                        "systemctl stop manila-share nfs-ganesha")
+                        'pidof ganesha.nfsd && '
+                        'kill -HUP $(pidof ganesha.nfsd)')
                 else:
                     logging.info(
                         "For %s, running systemctl restart manila-share "

From af56e9e5be1960c89df993268e047bac77df2fe6 Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Thu, 18 Jul 2024 13:39:21 +0100
Subject: [PATCH 4/4] Move ceph eviction code from manila to manila-ganesha

This is so that manila-only tests (i.e. without ceph) will not try to
clean up non-existent ceph units.
---
 zaza/openstack/charm_tests/manila/tests.py    | 84 -----------------
 .../charm_tests/manila_ganesha/tests.py       | 92 +++++++++++++++++++
 2 files changed, 92 insertions(+), 84 deletions(-)

diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 99e45eba6..87e06a7ce 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -16,7 +16,6 @@
 
 """Encapsulate Manila testing."""
 
-import json
 import logging
 
 import tenacity
@@ -353,82 +352,6 @@ def _restart_share_instance(self):
         """
         return False
 
-    def _make_ceph_healthy(self, model_name=None):
-        """Force ceph into a healthy status."""
-        # wait for 30 seconds for ceph to get healthy
-        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
-            repeat=6, interval=5, model_name=model_name)
-        if healthy:
-            return
-        logging.info("Ceph is not healthy: %s", ceph_status)
-        # evict any clients.
-        self._evict_ceph_mds_clients(model_name)
-        self._restart_share_instance()
-        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
-            repeat=10, interval=15, model_name=model_name)
-
-    def _wait_for_ceph_fs_healthy(
-            self, repeat=30, interval=20, model_name=None):
-        """Wait until the ceph health is healthy."""
-        logging.info("Waiting for ceph to be healthy - up to 10 minutes")
-        try:
-            for attempt in tenacity.Retrying(
-                wait=tenacity.wait_fixed(interval),
-                stop=tenacity.stop_after_attempt(repeat),
-                reraise=True,
-            ):
-                logging.info("... checking Ceph")
-                with attempt:
-                    healthy, ceph_status = self._check_ceph_fs_health(
-                        model_name)
-                    if not healthy:
-                        raise RuntimeError("Ceph was unhealthy: {}"
-                                           .format(ceph_status))
-        except RuntimeError:
-            # the RuntimeError just drives the retries; don't re-raise it.
-            pass
-        if healthy:
-            logging.info("...Ceph is healthy")
-        else:
-            logging.info("...Ceph is not healthy %s", ceph_status)
-        return healthy, ceph_status
-
-    @staticmethod
-    def _check_ceph_fs_health(model_name=None):
-        """Check to see if the ceph fs system is healthy."""
-        cmd_result = zaza.model.run_on_leader(
-            "ceph-mon",
-            "sudo ceph status --format=json",
-            model_name=model_name)
-        status = json.loads(cmd_result['Stdout'])
-        ceph_status = status['health']['status']
-        return (ceph_status == "HEALTH_OK"), ceph_status
-
-    @staticmethod
-    def _evict_ceph_mds_clients(model_name=None):
-        """Evict any ceph mds clients present.
-
-        Essentially work around a manila-ganesha deployment bug:
-        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
-        """
-        # NOTE: evicting a client adds it to the mds blocklist; this shouldn't
-        # matter given the ephemeral nature of the test.
-        # get the list of clients.
-        cmd_results = zaza.model.run_on_leader(
-            "ceph-mon", "sudo ceph tell mds.0 client ls",
-            model_name=model_name)
-        result = json.loads(cmd_results['Stdout'])
-        client_ids = [client['id'] for client in result]
-        logging.info("Evicting clients %s", ", ".join(
-            str(c) for c in client_ids))
-        # now evict the clients.
-        for client in client_ids:
-            logging.info("Evicting client %s", client)
-            zaza.model.run_on_leader(
-                "ceph-mon",
-                "sudo ceph tell mds.0 client evict id={}".format(client),
-                model_name=model_name)
-
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
 
@@ -452,13 +375,6 @@ def test_manila_share(self):
         fip_1 = neutron_tests.floating_ips_from_instance(instance_1)[0]
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
-        # force a restart to clear out any clients that may be hanging around
-        # due to restarts on manila-ganesha during deployment; this also forces
-        # an HA manila into a stable state.
-        self._restart_share_instance()
-        # Clean out any old clients caused by restarting manila-ganesha shares
-        # and ganesha.nfsd daemons.
-        self._make_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index 6a028173c..0798a6f70 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -16,6 +16,7 @@
 
 """Encapsulate Manila Ganesha testing."""
 
+import json
 import logging
 
 import tenacity
@@ -219,3 +220,94 @@ def test_905_nrpe_custom_service_checks(self):
         ]
 
         self._run_nrpe_check_command(commands)
+
+    def _make_ceph_healthy(self, model_name=None):
+        """Force ceph into a healthy status."""
+        # wait for 30 seconds for ceph to get healthy
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=6, interval=5, model_name=model_name)
+        if healthy:
+            return
+        logging.info("Ceph is not healthy: %s", ceph_status)
+        # evict any clients.
+        self._evict_ceph_mds_clients(model_name)
+        self._restart_share_instance()
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=10, interval=15, model_name=model_name)
+
+    def _wait_for_ceph_fs_healthy(
+            self, repeat=30, interval=20, model_name=None):
+        """Wait until the ceph health is healthy."""
+        logging.info("Waiting for ceph to be healthy ...")
+        try:
+            for attempt in tenacity.Retrying(
+                wait=tenacity.wait_fixed(interval),
+                stop=tenacity.stop_after_attempt(repeat),
+                reraise=True,
+            ):
+                logging.info("... checking Ceph")
+                with attempt:
+                    healthy, ceph_status = self._check_ceph_fs_health(
+                        model_name)
+                    if not healthy:
+                        raise RuntimeError("Ceph was unhealthy: {}"
+                                           .format(ceph_status))
+        except RuntimeError:
+            # the RuntimeError just drives the retries; don't re-raise it.
+            pass
+        if healthy:
+            logging.info("...Ceph is healthy")
+        else:
+            logging.info("...Ceph is not healthy %s", ceph_status)
+        return healthy, ceph_status
+
+    @staticmethod
+    def _check_ceph_fs_health(model_name=None):
+        """Check to see if the ceph fs system is healthy."""
+        cmd_result = zaza.model.run_on_leader(
+            "ceph-mon",
+            "sudo ceph status --format=json",
+            model_name=model_name)
+        status = json.loads(cmd_result['Stdout'])
+        ceph_status = status['health']['status']
+        return (ceph_status == "HEALTH_OK"), ceph_status
+
+    @staticmethod
+    def _evict_ceph_mds_clients(model_name=None):
+        """Evict any ceph mds clients present.
+
+        Essentially work around a manila-ganesha deployment bug:
+        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
+        """
+        # NOTE: evicting a client adds it to the mds blocklist; this shouldn't
+        # matter given the ephemeral nature of the test.
+        # get the list of clients.
+        cmd_results = zaza.model.run_on_leader(
+            "ceph-mon", "sudo ceph tell mds.0 client ls",
+            model_name=model_name)
+        result = json.loads(cmd_results['Stdout'])
+        client_ids = [client['id'] for client in result]
+        logging.info("Evicting clients %s", ", ".join(
+            str(c) for c in client_ids))
+        # now evict the clients.
+        for client in client_ids:
+            logging.info("Evicting client %s", client)
+            zaza.model.run_on_leader(
+                "ceph-mon",
+                "sudo ceph tell mds.0 client evict id={}".format(client),
+                model_name=model_name)
+
+    def test_manila_share(self):
+        """Test that a manila-ganesha share can be accessed on two instances.
+
+        This overrides the base manila test by prefixing a make-ceph-healthy
+        stage.
+        """
+        # force a restart to clear out any clients that may be hanging around
+        # due to restarts on manila-ganesha during deployment; this also forces
+        # an HA manila into a stable state.
+        self._restart_share_instance()
+        # Clean out any old clients caused by restarting manila-ganesha shares
+        # and ganesha.nfsd daemons.
+        self._make_ceph_healthy()
+        super().test_manila_share()
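
A closing note on the retry idiom used throughout this series: tenacity's
Retrying iterator yields per-attempt context managers, an exception raised
inside `with attempt:` marks that attempt as failed and schedules the next
retry, and reraise=True re-raises the final attempt's exception instead of
wrapping it in tenacity.RetryError. A minimal self-contained sketch of the
wait-until-healthy pattern (the `check` callable here is hypothetical):

    import tenacity

    def wait_until(check, repeat=30, interval=20):
        # Poll `check` until it returns True, or until `repeat` attempts
        # spaced `interval` seconds apart have been made.
        try:
            for attempt in tenacity.Retrying(
                    wait=tenacity.wait_fixed(interval),
                    stop=tenacity.stop_after_attempt(repeat),
                    reraise=True):
                with attempt:
                    if not check():
                        # raising marks this attempt as failed -> retry
                        raise RuntimeError("not ready yet")
        except RuntimeError:
            return False
        return True

For example, wait_until(lambda: False, repeat=2, interval=1) returns False
after roughly one second, mirroring how _wait_for_ceph_fs_healthy swallows
the final RuntimeError and reports the last observed status instead.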