From 363ff1db0faa3be360154a17b9fdc33da6baf535 Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Thu, 18 Jul 2024 11:40:17 +0100
Subject: [PATCH] Modify manila-test to evict mds clients when ceph unhealthy

During manila-ganesha charm deployment sometimes zombie ceph mds clients
will be left attached to ceph fs. This locks the ceph fs service and
breaks the test (can't create a manila-share). The bug is annoying, from
a test gate perspective, but clears itself after 5-10 minutes, and so
isn't (it is currently understood) a production issue.

This patch detects the ceph unhealthy condition and then forcibly evicts
the ceph MDS clients which then allows the test to proceed.

Related-Bug: https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
---
 zaza/openstack/charm_tests/manila/tests.py    | 96 +++++++++++++++----
 .../charm_tests/manila_ganesha/tests.py       | 10 +-
 2 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py
index 262673220..99e45eba6 100644
--- a/zaza/openstack/charm_tests/manila/tests.py
+++ b/zaza/openstack/charm_tests/manila/tests.py
@@ -16,6 +16,7 @@
 
 """Encapsulate Manila testing."""
 
+import json
 import logging
 import tenacity
 
@@ -352,21 +353,81 @@ def _restart_share_instance(self):
         """
         return False
 
-    def _wait_for_ceph_healthy(self):
+    def _make_ceph_healthy(self, model_name=None):
+        """Force ceph into a healthy status."""
+        # wait up to 30 seconds for ceph to become healthy
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=6, interval=5, model_name=model_name)
+        if healthy:
+            return
+        logging.info("Ceph is not healthy: %s", ceph_status)
+        # evict any clients.
+        self._evict_ceph_mds_clients(model_name)
+        self._restart_share_instance()
+        healthy, ceph_status = self._wait_for_ceph_fs_healthy(
+            repeat=10, interval=15, model_name=model_name)
+
+    def _wait_for_ceph_fs_healthy(
+            self, repeat=30, interval=20, model_name=None):
         """Wait until the ceph health is healthy."""
-        logging.info("Waiting for ceph to be healthy")
-        for attempt in tenacity.Retrying(
-            wait=tenacity.wait_fixed(5),
-            stop=tenacity.stop_after_attempt(10),
-            reraise=True
-        ):
-            logging.info("... testing Ceph")
-            with attempt:
-                self.assertEqual(
-                    zaza.model.run_on_leader(
-                        "ceph-mon", "sudo ceph health")["Code"],
-                    "0")
-        logging.info("...Ceph is healthy")
+        logging.info("Waiting for ceph to be healthy - up to %s tries", repeat)
+        try:
+            for attempt in tenacity.Retrying(
+                wait=tenacity.wait_fixed(interval),
+                stop=tenacity.stop_after_attempt(repeat),
+                reraise=True,
+            ):
+                logging.info("... checking Ceph")
+                with attempt:
+                    healthy, ceph_status = self._check_ceph_fs_health(
+                        model_name)
+                    if not healthy:
+                        raise RuntimeError("Ceph was unhealthy: {}"
+                                           .format(ceph_status))
+        except RuntimeError:
+            # we are only retrying for the retries, not to raise an exception.
+            pass
+        if healthy:
+            logging.info("...Ceph is healthy")
+        else:
+            logging.info("...Ceph is not healthy %s", ceph_status)
+        return healthy, ceph_status
+
+    @staticmethod
+    def _check_ceph_fs_health(model_name=None):
+        """Check to see if the ceph fs system is healthy."""
+        cmd_result = zaza.model.run_on_leader(
+            "ceph-mon",
+            "sudo ceph status --format=json",
+            model_name=model_name)
+        status = json.loads(cmd_result['Stdout'])
+        ceph_status = status['health']['status']
+        return (ceph_status == "HEALTH_OK"), ceph_status
+
+    @staticmethod
+    def _evict_ceph_mds_clients(model_name=None):
+        """Evict any ceph mds clients present.
+
+        Essentially work around a manila-ganesha deployment bug:
+        https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
+        """
+        # NOTE:evicting a client adds them to the mds blocklist; this shouldn't
+        # matter for the ephemeral nature of the test.
+        # get the list of clients.
+        cmd_results = zaza.model.run_on_leader(
+            "ceph-mon", "sudo ceph tell mds.0 client ls",
+            model_name=model_name)
+        result = json.loads(cmd_results['Stdout'])
+        client_ids = [client['id'] for client in result]
+        logging.info("Evicting clients %s", ", ".join(
+            str(c) for c in client_ids))
+        # now evict the clients.
+        for client in client_ids:
+            logging.info("Evicting client %s", client)
+            zaza.model.run_on_leader(
+                "ceph-mon",
+                "sudo ceph tell mds.0 client evict id={}".format(client),
+                model_name=model_name)
 
     def test_manila_share(self):
         """Test that a Manila share can be accessed on two instances.
@@ -392,9 +453,12 @@ def test_manila_share(self):
         fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]
 
         # force a restart to clear out any clients that may be hanging around
-        # due to restarts on manila-ganesha during deployment.
+        # due to restarts on manila-ganesha during deployment; this also forces
+        # an HA manila into a stable state.
         self._restart_share_instance()
-        self._wait_for_ceph_healthy()
+        # Clean out any old clients caused by restarting manila-ganesha shares
+        # and ganesha.nfsd daemons.
+        self._make_ceph_healthy()
         # Create a share
         share = self.manila_client.shares.create(
             share_type=self.share_type_name,
diff --git a/zaza/openstack/charm_tests/manila_ganesha/tests.py b/zaza/openstack/charm_tests/manila_ganesha/tests.py
index d5c934a6b..6a028173c 100644
--- a/zaza/openstack/charm_tests/manila_ganesha/tests.py
+++ b/zaza/openstack/charm_tests/manila_ganesha/tests.py
@@ -70,11 +70,15 @@ def _restart_share_instance(self):
                     # everywhere isn't harmful. Pacemaker handles restarting
                     # the services
                     logging.info(
-                        "For %s, running systemctl stop manila-share "
-                        "nfs-ganesha", unit.entity_id)
+                        "For %s, running systemctl stop manila-share, "
+                        "kill -HUP pidof ganesha.nfsd", unit.entity_id)
+                    zaza.model.run_on_unit(
+                        unit.entity_id,
+                        "systemctl stop manila-share")
                     zaza.model.run_on_unit(
                         unit.entity_id,
-                        "systemctl stop manila-share nfs-ganesha")
+                        'pidof ganesha.nfsd && '
+                        'kill -HUP $(pidof ganesha.nfsd)')
                 else:
                     logging.info(
                         "For %s, running systemctl restart manila-share "