Skip to content

Commit

Permalink
Modify manila-test to evict mds clients when ceph unhealthy
Browse files Browse the repository at this point in the history
During manila-ganesha charm deployment sometimes zombie ceph mds clients
will be left attached to ceph fs.  This locks the ceph fs service and
breaks the test (can't create a manila-share). The bug is annoying, from
a test gate perspective, but clears itself after 5-10 minutes, and so
isn't (as currently understood) a production issue.

This patch detects the ceph unhealthy condition and then forcibly evicts
the ceph MDS clients which then allows the test to proceed.

Related-Bug: https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498
  • Loading branch information
ajkavanagh committed Jul 18, 2024
1 parent 3911361 commit 363ff1d
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 19 deletions.
96 changes: 80 additions & 16 deletions zaza/openstack/charm_tests/manila/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

"""Encapsulate Manila testing."""

import json
import logging
import tenacity

Expand Down Expand Up @@ -352,21 +353,81 @@ def _restart_share_instance(self):
"""
return False

def _make_ceph_healthy(self, model_name=None):
    """Force ceph into a healthy status.

    First give ceph a short grace period (6 x 5s = 30s) to become
    healthy on its own; if it doesn't, evict any stale MDS clients
    (works around lp:2073498), restart the share instance(s), and then
    wait again (10 x 15s) for ceph to report healthy.

    :param model_name: optional Juju model to operate on.
    :type model_name: Optional[str]
    """
    # give ceph up to 30 seconds to become healthy on its own.
    healthy, ceph_status = self._wait_for_ceph_fs_healthy(
        repeat=6, interval=5, model_name=model_name)
    if healthy:
        return
    logging.info("Ceph is not healthy: %s", ceph_status)
    # evict any clients.
    self._evict_ceph_mds_clients(model_name)
    self._restart_share_instance()
    # BUG FIX: forward model_name instead of hard-coding None, so the
    # health wait runs against the model the caller asked for.
    healthy, ceph_status = self._wait_for_ceph_fs_healthy(
        repeat=10, interval=15, model_name=model_name)

def _wait_for_ceph_fs_healthy(
        self, repeat=30, interval=20, model_name=None):
    """Wait until ceph fs reports healthy; never raise on unhealthy.

    Poll ceph's health up to ``repeat`` times, ``interval`` seconds
    apart (defaults allow up to 10 minutes).  An unhealthy ceph is
    reported via the return value rather than an exception so the
    caller can decide how to recover.

    :param repeat: maximum number of health checks to make.
    :type repeat: int
    :param interval: seconds to wait between checks.
    :type interval: int
    :param model_name: optional Juju model to operate on.
    :type model_name: Optional[str]
    :returns: (healthy, ceph status string) pair
    :rtype: Tuple[bool, str]
    """
    logging.info("Waiting for ceph to be healthy - up to 10 minutes")
    # initialise so the post-loop reporting can't hit unbound locals.
    healthy, ceph_status = False, None
    try:
        for attempt in tenacity.Retrying(
            wait=tenacity.wait_fixed(interval),
            stop=tenacity.stop_after_attempt(repeat),
            reraise=True,
        ):
            logging.info("... checking Ceph")
            with attempt:
                healthy, ceph_status = self._check_ceph_fs_health(
                    model_name)
                # raise to make tenacity retry the check.
                if not healthy:
                    raise RuntimeError("Ceph was unhealthy: {}"
                                       .format(ceph_status))
    except RuntimeError:
        # we are only retrying for the retries, not to raise an exception.
        pass
    if healthy:
        logging.info("...Ceph is healthy")
    else:
        logging.info("...Ceph is not healthy %s", ceph_status)
    return healthy, ceph_status

@staticmethod
def _check_ceph_fs_health(model_name=None):
    """Report whether ceph currently claims HEALTH_OK.

    :param model_name: optional Juju model to operate on.
    :type model_name: Optional[str]
    :returns: (healthy, raw ceph health status string) pair
    :rtype: Tuple[bool, str]
    """
    output = zaza.model.run_on_leader(
        "ceph-mon",
        "sudo ceph status --format=json",
        model_name=model_name)
    # ceph reports e.g. HEALTH_OK / HEALTH_WARN / HEALTH_ERR here.
    health = json.loads(output['Stdout'])['health']['status']
    return health == "HEALTH_OK", health

@staticmethod
def _evict_ceph_mds_clients(model_name=None):
    """Evict any ceph mds clients present.

    Essentially work around a manila-ganesha deployment bug:
    https://bugs.launchpad.net/charm-manila-ganesha/+bug/2073498

    :param model_name: optional Juju model to operate on.
    :type model_name: Optional[str]
    """
    # NOTE: evicting a client adds it to the mds blocklist; this
    # shouldn't matter for the ephemeral nature of the test.
    listing = zaza.model.run_on_leader(
        "ceph-mon", "sudo ceph tell mds.0 client ls",
        model_name=model_name)
    ids = [entry['id'] for entry in json.loads(listing['Stdout'])]
    logging.info("Evicting clients %s", ", ".join(str(c) for c in ids))
    # evict each client individually.
    for client_id in ids:
        logging.info("Evicting client %s", client_id)
        zaza.model.run_on_leader(
            "ceph-mon",
            "sudo ceph tell mds.0 client evict id={}".format(client_id),
            model_name=model_name)

def test_manila_share(self):
"""Test that a Manila share can be accessed on two instances.
Expand All @@ -392,9 +453,12 @@ def test_manila_share(self):
fip_2 = neutron_tests.floating_ips_from_instance(instance_2)[0]

# force a restart to clear out any clients that may be hanging around
# due to restarts on manila-ganesha during deployment.
# due to restarts on manila-ganesha during deployment; this also forces
# an HA manila into a stable state.
self._restart_share_instance()
self._wait_for_ceph_healthy()
# Clean out any old clients causes by restarting manila-ganesha shares
# and ganesha.nfsd daemons.
self._make_ceph_healthy()
# Create a share
share = self.manila_client.shares.create(
share_type=self.share_type_name,
Expand Down
10 changes: 7 additions & 3 deletions zaza/openstack/charm_tests/manila_ganesha/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,15 @@ def _restart_share_instance(self):
# everywhere isn't harmful. Pacemaker handles restarting
# the services
logging.info(
"For %s, running systemctl stop manila-share "
"nfs-ganesha", unit.entity_id)
"For %s, running systemctl stop manila-share, "
"kill -HUP pidof ganesha.nfsd", unit.entity_id)
zaza.model.run_on_unit(
unit.entity_id,
"systemctl stop manila-share")
zaza.model.run_on_unit(
unit.entity_id,
"systemctl stop manila-share nfs-ganesha")
'pidof ganesha.nfsd && '
'kill -HUP $(pidof ganesha.nfsd)')
else:
logging.info(
"For %s, running systemctl restart manila-share "
Expand Down

0 comments on commit 363ff1d

Please sign in to comment.