Skip to content

Commit

Permalink
Merge pull request ceph#58282 from NitzanMordhai/wip-nitzan-daemonwat…
Browse files Browse the repository at this point in the history
…chdog-should-terminate-thrasher-when-bark

qa/tasks: watchdog should terminate thrasher
  • Loading branch information
NitzanMordhai authored Aug 6, 2024
2 parents e88ab65 + a035b5a commit 58a668d
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 8 deletions.
16 changes: 14 additions & 2 deletions qa/tasks/ceph_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,11 +870,16 @@ def all_up_in(self):
self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity',
str(osd), str(1))

def do_join(self):
def stop(self):
"""
Break out of this Ceph loop
Stop the thrasher
"""
self.stopping = True

def join(self):
"""
Break out of this Ceph loop
"""
self.thread.get()
if self.sighup_delay:
self.log("joining the do_sighup greenlet")
Expand All @@ -889,6 +894,13 @@ def do_join(self):
self.log("joining the do_noscrub_toggle greenlet")
self.noscrub_toggle_thread.join()

def stop_and_join(self):
"""
Stop and join the thrasher
"""
self.stop()
return self.join()

def grow_pool(self):
"""
Increase the size of the pool
Expand Down
1 change: 1 addition & 0 deletions qa/tasks/daemonwatchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def watch(self):
for thrasher in self.thrashers:
if thrasher.exception is not None:
self.log("{name} failed".format(name=thrasher.name))
thrasher.stop_and_join()
bark = True

if bark:
Expand Down
17 changes: 15 additions & 2 deletions qa/tasks/mon_thrash.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,26 @@ def log(self, x):
"""
self.logger.info(x)

def do_join(self):
def stop(self):
"""
Stop the thrashing process.
"""
self.stopping = True

def join(self):
"""
Break out of this processes thrashing loop.
"""
self.stopping.set()
self.thread.get()

def stop_and_join(self):
"""
Stop the thrashing process and join the thread.
"""
self.stop()
return self.join()

def should_thrash_store(self):
"""
If allowed, indicate that we should thrash a certain percentage of
Expand Down Expand Up @@ -450,6 +463,6 @@ def task(ctx, config):
yield
finally:
log.info('joining mon_thrasher')
thrash_proc.do_join()
thrash_proc.stop_and_join()
mons = _get_mons(ctx)
manager.wait_for_mon_quorum_size(len(mons))
14 changes: 11 additions & 3 deletions qa/tasks/scrub.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def task(ctx, config):
yield
finally:
log.info('joining scrub')
scrub_proc.do_join()
scrub_proc.stop_and_join()

class Scrubber:
"""
Expand Down Expand Up @@ -91,11 +91,19 @@ def tmp(x):

self.thread = gevent.spawn(self.do_scrub)

def do_join(self):
"""Scrubbing thread finished"""
def stop(self):
"""Stop scrubbing"""
self.stopping = True

def join(self):
"""Scrubbing thread finished"""
self.thread.get()

def stop_and_join(self):
"""Stop scrubbing thread"""
self.stop()
return self.join()

def do_scrub(self):
"""Perform the scrub operation"""
frequency = self.config.get("frequency", 30)
Expand Down
7 changes: 7 additions & 0 deletions qa/tasks/thrasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ def exception(self):
def set_thrasher_exception(self, e):
self._exception = e

def stop(self):
raise NotImplementedError("Subclasses didn't implement this method.")

class ThrasherGreenlet(Thrasher, Greenlet):

class Stopped(Exception): ...
Expand Down Expand Up @@ -46,3 +49,7 @@ def sleep_unless_stopped(self, seconds, raise_stopped = True):
if self.is_stopped and raise_stopped:
raise self.Stopped()
return not self.is_stopped

def stop_and_join(self):
self.stop()
return self.join()
2 changes: 1 addition & 1 deletion qa/tasks/thrashosds.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def task(ctx, config):
yield
finally:
log.info('joining thrashosds')
thrash_proc.do_join()
thrash_proc.stop_and_join()
cluster_manager.wait_for_all_osds_up()
cluster_manager.flush_all_pg_stats()
cluster_manager.wait_for_recovery(config.get('timeout', 360))
Expand Down

0 comments on commit 58a668d

Please sign in to comment.