Work Stealing (#48)

* Add a work stealing benchmark

* Replace implementation of MPMCQ

  This replaces the existing queue implementation with a new version that supports dequeue_all. The dequeue_all operation will be used in a subsequent PR, which will implement a better work stealing algorithm.

* Add multiple queues per core

  This adds a fixed set of queues per core that are used in a round-robin fashion. This allows more efficient stealing, which will be part of the next commit.

* Fairness

  The new stealing mechanism can work with fairness, but works less well: a steal is more likely to take the fairness token, and hence we get somewhat worse behaviour.
microsoft · Nov 21, 2024 · 7deb587 · 7deb587
1 parent 7579e66
commit 7deb587
Show file tree

Hide file tree

Showing 9 changed files with 480 additions and 206 deletions.
diff --git a/src/rt/ds/wrapindex.h b/src/rt/ds/wrapindex.h
@@ -0,0 +1,37 @@
+// Copyright Microsoft and Project Verona Contributors.
+// SPDX-License-Identifier: MIT
+#pragma once
+
+#include <cstddef>
+
+// WrapIndex holds an index in the range [0, N) that wraps around at both ends.
+template<size_t N>
+class WrapIndex
+{
+  size_t index;

+public:
+  WrapIndex() : index(0) {}

+  // Pre-increment: steps to the next index (N - 1 wraps to 0); returns the new value.
+  size_t operator++()
+  {
+    index = (index + 1) % N;
+    return index;
+  }

+  size_t operator--(int) // Post-decrement: steps back (0 wraps to N - 1); returns the old value.
+  {
+    auto result = index;
+    if (result == 0)
+      index = N - 1;
+    else
+      index--;
+    return result;
+  }

+  operator size_t() const // Implicit conversion yielding the current index.
+  {
+    return index;
+  }
+};
diff --git a/src/rt/sched/core.h b/src/rt/sched/core.h
@@ -5,6 +5,7 @@
 #include "mpmcq.h"
 #include "schedulerstats.h"
 #include "work.h"
+#include "workstealingqueue.h"
 
 #include <atomic>
 #include <snmalloc/snmalloc.h>
@@ -15,9 +16,22 @@ namespace verona::rt
   {
   public:
     size_t affinity = 0;
-    MPMCQ<Work> q;
+    WorkStealingQueue<4> q;
     std::atomic<Core*> next{nullptr};
-    std::atomic<bool> should_steal_for_fairness{false};
+
+    std::atomic<bool> should_steal_for_fairness{true};
+
+    /**
+     * @brief Create a token work object.  It is affinitised to the `this`
+     * core, and marks that stealing is required, for fairness.
+     */
+    Work* token_work{Closure::make([this](Work* w) {
+      this->should_steal_for_fairness = true;
+      // The token work is only deallocated during the destruction of the core.
+      // The destructor nulls `token_work` before running it, so this returns
+      // true and the closure code runs destructors and deallocates the memory.
+      return this->token_work == nullptr;
+    })};
 
     /// Progress and synchronization between the threads.
     //  These counters represent progress on a CPU core, not necessarily on
@@ -28,24 +42,14 @@ namespace verona::rt
 
     SchedulerStats stats;
 
-    /**
-     * @brief Create a token work object.  It is affinitised to the `home`
-     * core, and marks that stealing is required, for fairness. Once completed
-     * it reschedules itself on the home core.
-     */
-    Work* create_token_work(Core* home)
-    {
-      auto w = Closure::make([home](Work* w) {
-        home->should_steal_for_fairness = true;
-        home->q.enqueue(w);
-        return false;
-      });
-      return w;
-    }
-
   public:
-    Core() : q{create_token_work(this)} {}
+    Core() : q{} {}
 
-    ~Core() {}
+    ~Core()
+    {
+      auto tw = token_work;
+      token_work = nullptr;
+      tw->run();
+    }
   };
 }