reble · mfrancepillois · Feb 23, 2024 · Feb 26, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
@@ -341,6 +341,11 @@ class assume_buffer_outlives_graph {
   public:
     assume_buffer_outlives_graph() = default;
 };
+
+class disable_in_order_optimization {
+public:
+  disable_in_order_optimization() = default;
+};
 } // namespace graph
 
 namespace node {
@@ -646,6 +651,15 @@ which is used in a graph will be kept alive on the host for the lifetime of the
 graph. Destroying that buffer during the lifetime of a `command_graph`
 constructed with this property results in undefined behavior.
 
+===== Disable-In-Order-Optimization Property [[disable-in-order-optimization]]
+
+The `property::graph::disable_in_order_optimization` property disables 
+an optimization which can be applied to in-order graph to 
+reduce its execution time.
+This property is passed to the `command_graph::finalize()` function.
+Note that this property must be passed to the finalize function if users 
+want to profile the exec_graph execution.
+
 ==== Graph Member Functions
 
 Table {counter: tableNumber}. Constructor of the `command_graph` class.
@@ -849,8 +863,9 @@ Preconditions:
 
 Parameters:
 
-* `propList` - Optional parameter for passing properties. No finalization
-  properties are defined by this extension.
+* `propList` - Optional parameter for passing properties. The only defined 
+property is `disable_in_order_optimization`. See <<disable_in_order_optimization, Disable-In-Order-Optimization>>
+for more details.
 
 Returns: A new executable graph object which can be submitted to a queue.
 

diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
@@ -2304,7 +2304,7 @@ typedef enum {
 struct pi_ext_command_buffer_desc final {
   pi_ext_structure_type stype;
   const void *pNext;
-  pi_queue_properties *properties;
+  pi_bool is_in_order;
 };
 
 /// API to create a command-buffer.

diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp
@@ -47,8 +47,9 @@ enum DataLessPropKind {
   GraphAssumeDataOutlivesBuffer = 22,
   GraphAssumeBufferOutlivesGraph = 23,
   GraphDependOnAllLeaves = 24,
+  DisableInOrderOptimization = 25,
   // Indicates the last known dataless property.
-  LastKnownDataLessPropKind = 24,
+  LastKnownDataLessPropKind = 25,
   // Exceeding 32 may cause ABI breaking change on some of OSes.
   DataLessPropKindSize = 32
 };

diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -146,6 +146,15 @@ class assume_buffer_outlives_graph
 public:
   assume_buffer_outlives_graph() = default;
 };
+
+/// Property used to to add all previous graph leaves as dependencies when
+/// creating a new node with command_graph::add().
+class disable_in_order_optimization
+    : public ::sycl::detail::DataLessProperty<
+          ::sycl::detail::DisableInOrderOptimization> {
+public:
+  disable_in_order_optimization() = default;
+};
 } // namespace graph
 
 namespace node {
@@ -340,8 +349,10 @@ class __SYCL_EXPORT executable_command_graph {
   /// Constructor used by internal runtime.
   /// @param Graph Detail implementation class to construct with.
   /// @param Ctx Context to use for graph.
+  /// @param DisableInOrderOptimization Disable the In Order graph opimization
   executable_command_graph(const std::shared_ptr<detail::graph_impl> &Graph,
-                           const sycl::context &Ctx);
+                           const sycl::context &Ctx,
+                           const bool DisableInOrderOptimization = false);
 
   template <class Obj>
   friend decltype(Obj::impl)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,9 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 3487672ceba0fd3575b5f3f15a832b100dc5fbad
-  # Author: Artur Gainullin <[email protected]>
-  # Date:   Fri Feb 16 09:59:50 2024 -0800
-  #
-  #     [UR] Provide flexibility to replace unified-memory-framework repo and tag
-  set(UNIFIED_RUNTIME_TAG 3487672ceba0fd3575b5f3f15a832b100dc5fbad)
-
+  set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git")
+  set(UNIFIED_RUNTIME_TAG maxime/in-order-cmd-list)
+
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
   endif()

diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -4425,13 +4425,14 @@ piextCommandBufferCreate(pi_context Context, pi_device Device,
   ur_context_handle_t UrContext =
       reinterpret_cast<ur_context_handle_t>(Context);
   ur_device_handle_t UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
-  const ur_exp_command_buffer_desc_t *UrDesc =
-      reinterpret_cast<const ur_exp_command_buffer_desc_t *>(Desc);
+  ur_exp_command_buffer_desc_t UrDesc;
+  UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
+  UrDesc.isInOrder = Desc->is_in_order;
   ur_exp_command_buffer_handle_t *UrCommandBuffer =
       reinterpret_cast<ur_exp_command_buffer_handle_t *>(RetCommandBuffer);
 
   HANDLE_ERRORS(
-      urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer));
+      urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer));
 
   return PI_SUCCESS;
 }

diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
@@ -265,6 +265,7 @@ void exec_graph_impl::makePartitions() {
     }
     if (Partition->MRoots.size() > 0) {
       Partition->schedule();
+      Partition->checkIfGraphIsSinglePath();
       MPartitions.push_back(Partition);
       PartitionFinalNum++;
     }
@@ -613,6 +614,8 @@ std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents() {
 void exec_graph_impl::findRealDeps(
     std::vector<sycl::detail::pi::PiExtSyncPoint> &Deps,
     std::shared_ptr<node_impl> CurrentNode, int ReferencePartitionNum) {
+  // if (MPartitions[MPartitionNodes[CurrentNode]]->MIsInOrderGraph)
+  //   return;
   if (CurrentNode->isEmpty()) {
     for (auto &N : CurrentNode->MPredecessors) {
       auto NodeImpl = N.lock();
@@ -680,7 +683,9 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode(
 void exec_graph_impl::createCommandBuffers(
     sycl::device Device, std::shared_ptr<partition> &Partition) {
   sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer;
-  sycl::detail::pi::PiExtCommandBufferDesc Desc{};
+  sycl::detail::pi::PiExtCommandBufferDesc Desc{
+      pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr,
+      pi_bool(Partition->MIsInOrderGraph & UseInOrderCommandList)};
   auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
   const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
   auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);
@@ -694,6 +699,8 @@ void exec_graph_impl::createCommandBuffers(
 
   Partition->MPiCommandBuffers[Device] = OutCommandBuffer;
 
+  printGraphAsDot("execgraph.dot", false);
+
   for (const auto &Node : Partition->MSchedule) {
     // Empty nodes are not processed as other nodes, but only their
     // dependencies are propagated in findRealDeps
@@ -1152,12 +1159,16 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) {
 }
 
 command_graph<graph_state::executable>
-modifiable_command_graph::finalize(const sycl::property_list &) const {
+modifiable_command_graph::finalize(const sycl::property_list &PropList) const {
   // Graph is read and written in this scope so we lock
   // this graph with full priviledges.
   graph_impl::WriteLock Lock(impl->MMutex);
-  return command_graph<graph_state::executable>{this->impl,
-                                                this->impl->getContext()};
+  bool DisableInOrderOptim = false;
+  if (PropList.has_property<property::graph::disable_in_order_optimization>()) {
+    DisableInOrderOptim = true;
+  }
+  return command_graph<graph_state::executable>{
+      this->impl, this->impl->getContext(), DisableInOrderOptim};
 }
 
 bool modifiable_command_graph::begin_recording(queue &RecordingQueue) {
@@ -1271,8 +1282,10 @@ std::vector<node> modifiable_command_graph::get_root_nodes() const {
 }
 
 executable_command_graph::executable_command_graph(
-    const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx)
-    : impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph)) {
+    const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx,
+    const bool DisableInOrderOptimization)
+    : impl(std::make_shared<detail::exec_graph_impl>(
+          Ctx, Graph, DisableInOrderOptimization)) {
   finalizeImpl(); // Create backend representation for executable graph
 }
 

diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
@@ -331,6 +331,24 @@ class node_impl {
     }
   }
 
+  /// Test is the node contains a N-D copy
+  /// @return true is the op is a N-D copy
+  bool is2DCopyNode() {
+    if ((MCGType == sycl::detail::CG::CGTYPE::CopyAccToAcc) ||
+        (MCGType == sycl::detail::CG::CGTYPE::CopyAccToPtr) ||
+        (MCGType == sycl::detail::CG::CGTYPE::CopyPtrToAcc)) {
+      sycl::detail::CGCopy *Copy = (sycl::detail::CGCopy *)MCommandGroup.get();
+      sycl::detail::Requirement *ReqSrc =
+          (sycl::detail::Requirement *)(Copy->getSrc());
+      sycl::detail::Requirement *ReqDst =
+          (sycl::detail::Requirement *)(Copy->getDst());
+      if ((ReqSrc->MDims > 1) || (ReqDst->MDims > 1)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
 private:
   /// Prints Node information to Stream.
   /// @param Stream Where to print the Node information
@@ -550,13 +568,37 @@ class partition {
       MPiCommandBuffers;
   /// List of predecessors to this partition.
   std::vector<std::shared_ptr<partition>> MPredecessors;
+  /// True is the graph of this partition is a single path graph
+  /// and InOrder optmization can be applied on it.
+  bool MIsInOrderGraph = false;
 
   /// @return True if the partition contains a host task
   bool isHostTask() const {
     return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
                               sycl::detail::CG::CGTYPE::CodeplayHostTask));
   }
 
+  /// Checks if the graph is single path, i.e. each node has a single successor.
+  /// If so, the MIsInOrderGraph flag is set.
+  void checkIfGraphIsSinglePath() {
+    // Is the graph of this partition a single path graph?
+    // If so, we can optimize its execution using InOrder optimizations
+    MIsInOrderGraph = true;
+    if (MRoots.size() > 1) {
+      MIsInOrderGraph = false;
+      return;
+    }
+    for (auto Node : MSchedule) {
+      // In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not
+      // be enqueued in an in-order cmd-list (causing execution to stall).
+      // The 2D Copy test should be removed from here when the bug is fixed.
+      if ((Node->MSuccessors.size() > 1) || (Node->is2DCopyNode())) {
+        MIsInOrderGraph = false;
+        return;
+      }
+    }
+  }
+
   /// Add nodes to MSchedule.
   void schedule();
 };
@@ -1021,10 +1063,14 @@ class exec_graph_impl {
   /// nodes).
   /// @param Context Context to create graph with.
   /// @param GraphImpl Modifiable graph implementation to create with.
+  /// @param DisableInOrderOptimization Disable the In Order Command-List
+  /// optimization
   exec_graph_impl(sycl::context Context,
-                  const std::shared_ptr<graph_impl> &GraphImpl)
+                  const std::shared_ptr<graph_impl> &GraphImpl,
+                  const bool DisableInOrderOptimization = false)
       : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context),
-        MRequirements(), MExecutionEvents() {
+        MRequirements(), MExecutionEvents(),
+        UseInOrderCommandList(!DisableInOrderOptimization) {
     // Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
     duplicateNodes();
   }
@@ -1195,6 +1241,8 @@ class exec_graph_impl {
       MPartitionsExecutionEvents;
   /// Storage for copies of nodes from the original modifiable graph.
   std::vector<std::shared_ptr<node_impl>> MNodeStorage;
+  /// If true, the L0 backend In-order CommandList optimization is enabled.
+  bool UseInOrderCommandList = true;
 };
 
 } // namespace detail

diff --git a/sycl/test-e2e/Graph/event_profiling_info.cpp b/sycl/test-e2e/Graph/event_profiling_info.cpp
@@ -130,8 +130,12 @@ int main() {
 
     KernelGraph.end_recording(Queue);
 
-    auto CopyGraphExec = CopyGraph.finalize();
-    auto KernelGraphExec = KernelGraph.finalize();
+    // The profiling is not available with the In-Order optimization.
+    // We therefore disable this optimization.
+    auto CopyGraphExec = CopyGraph.finalize(
+        exp_ext::property::graph::disable_in_order_optimization{});
+    auto KernelGraphExec = KernelGraph.finalize(
+        exp_ext::property::graph::disable_in_order_optimization{});
 
     event CopyEvent, KernelEvent1, KernelEvent2;
     // Run graphs
@@ -178,6 +182,25 @@ int main() {
     assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) &&
            verifyProfiling(KernelEvent2) &&
            compareProfiling(KernelEvent1, KernelEvent2));
+
+    // Checks exception thrown if profiling is requested while in-order
+    // optimization enabled. Note that in-order cmd-list optmization is only
+    // available for level-zero backend.
+    auto Backend = Dev.get_backend();
+    if (Backend == backend::ext_oneapi_level_zero) {
+      auto CopyGraphExecInOrder = CopyGraph.finalize();
+      auto EventInOrder = Queue.submit(
+          [&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExecInOrder); });
+      Queue.wait_and_throw();
+      bool Success = false;
+      try {
+        EventInOrder
+            .get_profiling_info<sycl::info::event_profiling::command_start>();
+      } catch (sycl::exception &E) {
+        Success = true;
+      }
+      assert(Success);
+    }
   }
 
   host_accessor HostData(BufferTo);