reble · mfrancepillois · Feb 23, 2024 · Feb 26, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
@@ -341,6 +341,10 @@ class assume_buffer_outlives_graph {
   public:
     assume_buffer_outlives_graph() = default;
 };
+
+public:
+  enable_profiling() = default;
+};
 } // namespace graph
 
 namespace node {
@@ -646,6 +650,18 @@ which is used in a graph will be kept alive on the host for the lifetime of the
 graph. Destroying that buffer during the lifetime of a `command_graph`
 constructed with this property results in undefined behavior.
 
+===== Enable-Profiling Property [[enable-profiling]]
+
+The `property::graph::enable_profiling` property can be passed to the 
+`command_graph::finalize()` function and enables profiling support 
+for the returned `command_graph<graph_state::executable>`.
+Passing this property to the finalize function implies disabling 
+certain optimizations to enable graph profiling.
+As a result, the execution time of a graph finalized with profiling enabled 
+is longer than that of a graph without profiling capability.
+In this case, an error will be thrown when attempting to profiling an event 
+from a graph submission that was created without this property.
+
 ==== Graph Member Functions
 
 Table {counter: tableNumber}. Constructor of the `command_graph` class.
@@ -849,8 +865,9 @@ Preconditions:
 
 Parameters:
 
-* `propList` - Optional parameter for passing properties. No finalization
-  properties are defined by this extension.
+* `propList` - Optional parameter for passing properties. The only defined 
+property is `enable_profiling`. See <<enable-profiling, Enable-Profiling>>
+for more details.
 
 Returns: A new executable graph object which can be submitted to a queue.
 

diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
@@ -2304,7 +2304,8 @@ typedef enum {
 struct pi_ext_command_buffer_desc final {
   pi_ext_structure_type stype;
   const void *pNext;
-  pi_queue_properties *properties;
+  pi_bool is_in_order;
+  pi_bool enable_profiling;
 };
 
 /// API to create a command-buffer.

diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp
@@ -47,8 +47,9 @@ enum DataLessPropKind {
   GraphAssumeDataOutlivesBuffer = 22,
   GraphAssumeBufferOutlivesGraph = 23,
   GraphDependOnAllLeaves = 24,
+  GraphEnableProfiling = 25,
   // Indicates the last known dataless property.
-  LastKnownDataLessPropKind = 24,
+  LastKnownDataLessPropKind = 25,
   // Exceeding 32 may cause ABI breaking change on some of OSes.
   DataLessPropKindSize = 32
 };

diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -146,6 +146,15 @@ class assume_buffer_outlives_graph
 public:
   assume_buffer_outlives_graph() = default;
 };
+
+/// Property used to enable graph profiling.
+/// Passing this property to the `command_graph::finalize()` function
+/// ensures that profiling can be used on the generated graph.
+class enable_profiling : public ::sycl::detail::DataLessProperty<
+                             ::sycl::detail::GraphEnableProfiling> {
+public:
+  enable_profiling() = default;
+};
 } // namespace graph
 
 namespace node {
@@ -340,8 +349,10 @@ class __SYCL_EXPORT executable_command_graph {
   /// Constructor used by internal runtime.
   /// @param Graph Detail implementation class to construct with.
   /// @param Ctx Context to use for graph.
+  /// @param EnableProfiling Enable graph profiling.
   executable_command_graph(const std::shared_ptr<detail::graph_impl> &Graph,
-                           const sycl::context &Ctx);
+                           const sycl::context &Ctx,
+                           const bool EnableProfiling = false);
 
   template <class Obj>
   friend decltype(Obj::impl)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,9 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 3487672ceba0fd3575b5f3f15a832b100dc5fbad
-  # Author: Artur Gainullin <[email protected]>
-  # Date:   Fri Feb 16 09:59:50 2024 -0800
-  #
-  #     [UR] Provide flexibility to replace unified-memory-framework repo and tag
-  set(UNIFIED_RUNTIME_TAG 3487672ceba0fd3575b5f3f15a832b100dc5fbad)
-
+  set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git")
+  set(UNIFIED_RUNTIME_TAG maxime/in-order-cmd-list)
+
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
   endif()

diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -4425,13 +4425,15 @@ piextCommandBufferCreate(pi_context Context, pi_device Device,
   ur_context_handle_t UrContext =
       reinterpret_cast<ur_context_handle_t>(Context);
   ur_device_handle_t UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
-  const ur_exp_command_buffer_desc_t *UrDesc =
-      reinterpret_cast<const ur_exp_command_buffer_desc_t *>(Desc);
+  ur_exp_command_buffer_desc_t UrDesc;
+  UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
+  UrDesc.isInOrder = Desc->is_in_order;
+  UrDesc.enableProfiling = Desc->enable_profiling;
   ur_exp_command_buffer_handle_t *UrCommandBuffer =
       reinterpret_cast<ur_exp_command_buffer_handle_t *>(RetCommandBuffer);
 
   HANDLE_ERRORS(
-      urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer));
+      urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer));
 
   return PI_SUCCESS;
 }

diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
@@ -290,6 +290,8 @@ class event_impl {
     return MEventFromSubmittedExecCommandBuffer;
   }
 
+  void setProfilingEnabled(bool Value) { MIsProfilingEnabled = Value; }
+
 protected:
   // When instrumentation is enabled emits trace event for event wait begin and
   // returns the telemetry event generated for the wait
@@ -313,7 +315,7 @@ class event_impl {
   std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
   void *MCommand = nullptr;
   std::weak_ptr<queue_impl> MQueue;
-  const bool MIsProfilingEnabled = false;
+  bool MIsProfilingEnabled = false;
   const bool MFallbackProfiling = false;
 
   std::weak_ptr<queue_impl> MWorkerQueue;

diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
@@ -265,6 +265,7 @@ void exec_graph_impl::makePartitions() {
     }
     if (Partition->MRoots.size() > 0) {
       Partition->schedule();
+      Partition->checkIfGraphIsSinglePath();
       MPartitions.push_back(Partition);
       PartitionFinalNum++;
     }
@@ -680,7 +681,10 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode(
 void exec_graph_impl::createCommandBuffers(
     sycl::device Device, std::shared_ptr<partition> &Partition) {
   sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer;
-  sycl::detail::pi::PiExtCommandBufferDesc Desc{};
+  sycl::detail::pi::PiExtCommandBufferDesc Desc{
+      pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr,
+      pi_bool(Partition->MIsInOrderGraph & !MEnableProfiling),
+      pi_bool(MEnableProfiling)};
   auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
   const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
   auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);
@@ -942,6 +946,9 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
       NewEvent->attachEventToComplete(Elem.second);
     }
   }
+  if (!MEnableProfiling) {
+    NewEvent->setProfilingEnabled(false);
+  }
   sycl::event QueueEvent =
       sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
   return QueueEvent;
@@ -1152,12 +1159,16 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) {
 }
 
 command_graph<graph_state::executable>
-modifiable_command_graph::finalize(const sycl::property_list &) const {
+modifiable_command_graph::finalize(const sycl::property_list &PropList) const {
   // Graph is read and written in this scope so we lock
   // this graph with full priviledges.
   graph_impl::WriteLock Lock(impl->MMutex);
-  return command_graph<graph_state::executable>{this->impl,
-                                                this->impl->getContext()};
+  bool EnableProfiling = false;
+  if (PropList.has_property<property::graph::enable_profiling>()) {
+    EnableProfiling = true;
+  }
+  return command_graph<graph_state::executable>{
+      this->impl, this->impl->getContext(), EnableProfiling};
 }
 
 bool modifiable_command_graph::begin_recording(queue &RecordingQueue) {
@@ -1271,8 +1282,10 @@ std::vector<node> modifiable_command_graph::get_root_nodes() const {
 }
 
 executable_command_graph::executable_command_graph(
-    const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx)
-    : impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph)) {
+    const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx,
+    const bool EnableProfiling)
+    : impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph,
+                                                     EnableProfiling)) {
   finalizeImpl(); // Create backend representation for executable graph
 }
 

diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
@@ -331,6 +331,24 @@ class node_impl {
     }
   }
 
+  /// Test if the node contains a N-D copy
+  /// @return true if the op is a N-D copy
+  bool isNDCopyNode() const {
+    if ((MCGType == sycl::detail::CG::CGTYPE::CopyAccToAcc) ||
+        (MCGType == sycl::detail::CG::CGTYPE::CopyAccToPtr) ||
+        (MCGType == sycl::detail::CG::CGTYPE::CopyPtrToAcc)) {
+      sycl::detail::CGCopy *Copy = (sycl::detail::CGCopy *)MCommandGroup.get();
+      sycl::detail::Requirement *ReqSrc =
+          (sycl::detail::Requirement *)(Copy->getSrc());
+      sycl::detail::Requirement *ReqDst =
+          (sycl::detail::Requirement *)(Copy->getDst());
+      if ((ReqSrc->MDims > 1) || (ReqDst->MDims > 1)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
 private:
   /// Prints Node information to Stream.
   /// @param Stream Where to print the Node information
@@ -550,13 +568,35 @@ class partition {
       MPiCommandBuffers;
   /// List of predecessors to this partition.
   std::vector<std::shared_ptr<partition>> MPredecessors;
+  /// True is the graph of this partition is a single path graph
+  /// and InOrder optmization can be applied on it.
+  bool MIsInOrderGraph = false;
 
   /// @return True if the partition contains a host task
   bool isHostTask() const {
     return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
                               sycl::detail::CG::CGTYPE::CodeplayHostTask));
   }
 
+  /// Checks if the graph is single path, i.e. each node has a single successor.
+  /// If so, the MIsInOrderGraph flag is set.
+  void checkIfGraphIsSinglePath() {
+    MIsInOrderGraph = true;
+    if (MRoots.size() > 1) {
+      MIsInOrderGraph = false;
+      return;
+    }
+    for (const auto &Node : MSchedule) {
+      // In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not
+      // be enqueued in an in-order cmd-list (causing execution to stall).
+      // The 2D Copy test should be removed from here when the bug is fixed.
+      if ((Node->MSuccessors.size() > 1) || (Node->isNDCopyNode())) {
+        MIsInOrderGraph = false;
+        return;
+      }
+    }
+  }
+
   /// Add nodes to MSchedule.
   void schedule();
 };
@@ -1021,10 +1061,12 @@ class exec_graph_impl {
   /// nodes).
   /// @param Context Context to create graph with.
   /// @param GraphImpl Modifiable graph implementation to create with.
+  /// @param EnableProfiling Enable graph profiling.
   exec_graph_impl(sycl::context Context,
-                  const std::shared_ptr<graph_impl> &GraphImpl)
+                  const std::shared_ptr<graph_impl> &GraphImpl,
+                  const bool EnableProfiling = false)
       : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context),
-        MRequirements(), MExecutionEvents() {
+        MRequirements(), MExecutionEvents(), MEnableProfiling(EnableProfiling) {
     // Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
     duplicateNodes();
   }
@@ -1195,6 +1237,8 @@ class exec_graph_impl {
       MPartitionsExecutionEvents;
   /// Storage for copies of nodes from the original modifiable graph.
   std::vector<std::shared_ptr<node_impl>> MNodeStorage;
+  /// If true, the graph profiling is enabled.
+  bool MEnableProfiling = false;
 };
 
 } // namespace detail

diff --git a/sycl/test-e2e/Graph/event_profiling_info.cpp b/sycl/test-e2e/Graph/event_profiling_info.cpp
@@ -130,8 +130,12 @@ int main() {
 
     KernelGraph.end_recording(Queue);
 
-    auto CopyGraphExec = CopyGraph.finalize();
-    auto KernelGraphExec = KernelGraph.finalize();
+    // The profiling is not available with the in-order optimization.
+    // We therefore disable this optimization.
+    auto CopyGraphExec =
+        CopyGraph.finalize(exp_ext::property::graph::enable_profiling{});
+    auto KernelGraphExec =
+        KernelGraph.finalize(exp_ext::property::graph::enable_profiling{});
 
     event CopyEvent, KernelEvent1, KernelEvent2;
     // Run graphs
@@ -178,6 +182,21 @@ int main() {
     assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) &&
            verifyProfiling(KernelEvent2) &&
            compareProfiling(KernelEvent1, KernelEvent2));
+
+    // Checks exception thrown if profiling is requested while the
+    // enable_profiling property has not been passed to `finalize()`.
+    auto CopyGraphExecInOrder = CopyGraph.finalize();
+    auto EventInOrder = Queue.submit(
+        [&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExecInOrder); });
+    Queue.wait_and_throw();
+    bool Success = false;
+    try {
+      EventInOrder
+          .get_profiling_info<sycl::info::event_profiling::command_start>();
+    } catch (sycl::exception &E) {
+      Success = true;
+    }
+    assert(Success);
   }
 
   host_accessor HostData(BufferTo);