Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Graph] Enable L0 optimizations (no profiling mode) #358

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ class assume_buffer_outlives_graph {
public:
assume_buffer_outlives_graph() = default;
};

EwanC marked this conversation as resolved.
Show resolved Hide resolved
class disable_in_order_optimization {
public:
disable_in_order_optimization() = default;
};
} // namespace graph

namespace node {
Expand Down Expand Up @@ -646,6 +651,15 @@ which is used in a graph will be kept alive on the host for the lifetime of the
graph. Destroying that buffer during the lifetime of a `command_graph`
constructed with this property results in undefined behavior.

===== Disable-In-Order-Optimization Property [[disable-in-order-optimization]]

The `property::graph::disable_in_order_optimization` property disables
an optimization which can be applied to in-order graph to
reduce its execution time.
This property is passed to the `command_graph::finalize()` function.
Note that this property must be passed to the finalize function if users
want to profile the exec_graph execution.
EwanC marked this conversation as resolved.
Show resolved Hide resolved

==== Graph Member Functions

Table {counter: tableNumber}. Constructor of the `command_graph` class.
Expand Down Expand Up @@ -849,8 +863,9 @@ Preconditions:

Parameters:

* `propList` - Optional parameter for passing properties. No finalization
properties are defined by this extension.
* `propList` - Optional parameter for passing properties. The only defined
property is `disable_in_order_optimization`. See <<disable_in_order_optimization, Disable-In-Order-Optimization>>
EwanC marked this conversation as resolved.
Show resolved Hide resolved
for more details.

Returns: A new executable graph object which can be submitted to a queue.

Expand Down
2 changes: 1 addition & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2304,7 +2304,7 @@ typedef enum {
struct pi_ext_command_buffer_desc final {
pi_ext_structure_type stype;
const void *pNext;
pi_queue_properties *properties;
pi_bool is_in_order;
};

/// API to create a command-buffer.
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/detail/property_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ enum DataLessPropKind {
GraphAssumeDataOutlivesBuffer = 22,
GraphAssumeBufferOutlivesGraph = 23,
GraphDependOnAllLeaves = 24,
DisableInOrderOptimization = 25,
// Indicates the last known dataless property.
LastKnownDataLessPropKind = 24,
LastKnownDataLessPropKind = 25,
// Exceeding 32 may cause ABI breaking change on some of OSes.
DataLessPropKindSize = 32
};
Expand Down
13 changes: 12 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,15 @@ class assume_buffer_outlives_graph
public:
assume_buffer_outlives_graph() = default;
};

/// Property used to to add all previous graph leaves as dependencies when
/// creating a new node with command_graph::add().
EwanC marked this conversation as resolved.
Show resolved Hide resolved
class disable_in_order_optimization
: public ::sycl::detail::DataLessProperty<
::sycl::detail::DisableInOrderOptimization> {
public:
disable_in_order_optimization() = default;
};
} // namespace graph

namespace node {
Expand Down Expand Up @@ -340,8 +349,10 @@ class __SYCL_EXPORT executable_command_graph {
/// Constructor used by internal runtime.
/// @param Graph Detail implementation class to construct with.
/// @param Ctx Context to use for graph.
/// @param DisableInOrderOptimization Disable the In Order graph opimization
executable_command_graph(const std::shared_ptr<detail::graph_impl> &Graph,
const sycl::context &Ctx);
const sycl::context &Ctx,
const bool DisableInOrderOptimization = false);

template <class Obj>
friend decltype(Obj::impl)
Expand Down
11 changes: 3 additions & 8 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,9 @@ endif()
if(SYCL_PI_UR_USE_FETCH_CONTENT)
include(FetchContent)

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 3487672ceba0fd3575b5f3f15a832b100dc5fbad
# Author: Artur Gainullin <[email protected]>
# Date: Fri Feb 16 09:59:50 2024 -0800
#
# [UR] Provide flexibility to replace unified-memory-framework repo and tag
set(UNIFIED_RUNTIME_TAG 3487672ceba0fd3575b5f3f15a832b100dc5fbad)

set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git")
set(UNIFIED_RUNTIME_TAG maxime/in-order-cmd-list)

if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
endif()
Expand Down
7 changes: 4 additions & 3 deletions sycl/plugins/unified_runtime/pi2ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4425,13 +4425,14 @@ piextCommandBufferCreate(pi_context Context, pi_device Device,
ur_context_handle_t UrContext =
reinterpret_cast<ur_context_handle_t>(Context);
ur_device_handle_t UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
const ur_exp_command_buffer_desc_t *UrDesc =
reinterpret_cast<const ur_exp_command_buffer_desc_t *>(Desc);
ur_exp_command_buffer_desc_t UrDesc;
UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
UrDesc.isInOrder = Desc->is_in_order;
ur_exp_command_buffer_handle_t *UrCommandBuffer =
reinterpret_cast<ur_exp_command_buffer_handle_t *>(RetCommandBuffer);

HANDLE_ERRORS(
urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer));
urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer));

return PI_SUCCESS;
}
Expand Down
25 changes: 19 additions & 6 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ void exec_graph_impl::makePartitions() {
}
if (Partition->MRoots.size() > 0) {
Partition->schedule();
Partition->checkIfGraphIsSinglePath();
MPartitions.push_back(Partition);
PartitionFinalNum++;
}
Expand Down Expand Up @@ -613,6 +614,8 @@ std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents() {
void exec_graph_impl::findRealDeps(
std::vector<sycl::detail::pi::PiExtSyncPoint> &Deps,
std::shared_ptr<node_impl> CurrentNode, int ReferencePartitionNum) {
// if (MPartitions[MPartitionNodes[CurrentNode]]->MIsInOrderGraph)
// return;
EwanC marked this conversation as resolved.
Show resolved Hide resolved
if (CurrentNode->isEmpty()) {
for (auto &N : CurrentNode->MPredecessors) {
auto NodeImpl = N.lock();
Expand Down Expand Up @@ -680,7 +683,9 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode(
void exec_graph_impl::createCommandBuffers(
sycl::device Device, std::shared_ptr<partition> &Partition) {
sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer;
sycl::detail::pi::PiExtCommandBufferDesc Desc{};
sycl::detail::pi::PiExtCommandBufferDesc Desc{
pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr,
pi_bool(Partition->MIsInOrderGraph & UseInOrderCommandList)};
auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);
Expand All @@ -694,6 +699,8 @@ void exec_graph_impl::createCommandBuffers(

Partition->MPiCommandBuffers[Device] = OutCommandBuffer;

printGraphAsDot("execgraph.dot", false);
EwanC marked this conversation as resolved.
Show resolved Hide resolved

for (const auto &Node : Partition->MSchedule) {
// Empty nodes are not processed as other nodes, but only their
// dependencies are propagated in findRealDeps
Expand Down Expand Up @@ -1152,12 +1159,16 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) {
}

command_graph<graph_state::executable>
modifiable_command_graph::finalize(const sycl::property_list &) const {
modifiable_command_graph::finalize(const sycl::property_list &PropList) const {
// Graph is read and written in this scope so we lock
// this graph with full priviledges.
graph_impl::WriteLock Lock(impl->MMutex);
return command_graph<graph_state::executable>{this->impl,
this->impl->getContext()};
bool DisableInOrderOptim = false;
if (PropList.has_property<property::graph::disable_in_order_optimization>()) {
DisableInOrderOptim = true;
}
return command_graph<graph_state::executable>{
this->impl, this->impl->getContext(), DisableInOrderOptim};
}

bool modifiable_command_graph::begin_recording(queue &RecordingQueue) {
Expand Down Expand Up @@ -1271,8 +1282,10 @@ std::vector<node> modifiable_command_graph::get_root_nodes() const {
}

executable_command_graph::executable_command_graph(
const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx)
: impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph)) {
const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx,
const bool DisableInOrderOptimization)
: impl(std::make_shared<detail::exec_graph_impl>(
Ctx, Graph, DisableInOrderOptimization)) {
finalizeImpl(); // Create backend representation for executable graph
}

Expand Down
52 changes: 50 additions & 2 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,24 @@ class node_impl {
}
}

/// Test is the node contains a N-D copy
mfrancepillois marked this conversation as resolved.
Show resolved Hide resolved
/// @return true is the op is a N-D copy
mfrancepillois marked this conversation as resolved.
Show resolved Hide resolved
bool is2DCopyNode() {
EwanC marked this conversation as resolved.
Show resolved Hide resolved
if ((MCGType == sycl::detail::CG::CGTYPE::CopyAccToAcc) ||
(MCGType == sycl::detail::CG::CGTYPE::CopyAccToPtr) ||
(MCGType == sycl::detail::CG::CGTYPE::CopyPtrToAcc)) {
sycl::detail::CGCopy *Copy = (sycl::detail::CGCopy *)MCommandGroup.get();
sycl::detail::Requirement *ReqSrc =
(sycl::detail::Requirement *)(Copy->getSrc());
sycl::detail::Requirement *ReqDst =
(sycl::detail::Requirement *)(Copy->getDst());
if ((ReqSrc->MDims > 1) || (ReqDst->MDims > 1)) {
return true;
}
}
return false;
}

private:
/// Prints Node information to Stream.
/// @param Stream Where to print the Node information
Expand Down Expand Up @@ -550,13 +568,37 @@ class partition {
MPiCommandBuffers;
/// List of predecessors to this partition.
std::vector<std::shared_ptr<partition>> MPredecessors;
/// True is the graph of this partition is a single path graph
/// and InOrder optmization can be applied on it.
Bensuo marked this conversation as resolved.
Show resolved Hide resolved
bool MIsInOrderGraph = false;

/// @return True if the partition contains a host task
bool isHostTask() const {
return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
sycl::detail::CG::CGTYPE::CodeplayHostTask));
}

/// Checks if the graph is single path, i.e. each node has a single successor.
/// If so, the MIsInOrderGraph flag is set.
void checkIfGraphIsSinglePath() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the overhead of adding this routine in comparison to the potential in-order optimization?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its difficult to give an approximation of the overhead for this routine since it depends of the graph typology.
That said, if in-order graph is found we win on both sides: finalization delay (we do not need to create events) and execution time (we do not have to execute events nor synchronization on them).
On my setup (12th Gen Intel(R) Core(TM) i9-12900K, Intel(R) Level-Zero, Intel(R) UHD Graphics 770 1.3 [1.3.28454]), the finalization delay for an 2000 nodes in-order graph is reduced by ~40%. The execution time is reduced by ~15% and the second execution by ~20% compared to execution with event profiling capability disabled (and respectively 20% and 30% with the current implementation (i.e. event profiling enabled)).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, this sounds promising. I think we should run some microbenchmarks with and without these changes to better understand the overhead for nonlinear graphs.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On my setup, the checkIfGraphIsSinglePath function takes less than 0.01% of the total runtime of finalize for checking 2000 nodes.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Im not sure if just looking at the single function call is a fair metric, because the Schedule of the Graph is already available. That obviously is an implementation detail and won't necessarily take away the concerns about the complexity of this check. Another aspect that might be relevant: What if the check fails, it might be still beneficial to execute Schedule as-is on an in-order CommandList, or interleaving on multiple in-order CommandLists. That brings up the question if a scheduling hint that we'd pass as a property on graph finalization is a better option?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extending running things in-order to be user controlled with a property or something definitely seems like it could be useful.

But as to the other point of discussion here, do we really need to care that much about small optimizations of finalize()? It is expensive by design and barring any outlandishly slow performance it seems to me it doesn't really matter much at all how it performs.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the idea of adding a hint/property to enable in-order command lists in more situation, it seems that probably requires some more in-depth discussion and is probably better done as a separate PR to avoid delaying this one too much.

// Is the graph of this partition a single path graph?
// If so, we can optimize its execution using InOrder optimizations
mfrancepillois marked this conversation as resolved.
Show resolved Hide resolved
MIsInOrderGraph = true;
if (MRoots.size() > 1) {
MIsInOrderGraph = false;
return;
}
for (auto Node : MSchedule) {
EwanC marked this conversation as resolved.
Show resolved Hide resolved
// In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not
// be enqueued in an in-order cmd-list (causing execution to stall).
// The 2D Copy test should be removed from here when the bug is fixed.
if ((Node->MSuccessors.size() > 1) || (Node->is2DCopyNode())) {
MIsInOrderGraph = false;
return;
}
}
}

/// Add nodes to MSchedule.
void schedule();
};
Expand Down Expand Up @@ -1021,10 +1063,14 @@ class exec_graph_impl {
/// nodes).
/// @param Context Context to create graph with.
/// @param GraphImpl Modifiable graph implementation to create with.
/// @param DisableInOrderOptimization Disable the In Order Command-List
/// optimization
exec_graph_impl(sycl::context Context,
const std::shared_ptr<graph_impl> &GraphImpl)
const std::shared_ptr<graph_impl> &GraphImpl,
const bool DisableInOrderOptimization = false)
: MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context),
MRequirements(), MExecutionEvents() {
MRequirements(), MExecutionEvents(),
UseInOrderCommandList(!DisableInOrderOptimization) {
// Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
duplicateNodes();
}
Expand Down Expand Up @@ -1195,6 +1241,8 @@ class exec_graph_impl {
MPartitionsExecutionEvents;
/// Storage for copies of nodes from the original modifiable graph.
std::vector<std::shared_ptr<node_impl>> MNodeStorage;
/// If true, the L0 backend In-order CommandList optimization is enabled.
EwanC marked this conversation as resolved.
Show resolved Hide resolved
bool UseInOrderCommandList = true;
};

} // namespace detail
Expand Down
27 changes: 25 additions & 2 deletions sycl/test-e2e/Graph/event_profiling_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,12 @@ int main() {

KernelGraph.end_recording(Queue);

auto CopyGraphExec = CopyGraph.finalize();
auto KernelGraphExec = KernelGraph.finalize();
// The profiling is not available with the In-Order optimization.
mfrancepillois marked this conversation as resolved.
Show resolved Hide resolved
// We therefore disable this optimization.
auto CopyGraphExec = CopyGraph.finalize(
exp_ext::property::graph::disable_in_order_optimization{});
auto KernelGraphExec = KernelGraph.finalize(
exp_ext::property::graph::disable_in_order_optimization{});

event CopyEvent, KernelEvent1, KernelEvent2;
// Run graphs
Expand Down Expand Up @@ -178,6 +182,25 @@ int main() {
assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) &&
verifyProfiling(KernelEvent2) &&
compareProfiling(KernelEvent1, KernelEvent2));

// Checks exception thrown if profiling is requested while in-order
// optimization enabled. Note that in-order cmd-list optmization is only
// available for level-zero backend.
auto Backend = Dev.get_backend();
if (Backend == backend::ext_oneapi_level_zero) {
EwanC marked this conversation as resolved.
Show resolved Hide resolved
auto CopyGraphExecInOrder = CopyGraph.finalize();
auto EventInOrder = Queue.submit(
[&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExecInOrder); });
Queue.wait_and_throw();
bool Success = false;
try {
EventInOrder
.get_profiling_info<sycl::info::event_profiling::command_start>();
} catch (sycl::exception &E) {
Success = true;
}
assert(Success);
}
}

host_accessor HostData(BufferTo);
Expand Down