Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Graph] Enable L0 optimizations (no profiling mode) #358

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,10 @@ class assume_buffer_outlives_graph {
public:
assume_buffer_outlives_graph() = default;
};

EwanC marked this conversation as resolved.
Show resolved Hide resolved
public:
enable_profiling() = default;
};
} // namespace graph

namespace node {
Expand Down Expand Up @@ -646,6 +650,18 @@ which is used in a graph will be kept alive on the host for the lifetime of the
graph. Destroying that buffer during the lifetime of a `command_graph`
constructed with this property results in undefined behavior.

===== Enable-Profiling Property [[enable-profiling]]

The `property::graph::enable_profiling` property can be passed to the
`command_graph::finalize()` function and enables profiling support
for the returned `command_graph<graph_state::executable>`.
Passing this property to the finalize function implies disabling
certain optimizations to enable graph profiling.
As a result, the execution time of a graph finalized with profiling enabled
is longer than that of a graph without profiling capability.
In this case, an error will be thrown when attempting to profiling an event
mfrancepillois marked this conversation as resolved.
Show resolved Hide resolved
from a graph submission that was created without this property.

==== Graph Member Functions

Table {counter: tableNumber}. Constructor of the `command_graph` class.
Expand Down Expand Up @@ -849,8 +865,9 @@ Preconditions:

Parameters:

* `propList` - Optional parameter for passing properties. No finalization
properties are defined by this extension.
* `propList` - Optional parameter for passing properties. The only defined
property is `enable_profiling`. See <<enable-profiling, Enable-Profiling>>
for more details.

Returns: A new executable graph object which can be submitted to a queue.

Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2304,7 +2304,8 @@ typedef enum {
struct pi_ext_command_buffer_desc final {
pi_ext_structure_type stype;
const void *pNext;
pi_queue_properties *properties;
pi_bool is_in_order;
pi_bool enable_profiling;
};

/// API to create a command-buffer.
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/detail/property_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ enum DataLessPropKind {
GraphAssumeDataOutlivesBuffer = 22,
GraphAssumeBufferOutlivesGraph = 23,
GraphDependOnAllLeaves = 24,
GraphEnableProfiling = 25,
// Indicates the last known dataless property.
LastKnownDataLessPropKind = 24,
LastKnownDataLessPropKind = 25,
// Exceeding 32 may cause ABI breaking change on some of OSes.
DataLessPropKindSize = 32
};
Expand Down
13 changes: 12 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,15 @@ class assume_buffer_outlives_graph
public:
assume_buffer_outlives_graph() = default;
};

/// Property used to enable graph profiling.
/// Passing this property to the `command_graph::finalize()` function
/// ensures that profiling can be used on the generated graph.
class enable_profiling : public ::sycl::detail::DataLessProperty<
::sycl::detail::GraphEnableProfiling> {
public:
enable_profiling() = default;
};
} // namespace graph

namespace node {
Expand Down Expand Up @@ -340,8 +349,10 @@ class __SYCL_EXPORT executable_command_graph {
/// Constructor used by internal runtime.
/// @param Graph Detail implementation class to construct with.
/// @param Ctx Context to use for graph.
/// @param EnableProfiling Enable graph profiling.
executable_command_graph(const std::shared_ptr<detail::graph_impl> &Graph,
const sycl::context &Ctx);
const sycl::context &Ctx,
const bool EnableProfiling = false);
Bensuo marked this conversation as resolved.
Show resolved Hide resolved

template <class Obj>
friend decltype(Obj::impl)
Expand Down
11 changes: 3 additions & 8 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,9 @@ endif()
if(SYCL_PI_UR_USE_FETCH_CONTENT)
include(FetchContent)

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 3487672ceba0fd3575b5f3f15a832b100dc5fbad
# Author: Artur Gainullin <[email protected]>
# Date: Fri Feb 16 09:59:50 2024 -0800
#
# [UR] Provide flexibility to replace unified-memory-framework repo and tag
set(UNIFIED_RUNTIME_TAG 3487672ceba0fd3575b5f3f15a832b100dc5fbad)

set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git")
set(UNIFIED_RUNTIME_TAG maxime/in-order-cmd-list)

if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
endif()
Expand Down
8 changes: 5 additions & 3 deletions sycl/plugins/unified_runtime/pi2ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4425,13 +4425,15 @@ piextCommandBufferCreate(pi_context Context, pi_device Device,
ur_context_handle_t UrContext =
reinterpret_cast<ur_context_handle_t>(Context);
ur_device_handle_t UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
const ur_exp_command_buffer_desc_t *UrDesc =
reinterpret_cast<const ur_exp_command_buffer_desc_t *>(Desc);
ur_exp_command_buffer_desc_t UrDesc;
UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
UrDesc.isInOrder = Desc->is_in_order;
UrDesc.enableProfiling = Desc->enable_profiling;
ur_exp_command_buffer_handle_t *UrCommandBuffer =
reinterpret_cast<ur_exp_command_buffer_handle_t *>(RetCommandBuffer);

HANDLE_ERRORS(
urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer));
urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer));

return PI_SUCCESS;
}
Expand Down
4 changes: 3 additions & 1 deletion sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ class event_impl {
return MEventFromSubmittedExecCommandBuffer;
}

void setProfilingEnabled(bool Value) { MIsProfilingEnabled = Value; }

protected:
// When instrumentation is enabled emits trace event for event wait begin and
// returns the telemetry event generated for the wait
Expand All @@ -313,7 +315,7 @@ class event_impl {
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
void *MCommand = nullptr;
std::weak_ptr<queue_impl> MQueue;
const bool MIsProfilingEnabled = false;
bool MIsProfilingEnabled = false;
EwanC marked this conversation as resolved.
Show resolved Hide resolved
const bool MFallbackProfiling = false;

std::weak_ptr<queue_impl> MWorkerQueue;
Expand Down
25 changes: 19 additions & 6 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ void exec_graph_impl::makePartitions() {
}
if (Partition->MRoots.size() > 0) {
Partition->schedule();
Partition->checkIfGraphIsSinglePath();
MPartitions.push_back(Partition);
PartitionFinalNum++;
}
Expand Down Expand Up @@ -680,7 +681,10 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode(
void exec_graph_impl::createCommandBuffers(
sycl::device Device, std::shared_ptr<partition> &Partition) {
sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer;
sycl::detail::pi::PiExtCommandBufferDesc Desc{};
sycl::detail::pi::PiExtCommandBufferDesc Desc{
pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr,
pi_bool(Partition->MIsInOrderGraph & !MEnableProfiling),
Bensuo marked this conversation as resolved.
Show resolved Hide resolved
pi_bool(MEnableProfiling)};
auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);
Expand Down Expand Up @@ -942,6 +946,9 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
NewEvent->attachEventToComplete(Elem.second);
}
}
if (!MEnableProfiling) {
NewEvent->setProfilingEnabled(false);
}
sycl::event QueueEvent =
sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
return QueueEvent;
Expand Down Expand Up @@ -1152,12 +1159,16 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) {
}

command_graph<graph_state::executable>
modifiable_command_graph::finalize(const sycl::property_list &) const {
modifiable_command_graph::finalize(const sycl::property_list &PropList) const {
// Graph is read and written in this scope so we lock
// this graph with full priviledges.
graph_impl::WriteLock Lock(impl->MMutex);
return command_graph<graph_state::executable>{this->impl,
this->impl->getContext()};
bool EnableProfiling = false;
if (PropList.has_property<property::graph::enable_profiling>()) {
EnableProfiling = true;
}
return command_graph<graph_state::executable>{
this->impl, this->impl->getContext(), EnableProfiling};
}

bool modifiable_command_graph::begin_recording(queue &RecordingQueue) {
Expand Down Expand Up @@ -1271,8 +1282,10 @@ std::vector<node> modifiable_command_graph::get_root_nodes() const {
}

executable_command_graph::executable_command_graph(
const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx)
: impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph)) {
const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx,
const bool EnableProfiling)
: impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph,
EnableProfiling)) {
finalizeImpl(); // Create backend representation for executable graph
}

Expand Down
48 changes: 46 additions & 2 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,24 @@ class node_impl {
}
}

/// Test if the node contains a N-D copy
/// @return true if the op is a N-D copy
bool isNDCopyNode() const {
if ((MCGType == sycl::detail::CG::CGTYPE::CopyAccToAcc) ||
(MCGType == sycl::detail::CG::CGTYPE::CopyAccToPtr) ||
(MCGType == sycl::detail::CG::CGTYPE::CopyPtrToAcc)) {
sycl::detail::CGCopy *Copy = (sycl::detail::CGCopy *)MCommandGroup.get();
sycl::detail::Requirement *ReqSrc =
(sycl::detail::Requirement *)(Copy->getSrc());
sycl::detail::Requirement *ReqDst =
(sycl::detail::Requirement *)(Copy->getDst());
if ((ReqSrc->MDims > 1) || (ReqDst->MDims > 1)) {
return true;
}
}
return false;
}

private:
/// Prints Node information to Stream.
/// @param Stream Where to print the Node information
Expand Down Expand Up @@ -550,13 +568,35 @@ class partition {
MPiCommandBuffers;
/// List of predecessors to this partition.
std::vector<std::shared_ptr<partition>> MPredecessors;
/// True is the graph of this partition is a single path graph
/// and InOrder optmization can be applied on it.
Bensuo marked this conversation as resolved.
Show resolved Hide resolved
bool MIsInOrderGraph = false;

/// @return True if the partition contains a host task
bool isHostTask() const {
return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
sycl::detail::CG::CGTYPE::CodeplayHostTask));
}

/// Checks if the graph is single path, i.e. each node has a single successor.
/// If so, the MIsInOrderGraph flag is set.
void checkIfGraphIsSinglePath() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the overhead of adding this routine in comparison to the potential in-order optimization?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its difficult to give an approximation of the overhead for this routine since it depends of the graph typology.
That said, if in-order graph is found we win on both sides: finalization delay (we do not need to create events) and execution time (we do not have to execute events nor synchronization on them).
On my setup (12th Gen Intel(R) Core(TM) i9-12900K, Intel(R) Level-Zero, Intel(R) UHD Graphics 770 1.3 [1.3.28454]), the finalization delay for an 2000 nodes in-order graph is reduced by ~40%. The execution time is reduced by ~15% and the second execution by ~20% compared to execution with event profiling capability disabled (and respectively 20% and 30% with the current implementation (i.e. event profiling enabled)).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, this sounds promising. I think we should run some microbenchmarks with and without these changes to better understand the overhead for nonlinear graphs.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On my setup, the checkIfGraphIsSinglePath function takes less than 0.01% of the total runtime of finalize for checking 2000 nodes.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Im not sure if just looking at the single function call is a fair metric, because the Schedule of the Graph is already available. That obviously is an implementation detail and won't necessarily take away the concerns about the complexity of this check. Another aspect that might be relevant: What if the check fails, it might be still beneficial to execute Schedule as-is on an in-order CommandList, or interleaving on multiple in-order CommandLists. That brings up the question if a scheduling hint that we'd pass as a property on graph finalization is a better option?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extending running things in-order to be user controlled with a property or something definitely seems like it could be useful.

But as to the other point of discussion here, do we really need to care that much about small optimizations of finalize()? It is expensive by design and barring any outlandishly slow performance it seems to me it doesn't really matter much at all how it performs.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the idea of adding a hint/property to enable in-order command lists in more situation, it seems that probably requires some more in-depth discussion and is probably better done as a separate PR to avoid delaying this one too much.

MIsInOrderGraph = true;
if (MRoots.size() > 1) {
MIsInOrderGraph = false;
return;
}
for (const auto &Node : MSchedule) {
// In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not
// be enqueued in an in-order cmd-list (causing execution to stall).
// The 2D Copy test should be removed from here when the bug is fixed.
if ((Node->MSuccessors.size() > 1) || (Node->isNDCopyNode())) {
MIsInOrderGraph = false;
return;
}
}
}

/// Add nodes to MSchedule.
void schedule();
};
Expand Down Expand Up @@ -1021,10 +1061,12 @@ class exec_graph_impl {
/// nodes).
/// @param Context Context to create graph with.
/// @param GraphImpl Modifiable graph implementation to create with.
/// @param EnableProfiling Enable graph profiling.
exec_graph_impl(sycl::context Context,
const std::shared_ptr<graph_impl> &GraphImpl)
const std::shared_ptr<graph_impl> &GraphImpl,
const bool EnableProfiling = false)
: MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context),
MRequirements(), MExecutionEvents() {
MRequirements(), MExecutionEvents(), MEnableProfiling(EnableProfiling) {
// Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
duplicateNodes();
}
Expand Down Expand Up @@ -1195,6 +1237,8 @@ class exec_graph_impl {
MPartitionsExecutionEvents;
/// Storage for copies of nodes from the original modifiable graph.
std::vector<std::shared_ptr<node_impl>> MNodeStorage;
/// If true, the graph profiling is enabled.
bool MEnableProfiling = false;
};

} // namespace detail
Expand Down
23 changes: 21 additions & 2 deletions sycl/test-e2e/Graph/event_profiling_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,12 @@ int main() {

KernelGraph.end_recording(Queue);

auto CopyGraphExec = CopyGraph.finalize();
auto KernelGraphExec = KernelGraph.finalize();
// The profiling is not available with the in-order optimization.
// We therefore disable this optimization.
Bensuo marked this conversation as resolved.
Show resolved Hide resolved
auto CopyGraphExec =
CopyGraph.finalize(exp_ext::property::graph::enable_profiling{});
auto KernelGraphExec =
KernelGraph.finalize(exp_ext::property::graph::enable_profiling{});

event CopyEvent, KernelEvent1, KernelEvent2;
// Run graphs
Expand Down Expand Up @@ -178,6 +182,21 @@ int main() {
assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) &&
verifyProfiling(KernelEvent2) &&
compareProfiling(KernelEvent1, KernelEvent2));

// Checks exception thrown if profiling is requested while the
// enable_profiling property has not been passed to `finalize()`.
auto CopyGraphExecInOrder = CopyGraph.finalize();
auto EventInOrder = Queue.submit(
[&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExecInOrder); });
Queue.wait_and_throw();
bool Success = false;
try {
EventInOrder
.get_profiling_info<sycl::info::event_profiling::command_start>();
} catch (sycl::exception &E) {
Success = true;
}
assert(Success);
EwanC marked this conversation as resolved.
Show resolved Hide resolved
}

host_accessor HostData(BufferTo);
Expand Down