From ef24789e28ee58d302e6475579ee90d3bf765763 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 18 Feb 2022 15:16:27 -0600 Subject: [PATCH 01/82] Inital version of sycl graph prototype --- .../sycl/ext/oneapi/experimental/graph.hpp | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/graph.hpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp new file mode 100644 index 0000000000000..08a7d094e9054 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -0,0 +1,212 @@ +//==--------- graph.hpp --- SYCL graph extension ---------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +struct node_impl; + +struct graph_impl; + +using node_ptr = std::shared_ptr; + +using graph_ptr = std::shared_ptr; + +class wrapper { + using T = std::function; + T my_func; + std::vector my_deps; +public: + wrapper(T t, const std::vector& deps) : my_func(t), my_deps(deps) {}; + + void operator()(sycl::handler& cgh) { + cgh.depends_on(my_deps); + std::invoke(my_func,cgh); + } +}; + +struct node_impl { + bool is_scheduled; + + graph_ptr my_graph; + sycl::event my_event; + + std::vector my_successors; + std::vector my_predecessors; + + std::function my_body; + + void exec( sycl::queue q ) { + std::vector __deps; + for(auto i:my_predecessors) __deps.push_back(i->get_event()); + my_event = q.submit(wrapper{my_body,__deps}); + } + + void register_successor(node_ptr n) { + my_successors.push_back(n); 
+ n->register_predecessor(node_ptr(this)); + } + + void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } + + sycl::event get_event(void) {return my_event;} + + template + node_impl(graph_ptr g, T cgf) : is_scheduled(false), my_graph(g), my_body(cgf) {} + + // Recursively adding nodes to execution stack: + void topology_sort(std::list& schedule) { + is_scheduled = true; + for(auto i:my_successors) { + if(!i->is_scheduled) i->topology_sort(schedule); + } + schedule.push_front(node_ptr(this)); + } +}; + +struct graph_impl { + std::set my_roots; + std::list my_schedule; + + graph_ptr parent; + + void exec( sycl::queue q ) { + if( my_schedule.empty() ) { + for(auto n : my_roots) { + n->topology_sort(my_schedule); + } + } + for(auto n : my_schedule) n->exec(q); + } + + void exec_and_wait( sycl::queue q ) { + exec(q); + q.wait(); + } + + void add_root(node_ptr n) { + my_roots.insert(n); + for(auto n : my_schedule) n->is_scheduled=false; + my_schedule.clear(); + } + + void remove_root(node_ptr n) { + my_roots.erase(n); + for(auto n : my_schedule) n->is_scheduled=false; + my_schedule.clear(); + } + + graph_impl() {} +}; + +} // namespace detail + +class node; + +class graph; + +class executable_graph; + +struct node { + // TODO: add properties to distinguish between empty, host, device nodes. + detail::node_ptr my_node; + detail::graph_ptr my_graph; + + template + node(detail::graph_ptr g, T cgf) : my_graph(g), my_node(new detail::node_impl(g,cgf)) {}; + void register_successor(node n) { my_node->register_successor(n.my_node); } + void exec( sycl::queue q, sycl::event = sycl::event() ) { my_node->exec(q); } + + void set_root() { my_graph->add_root(my_node);} + + // TODO: Add query functions: is_root, ... 
+}; + +class executable_graph { +public: + int my_tag; + sycl::queue my_queue; + + void exec_and_wait();// { my_queue.wait(); } + + executable_graph(detail::graph_ptr g, sycl::queue q) : my_queue(q), my_tag(rand()) { + g->exec(my_queue); + } +}; + +class graph { +public: + // Adding empty node with [0..n] predecessors: + node add_empty_node(const std::vector& dep = {}); + + // Adding node for host task + template + node add_host_node(T hostTaskCallable, const std::vector& dep = {}); + + // Adding device node: + template + node add_device_node(T cgf, const std::vector& dep = {}); + + // Adding dependency between two nodes. + void make_edge(node sender, node receiver); + + // TODO: Extend queue to directly submit graph + void exec_and_wait( sycl::queue q ); + + executable_graph exec( sycl::queue q ) { return executable_graph{my_graph,q};}; + + graph() : my_graph(new detail::graph_impl()) {} + + // Creating a subgraph (with predecessors) + graph(graph& parent, const std::vector& dep = {}) {} + + bool is_subgraph(); + +private: + detail::graph_ptr my_graph; +}; + +void executable_graph::exec_and_wait() { my_queue.wait(); } + +template +node graph::add_device_node(T cgf , const std::vector& dep) { + node _node(my_graph,cgf); + if( !dep.empty() ) { + for(auto n : dep) this->make_edge(n,_node); + } else { + _node.set_root(); + } + return _node; +} + +void graph::make_edge(node sender, node receiver) { + sender.register_successor(receiver);//register successor + my_graph->remove_root(receiver.my_node); //remove receiver from root node list +} + +void graph::exec_and_wait( sycl::queue q ) { + my_graph->exec_and_wait(q); +}; + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) + From a786fee9bc820d77464a70bacfce68496260ac07 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 18 Feb 2022 15:15:10 -0600 Subject: [PATCH 02/82] Adding initial sycl graph doc --- .../SYCL_EXT_ONEAPI_GRAPH.asciidoc | 290 
++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc new file mode 100644 index 0000000000000..3bb7051730b7d --- /dev/null +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -0,0 +1,290 @@ += SYCL_EXT_ONEAPI_GRAPH +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en + +:blank: pass:[ +] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + +== Notice + +Copyright (c) 2022 Intel Corporation. All rights reserved. + +IMPORTANT: This specification is a draft. + +NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are +trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. +used by permission by Khronos. + +NOTE: This document is better viewed when rendered as html with asciidoctor. +GitHub does not render image icons. + +This extension is written against the SYCL 2020 revision 4 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +NOTE: This extension is experimental: interfaces are subject to change later. + +== Introduction + +This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating +Its definition and execution. + +== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification section 6.3.3 "Feature test macros". 
Therefore, an +implementation supporting this extension must predefine the macro +`SYCL_EXT_ONEAPI_GRAPH` to one of the values defined in the table below. +Applications can test for the existence of this macro to determine if the +implementation supports this feature, or applications can test the macro's +value to determine which of the extension's APIs the implementation supports. + +Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. +[%header,cols="1,5"] +|=== +|Value |Description +|1 |Initial extension version. Base features are supported. +|=== + +== SYCL Graph Terminology + +Table 2. Terminology. +|=== +|Concept|Description +|graph| Class that stores structured work units and their dependencies +|node| The unit of work. Can have different attributes. +|edge| Dependency between work units. Happens before relation. +|=== + +== Node + +Node is a class that can encapsulate SYCL kernel functions or host tasks for deferred execution. +A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + class node{ + }; +} +---- + +NOTE: + +== Edge + +A dependency between two nodes representing a happens before relationship. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + // Adding dependency between two nodes. + void make_edge(node sender, node receiver); +} +---- + +== Graph + +Graph is a class that represents a directed acyclic graph of nodes. +A graph can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. +Member functions as listed in Table 2 and 3 can be used to add nodes to a graph. 
+ +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + class graph { + }; + +} +---- + +=== Executable Graph + +`executable_graph` represents a user generated device and context specific execution object that can be submitted to a queue for execution. +The structure of an `executable_graph` object, such as adding nodes or edges, can not be changed. +Each `executable_graph` object can only be executed once at the same time on its assigned queue. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + class executable_graph { + }; + +} +---- + + +Table 3. Constructors of the `graph` class. +|=== +|Constructor|Description + +|`graph()` +|Creates a `graph` object + +|`graph(graph& parent)` +|Creates a nested `graph` object + +|=== + +Table 4. Member functions of the `graph` class. +|=== +|Member function|Description + +|`node add_empty_node(const std::vector& dep = {});` +|This node holds no task that is scheduled for execution. It's intended use is a synchronization point inside a graph, this node can significantly reduce the number of edges ( O(n) vs. O(n^2) ) . + +|`template + node add_host_node(T hostTaskCallable, const std::vector& dep = {});` +|This node captures a host task, a native C++ callable which is scheduled by the SYCL runtime. + +|`template + node add_device_node(T cgf, const std::vector& dep = {});` +|This node captures a SYCL function for invoking kernels, with all restrictions that apply as described in the spec. + +|`template + executable_graph make_executable(const queue& syclQueue);` +|Returns a queue specific graph object that can be submitted to a queue. + +|`template + executable_graph make_executable(const device& syclDevice, const context& syclContext);` +|Returns a device and context specific graph object that can be submitted to a queue. + +|=== + +Table 5. Member functions of the `graph` class (memory operations). 
+|=== +|Member function|Description + +|`node add_memcpy_node(void* dest, const void* src, size_t numBytes, const std::vector& dep = {});` +|Adding a node that encapsulates a `memcpy` operation. + +|`node add_memset_node(void* ptr, int value, size_t numBytes, const std::vector& dep = {});` +|Adding a node that encapsulates a `memset` operation. + +|`node add_malloc_node(void *data, size_t numBytes, usm::alloc kind, const std::vector& dep = {});` +|Adding a node that encapsulates a `malloc` operation. + +|`node add_free_node(void *data, const std::vector& dep = {});` +|Adding a node that encapsulates a `free` operation. + +|=== + + +== Examples + +1. Dot product + +[source,c++] +---- +... + +#include + +int main() { + const size_t n = 10; + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + +#ifndef POC_IMPL + sycl::queue q; +#else + sycl::property_list p{sycl::ext::oneapi::property::queue::lazy_execution{}}; + sycl::queue q{p}; +#endif + + sycl::ext::oneapi::experimental::graph g; + + float *x = sycl::malloc_shared(n, q); + float *y = sycl::malloc_shared(n, q); + float *z = sycl::malloc_shared(n, q); + + float *dotp = sycl::malloc_shared(1, q); + + for (int i = 0; i < n; i++) { + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + } + + auto node_a = g.add_device_node([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; + }); + }); + + auto node_b = g.add_device_node([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; + }); + }); + + auto node_c = g.add_device_node( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotp, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); + }, + {node_a, node_b}); + + auto exec = g.make_exec(q); + +#ifndef POC_IMPL + q.submit(exec).wait(); +#else + 
exec.exec_and_wait(); +#endif + + sycl::free(x, q); + sycl::free(y, q); + sycl::free(z, q); + sycl::free(dotp, q); + + return 0; +} + + +... +---- + +== Issues for later investigations + +. Explicit memory movement can cause POC to stall. + +== Non-implemented features +Please, note that the following features are not yet implemented: + +. Level Zero backend only +. Memory operation nodes not implemented +. Host node not implemented +. Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. + +== Revision History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2022-02-11|Pablo Reble|Initial public working draft +|======================================== From a6c9b113a9a1af71fa6daeb2d7d8d70e2beac894 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 11 Mar 2022 16:42:43 +0100 Subject: [PATCH 03/82] update extension proposal started to incorporate feedback --- .../SYCL_EXT_ONEAPI_GRAPH.asciidoc | 92 ++++++++----------- 1 file changed, 38 insertions(+), 54 deletions(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index 3bb7051730b7d..efe81d24b767d 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -69,15 +69,15 @@ Table 2. Terminology. == Node -Node is a class that can encapsulate SYCL kernel functions or host tasks for deferred execution. +Node is a class that encapsulates tasks like SYCL kernel functions or host tasks for deferred execution. A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. 
[source,c++] ---- namespace sycl::ext::oneapi::experimental { - class node{ - }; + class node{ + }; } ---- @@ -85,7 +85,7 @@ NOTE: == Edge -A dependency between two nodes representing a happens before relationship. +A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be accociated to different graphs. [source,c++] ---- @@ -99,45 +99,46 @@ namespace sycl::ext::oneapi::experimental { == Graph Graph is a class that represents a directed acyclic graph of nodes. -A graph can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. +A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed. Member functions as listed in Table 2 and 3 can be used to add nodes to a graph. [source,c++] ---- namespace sycl::ext::oneapi::experimental { - class graph { + enum class graph_state{ + modifiable, + executable }; + template + class graph { + public: + operator graph(); + }; + + graph make_graph(); + + graph compile(const graph Graph); + } ----- -=== Executable Graph +sycl::event sycl::queue(const graph Graph); -`executable_graph` represents a user generated device and context specific execution object that can be submitted to a queue for execution. -The structure of an `executable_graph` object, such as adding nodes or edges, can not be changed. -Each `executable_graph` object can only be executed once at the same time on its assigned queue. - -[source,c++] ---- -namespace sycl::ext::oneapi::experimental { - - class executable_graph { - }; -} ----- +=== Executable Graph +A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution. 
+The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node. +Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. -Table 3. Constructors of the `graph` class. +Table 3. Constructor of the `graph` class. |=== |Constructor|Description |`graph()` -|Creates a `graph` object - -|`graph(graph& parent)` -|Creates a nested `graph` object +|Creates a `graph` object. It's default state is `graph_state::modifiable`. |=== @@ -145,24 +146,12 @@ Table 4. Member functions of the `graph` class. |=== |Member function|Description -|`node add_empty_node(const std::vector& dep = {});` -|This node holds no task that is scheduled for execution. It's intended use is a synchronization point inside a graph, this node can significantly reduce the number of edges ( O(n) vs. O(n^2) ) . - -|`template - node add_host_node(T hostTaskCallable, const std::vector& dep = {});` -|This node captures a host task, a native C++ callable which is scheduled by the SYCL runtime. +|`node add_node(const std::vector& dep = {});` +|This creates an empty node which is associated to no task. It's intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. |`template - node add_device_node(T cgf, const std::vector& dep = {});` -|This node captures a SYCL function for invoking kernels, with all restrictions that apply as described in the spec. - -|`template - executable_graph make_executable(const queue& syclQueue);` -|Returns a queue specific graph object that can be submitted to a queue. - -|`template - executable_graph make_executable(const device& syclDevice, const context& syclContext);` -|Returns a device and context specific graph object that can be submitted to a queue. 
+ node add_node(T cgf, const std::vector& dep = {});` +|This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec. |=== @@ -187,6 +176,8 @@ Table 5. Member functions of the `graph` class (memory operations). == Examples +NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document. + 1. Dot product [source,c++] @@ -201,14 +192,9 @@ int main() { float beta = 2.0f; float gamma = 3.0f; -#ifndef POC_IMPL sycl::queue q; -#else - sycl::property_list p{sycl::ext::oneapi::property::queue::lazy_execution{}}; - sycl::queue q{p}; -#endif - sycl::ext::oneapi::experimental::graph g; + auto g = sycl::ext::oneapi::experimental::make_graph(); float *x = sycl::malloc_shared(n, q); float *y = sycl::malloc_shared(n, q); @@ -222,21 +208,21 @@ int main() { z[i] = 3.0f; } - auto node_a = g.add_device_node([&](sycl::handler &h) { + auto node_a = g.add_node([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }); - auto node_b = g.add_device_node([&](sycl::handler &h) { + auto node_b = g.add_node([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); }); - auto node_c = g.add_device_node( + auto node_c = g.add_node( [&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), @@ -247,13 +233,9 @@ int main() { }, {node_a, node_b}); - auto exec = g.make_exec(q); + auto exec = compile(q); -#ifndef POC_IMPL q.submit(exec).wait(); -#else - exec.exec_and_wait(); -#endif sycl::free(x, q); sycl::free(y, q); @@ 
-278,6 +260,7 @@ Please, note that the following features are not yet implemented: . Memory operation nodes not implemented . Host node not implemented . Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. +. `class graph` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead. == Revision History @@ -287,4 +270,5 @@ Please, note that the following features are not yet implemented: |======================================== |Rev|Date|Author|Changes |1|2022-02-11|Pablo Reble|Initial public working draft +|2|2022-03-11|Pablo Reble|Incorporate feedback from PR |======================================== From da316f5912f3d772b882ce988c3ed5315f8a5b96 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 11 Mar 2022 20:47:16 +0100 Subject: [PATCH 04/82] typo --- sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index efe81d24b767d..28e1d78f5de1b 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -85,7 +85,7 @@ NOTE: == Edge -A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be accociated to different graphs. +A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be associated to different graphs. 
[source,c++] ---- From 719e3ec4de90ba4a1597ce3c58a77737c3cc4cbd Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 2 May 2022 21:06:42 -0500 Subject: [PATCH 05/82] fix typos and syntax issues --- .../experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index 28e1d78f5de1b..4b0a5ea805d35 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -38,7 +38,7 @@ NOTE: This extension is experimental: interfaces are subject to change later. == Introduction This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating -Its definition and execution. +its definition and execution. == Feature test macro @@ -64,7 +64,7 @@ Table 2. Terminology. |Concept|Description |graph| Class that stores structured work units and their dependencies |node| The unit of work. Can have different attributes. -|edge| Dependency between work units. Happens before relation. +|edge| Dependency between work units. Happens-before relation. |=== == Node @@ -85,7 +85,7 @@ NOTE: == Edge -A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be associated to different graphs. +A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs. 
[source,c++] ---- @@ -119,7 +119,7 @@ namespace sycl::ext::oneapi::experimental { graph make_graph(); - graph compile(const graph Graph); + graph compile(const graph Graph); } @@ -130,7 +130,7 @@ sycl::event sycl::queue(const graph Graph); === Executable Graph A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution. -The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node. +The structure of such a `graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node. Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. Table 3. Constructor of the `graph` class. From cb9d49b95c7eb1d09d4e11e4616227117302f63b Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 11 Aug 2022 16:26:35 -0500 Subject: [PATCH 06/82] Revert "Inital version of sycl graph prototype" This reverts commit ef24789e28ee58d302e6475579ee90d3bf765763. --- .../sycl/ext/oneapi/experimental/graph.hpp | 212 ------------------ 1 file changed, 212 deletions(-) delete mode 100644 sycl/include/sycl/ext/oneapi/experimental/graph.hpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp deleted file mode 100644 index 08a7d094e9054..0000000000000 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ /dev/null @@ -1,212 +0,0 @@ -//==--------- graph.hpp --- SYCL graph extension ---------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#include -#include - -__SYCL_INLINE_NAMESPACE(cl) { -namespace sycl { -namespace ext { -namespace oneapi { -namespace experimental { -namespace detail { - -struct node_impl; - -struct graph_impl; - -using node_ptr = std::shared_ptr; - -using graph_ptr = std::shared_ptr; - -class wrapper { - using T = std::function; - T my_func; - std::vector my_deps; -public: - wrapper(T t, const std::vector& deps) : my_func(t), my_deps(deps) {}; - - void operator()(sycl::handler& cgh) { - cgh.depends_on(my_deps); - std::invoke(my_func,cgh); - } -}; - -struct node_impl { - bool is_scheduled; - - graph_ptr my_graph; - sycl::event my_event; - - std::vector my_successors; - std::vector my_predecessors; - - std::function my_body; - - void exec( sycl::queue q ) { - std::vector __deps; - for(auto i:my_predecessors) __deps.push_back(i->get_event()); - my_event = q.submit(wrapper{my_body,__deps}); - } - - void register_successor(node_ptr n) { - my_successors.push_back(n); - n->register_predecessor(node_ptr(this)); - } - - void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } - - sycl::event get_event(void) {return my_event;} - - template - node_impl(graph_ptr g, T cgf) : is_scheduled(false), my_graph(g), my_body(cgf) {} - - // Recursively adding nodes to execution stack: - void topology_sort(std::list& schedule) { - is_scheduled = true; - for(auto i:my_successors) { - if(!i->is_scheduled) i->topology_sort(schedule); - } - schedule.push_front(node_ptr(this)); - } -}; - -struct graph_impl { - std::set my_roots; - std::list my_schedule; - - graph_ptr parent; - - void exec( sycl::queue q ) { - if( my_schedule.empty() ) { - for(auto n : my_roots) { - n->topology_sort(my_schedule); - } - } - for(auto n : my_schedule) n->exec(q); - } - - void exec_and_wait( sycl::queue q ) { - exec(q); - q.wait(); - } - - 
void add_root(node_ptr n) { - my_roots.insert(n); - for(auto n : my_schedule) n->is_scheduled=false; - my_schedule.clear(); - } - - void remove_root(node_ptr n) { - my_roots.erase(n); - for(auto n : my_schedule) n->is_scheduled=false; - my_schedule.clear(); - } - - graph_impl() {} -}; - -} // namespace detail - -class node; - -class graph; - -class executable_graph; - -struct node { - // TODO: add properties to distinguish between empty, host, device nodes. - detail::node_ptr my_node; - detail::graph_ptr my_graph; - - template - node(detail::graph_ptr g, T cgf) : my_graph(g), my_node(new detail::node_impl(g,cgf)) {}; - void register_successor(node n) { my_node->register_successor(n.my_node); } - void exec( sycl::queue q, sycl::event = sycl::event() ) { my_node->exec(q); } - - void set_root() { my_graph->add_root(my_node);} - - // TODO: Add query functions: is_root, ... -}; - -class executable_graph { -public: - int my_tag; - sycl::queue my_queue; - - void exec_and_wait();// { my_queue.wait(); } - - executable_graph(detail::graph_ptr g, sycl::queue q) : my_queue(q), my_tag(rand()) { - g->exec(my_queue); - } -}; - -class graph { -public: - // Adding empty node with [0..n] predecessors: - node add_empty_node(const std::vector& dep = {}); - - // Adding node for host task - template - node add_host_node(T hostTaskCallable, const std::vector& dep = {}); - - // Adding device node: - template - node add_device_node(T cgf, const std::vector& dep = {}); - - // Adding dependency between two nodes. 
- void make_edge(node sender, node receiver); - - // TODO: Extend queue to directly submit graph - void exec_and_wait( sycl::queue q ); - - executable_graph exec( sycl::queue q ) { return executable_graph{my_graph,q};}; - - graph() : my_graph(new detail::graph_impl()) {} - - // Creating a subgraph (with predecessors) - graph(graph& parent, const std::vector& dep = {}) {} - - bool is_subgraph(); - -private: - detail::graph_ptr my_graph; -}; - -void executable_graph::exec_and_wait() { my_queue.wait(); } - -template -node graph::add_device_node(T cgf , const std::vector& dep) { - node _node(my_graph,cgf); - if( !dep.empty() ) { - for(auto n : dep) this->make_edge(n,_node); - } else { - _node.set_root(); - } - return _node; -} - -void graph::make_edge(node sender, node receiver) { - sender.register_successor(receiver);//register successor - my_graph->remove_root(receiver.my_node); //remove receiver from root node list -} - -void graph::exec_and_wait( sycl::queue q ) { - my_graph->exec_and_wait(q); -}; - -} // namespace experimental -} // namespace oneapi -} // namespace ext -} // namespace sycl -} // __SYCL_INLINE_NAMESPACE(cl) - From 86436c356ff7149d98a3a428d5b9ec364d1c1bcd Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 11 Aug 2022 16:29:47 -0500 Subject: [PATCH 07/82] move extension doc to proposed folder --- .../sycl_ext_oneapi_graph.asciidoc} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sycl/doc/extensions/{experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc => proposed/sycl_ext_oneapi_graph.asciidoc} (100%) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc similarity index 100% rename from sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc rename to sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc From d410644deca63b7ecc14cbf5193684f798bae694 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 11 Aug 2022 16:33:23 -0500 Subject: [PATCH 
08/82] Rework formatting and introducing USM shortcuts --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 227 ++++++++++++++++-- 1 file changed, 204 insertions(+), 23 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 4b0a5ea805d35..15dfa73d03b44 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -60,6 +60,7 @@ Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. == SYCL Graph Terminology Table 2. Terminology. +[%header,cols="1,3"] |=== |Concept|Description |graph| Class that stores structured work units and their dependencies @@ -81,8 +82,6 @@ namespace sycl::ext::oneapi::experimental { } ---- -NOTE: - == Edge A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs. @@ -100,7 +99,7 @@ namespace sycl::ext::oneapi::experimental { Graph is a class that represents a directed acyclic graph of nodes. A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed. -Member functions as listed in Table 2 and 3 can be used to add nodes to a graph. +Member functions as listed in Table 3 to 6 can be used to add nodes to a graph. [source,c++] ---- @@ -123,7 +122,18 @@ namespace sycl::ext::oneapi::experimental { } -sycl::event sycl::queue(const graph Graph); +---- + +The following member functions are added to the queue class. 
+ +[source,c++] +---- + +namespace sycl { + +event submit(const ext::oneapi::experimental::graph& my_graph); + +} // namespace sycl ---- @@ -133,46 +143,209 @@ A `graph` object in `graph_state::executable` represents a user generated device The structure of such a `graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node. Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. +=== Graph member and helper functions + Table 3. Constructor of the `graph` class. +[cols="2a,a"] |=== |Constructor|Description -|`graph()` -|Creates a `graph` object. It's default state is `graph_state::modifiable`. +| +[source,c++] +---- +/* available only when graph_state == modifiable */` +graph(); +---- +|Creates a `graph` object. |=== Table 4. Member functions of the `graph` class. +[cols="2a,a"] |=== |Member function|Description -|`node add_node(const std::vector& dep = {});` -|This creates an empty node which is associated to no task. It's intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. +| +[source,c++] +---- +node add_node(const std::vector& dep = {}); +---- +|This creates an empty node which is associated to no task. Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. 
-|`template - node add_node(T cgf, const std::vector& dep = {});` +| +[source,c++] +---- +template + node add_node(T cgf, const std::vector& dep = {}); +---- |This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec. |=== +Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed. + Table 5. Member functions of the `graph` class (memory operations). +[cols="2a,a"] |=== |Member function|Description -|`node add_memcpy_node(void* dest, const void* src, size_t numBytes, const std::vector& dep = {});` +| +[source,c++] +---- +node memcpy(void* dest, const void* src, size_t numBytes, const std::vector& dep = {}); +---- |Adding a node that encapsulates a `memcpy` operation. -|`node add_memset_node(void* ptr, int value, size_t numBytes, const std::vector& dep = {});` +| +[source,c++] +---- +template node +copy(const T* src, T* dest, size_t count, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `copy` operation. + +| +[source,c++] +---- +node memset(void* ptr, int value, size_t numBytes, const std::vector& dep = {}); +---- |Adding a node that encapsulates a `memset` operation. -|`node add_malloc_node(void *data, size_t numBytes, usm::alloc kind, const std::vector& dep = {});` +| +[source,c++] +---- +template +node fill(void* ptr, const T& pattern, size_t count, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `fill` operation. + +| +[source,c++] +---- +node malloc(void *data, size_t numBytes, usm::alloc kind, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `malloc` operation. 
+ +| +[source,c++] +---- +node malloc_shared(void *data, size_t numBytes, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `malloc` operation. + +| +[source,c++] +---- +node malloc_host(void *data, size_t numBytes, const std::vector& dep = {}); +---- |Adding a node that encapsulates a `malloc` operation. -|`node add_free_node(void *data, const std::vector& dep = {});` +| +[source,c++] +---- +node malloc_device(void *data, size_t numBytes, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `malloc` operation. + +| +[source,c++] +---- +node free(void *data, const std::vector& dep = {}); +---- |Adding a node that encapsulates a `free` operation. |=== +Table 6. Member functions of the `graph` class (convenience shortcuts). +[cols="2a,a"] +|=== +|Member function|Description + +| +[source,c++] +---- +template +node single_task(const KernelType &kernelFunc, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `single_task` operation. + +| +[source,c++] +---- +template +node parallel_for(range numWorkItems, Rest&& rest, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `parallel_for` operation. + +| +[source,c++] +---- +template +node parallel_for(nd_range executionRange, Rest&& rest, const std::vector& dep = {}); +---- +|Adding a node that encapsulates a `parallel_for` operation. + +|=== + +Table 7. Helper functions of the `graph` class. +[cols="a,a"] +|=== +|Function name|Description + +| +[source,c++] +---- +graph make_graph(); +---- +|Creates a `graph` object. It's state is `graph_state::modifiable`. + +|=== + +=== Node member functions + +Table 8. Constructor of the `node` class. +[cols="a,a"] +|=== +|Constructor|Description + +| +[source,c++] +---- +node(); +---- +|Creates an empty `node` object. That encapsulates no tasks and is not assigned to a graph. Prior to execution it has to be assigned to a graph. + +|=== + +Table 9. Member functions of the `node` class. 
+[cols="2a,a"] +|=== +|Function name|Description + +| +[source,c++] +---- +void set_graph(graph& Graph); +---- +|Assigns a `node` object to a `graph`. + +| +[source,c++] +---- +template +void update(T cgf); +---- +|Update a `node` object. + +| +[source,c++] +---- +template +void update(T cgf, graph& Graph); +---- +|Update a `node` object and assign it to a task. + +|=== == Examples @@ -196,31 +369,35 @@ int main() { auto g = sycl::ext::oneapi::experimental::make_graph(); - float *x = sycl::malloc_shared(n, q); - float *y = sycl::malloc_shared(n, q); - float *z = sycl::malloc_shared(n, q); + float *x , *y, *z; + + auto n_x = g.malloc_shared(x, n, q); + auto n_y = g.malloc_shared(y, n, q); + auto n_z = g.malloc_shared(z, n, q); float *dotp = sycl::malloc_shared(1, q); - for (int i = 0; i < n; i++) { + /* init data by using usm shortcut */ + auto n_i = g.parallel_for(n, [=](sycl::id<1> it){ + const size_t i = it[0]; x[i] = 1.0f; y[i] = 2.0f; z[i] = 3.0f; - } + }, {n_x, n_y, n_z}); auto node_a = g.add_node([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); - }); + }, {n_i}); auto node_b = g.add_node([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); - }); + }, {n_i}); auto node_c = g.add_node( [&](sycl::handler &h) { @@ -232,13 +409,15 @@ int main() { }); }, {node_a, node_b}); + + auto node_f1 = g.free(x, {node_c}); + auto node_f1 = g.free(y, {node_b}); auto exec = compile(q); q.submit(exec).wait(); - sycl::free(x, q); - sycl::free(y, q); + // memory can be freed inside or outside the graph sycl::free(z, q); sycl::free(dotp, q); @@ -271,4 +450,6 @@ Please, note that the following features are not yet implemented: |Rev|Date|Author|Changes |1|2022-02-11|Pablo Reble|Initial public working draft |2|2022-03-11|Pablo Reble|Incorporate feedback from PR +|3|2022-05-25|Pablo Reble|Extend API and 
Example +|4|2022-08-10|Pablo Reble|Adding USM shortcuts |======================================== From 920775416030c6512165014275fbaa525ee45533 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 12 Aug 2022 10:26:11 -0500 Subject: [PATCH 09/82] Update sycl_ext_oneapi_graph.asciidoc Change status to proposal --- .../extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 15dfa73d03b44..a9ab7837e7ceb 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -33,7 +33,13 @@ This extension is written against the SYCL 2020 revision 4 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision. -NOTE: This extension is experimental: interfaces are subject to change later. +== Status + + This is a proposed extension specification, intended to gather community + feedback. Interfaces defined in this specification may not be implemented yet + or may be in a preliminary state. The specification itself may also change in + incompatible ways before it is finalized. 
*Shipping software products should + not rely on APIs defined in this specification.* == Introduction From 01e06f6a1c18a6dceeb55f38a7e64196d39b21c8 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 12 Aug 2022 10:26:52 -0500 Subject: [PATCH 10/82] Update sycl_ext_oneapi_graph.asciidoc --- .../extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a9ab7837e7ceb..d199348020109 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -35,11 +35,11 @@ SYCL specification refer to that revision. == Status - This is a proposed extension specification, intended to gather community - feedback. Interfaces defined in this specification may not be implemented yet - or may be in a preliminary state. The specification itself may also change in - incompatible ways before it is finalized. *Shipping software products should - not rely on APIs defined in this specification.* +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. 
*Shipping software products should +not rely on APIs defined in this specification.* == Introduction From cc1f315ebce2181a39f897482eb14ae8e7a3a59d Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 12 Aug 2022 12:07:43 -0500 Subject: [PATCH 11/82] Remove implementation specifics from doc --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index d199348020109..d68a085fd0d31 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -355,7 +355,7 @@ void update(T cgf, graph& Graph); == Examples -NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document. +// NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document. 1. Dot product @@ -434,18 +434,18 @@ int main() { ... ---- -== Issues for later investigations - -. Explicit memory movement can cause POC to stall. - -== Non-implemented features -Please, note that the following features are not yet implemented: - -. Level Zero backend only -. Memory operation nodes not implemented -. Host node not implemented -. Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. -. 
`class graph` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead. +// == Issues for later investigations +// +// . Explicit memory movement can cause POC to stall. +// +// == Non-implemented features +// Please, note that the following features are not yet implemented: +// +// . Level Zero backend only +// . Memory operation nodes not implemented +// . Host node not implemented +// . Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. +// . `class graph` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead. == Revision History From fbccd9ace84e83f6041610064c1752030848300c Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 22 Aug 2022 14:38:40 -0500 Subject: [PATCH 12/82] Remove obsolete note Co-authored-by: John Pennycook --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index d68a085fd0d31..1cee476960463 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -26,8 +26,6 @@ NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. -NOTE: This document is better viewed when rendered as html with asciidoctor. -GitHub does not render image icons. This extension is written against the SYCL 2020 revision 4 specification. 
All references below to the "core SYCL specification" or to section numbers in the From e5435e61673d0a8594c1fb38bab988559b0326fe Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 22 Aug 2022 15:01:18 -0500 Subject: [PATCH 13/82] wording Co-authored-by: John Pennycook --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 1cee476960463..b422053bfc553 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -301,7 +301,7 @@ Table 7. Helper functions of the `graph` class. ---- graph make_graph(); ---- -|Creates a `graph` object. It's state is `graph_state::modifiable`. +|Creates a `graph` object in the `graph_state::modifiable` state. |=== From 54f4f9d5163be2a95d55130a44461f05e994fa00 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 23 Aug 2022 15:00:42 -0500 Subject: [PATCH 14/82] bump target spec rev Co-authored-by: Ronan Keryell --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index b422053bfc553..b350f5a79584c 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -27,7 +27,7 @@ trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. -This extension is written against the SYCL 2020 revision 4 specification. All +This extension is written against the SYCL 2020 revision 5 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision. 
From b5779bb7bea189dd28e4c95e6f2f611d04934d45 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 24 Aug 2022 16:08:45 -0500 Subject: [PATCH 15/82] using constructor for command graph creation (I/II) Co-authored-by: Ronan Keryell --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index b350f5a79584c..3b5aecea3d9b1 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -371,7 +371,7 @@ int main() { sycl::queue q; - auto g = sycl::ext::oneapi::experimental::make_graph(); + sycl::ext::oneapi::experimental::command_graph g; float *x , *y, *z; From 1eb97e28aff8c12de9a06ffbeab0564447ac3b7e Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 24 Aug 2022 16:27:47 -0500 Subject: [PATCH 16/82] using constructor for command graph creation (II/II) --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 62 ++++++++++--------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 3b5aecea3d9b1..98ccc332ecb1e 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -114,15 +114,19 @@ namespace sycl::ext::oneapi::experimental { executable }; - template - class graph { + template + class command_graph { public: - operator graph(); + operator command_graph(); }; - graph make_graph(); + template<> + class command_graph{ + public: + command_graph() = delete; + }; - graph compile(const graph Graph); + command_graph compile(const command_graph Graph); } @@ -135,7 +139,7 @@ The following member functions are added to the queue class. 
namespace sycl { -event submit(const ext::oneapi::experimental::graph& my_graph); +event submit(const ext::oneapi::experimental::command_graph& my_graph); } // namespace sycl @@ -143,13 +147,13 @@ event submit(const ext::oneapi::experimental::graph Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed. -Table 5. Member functions of the `graph` class (memory operations). +Table 5. Member functions of the `command_graph` class (memory operations). [cols="2a,a"] |=== |Member function|Description @@ -260,7 +264,7 @@ node free(void *data, const std::vector& dep = {}); |=== -Table 6. Member functions of the `graph` class (convenience shortcuts). +Table 6. Member functions of the `command_graph` class (convenience shortcuts). [cols="2a,a"] |=== |Member function|Description @@ -289,20 +293,20 @@ node parallel_for(nd_range executionRange, Rest&& rest, const std::vector< ---- |Adding a node that encapsulates a `parallel_for` operation. -|=== - -Table 7. Helper functions of the `graph` class. -[cols="a,a"] -|=== -|Function name|Description - -| -[source,c++] ----- -graph make_graph(); ----- -|Creates a `graph` object in the `graph_state::modifiable` state. - +// |=== +// +// Table 7. Helper functions of the `graph` class. +// [cols="a,a"] +// |=== +// |Function name|Description +// +// | +// [source,c++] +// ---- +// graph make_graph(); +// ---- +// |Creates a `graph` object in the `graph_state::modifiable` state. + |=== === Node member functions @@ -329,7 +333,7 @@ Table 9. Member functions of the `node` class. | [source,c++] ---- -void set_graph(graph& Graph); +void set_graph(command_graph& Graph); ---- |Assigns a `node` object to a `graph`. 
From e07d7f8c29691cb01327fe6793cfec7673a34b5e Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 24 Aug 2022 21:45:35 -0500 Subject: [PATCH 17/82] typo Co-authored-by: Ronan Keryell --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 98ccc332ecb1e..f0a5db814ced1 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -421,7 +421,7 @@ int main() { auto node_f1 = g.free(x, {node_c}); auto node_f1 = g.free(y, {node_b}); - auto exec = compile(q); + auto exec = g.compile(q); q.submit(exec).wait(); From e9a0a192091d79ad8c884216ce762dfda011f619 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 29 Sep 2022 11:04:00 -0500 Subject: [PATCH 18/82] Update sycl_ext_oneapi_graph.asciidoc Renaming functions, update example and removing USM shortcuts --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 199 +++--------------- 1 file changed, 32 insertions(+), 167 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index f0a5db814ced1..a9a9ce4251244 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -126,8 +126,6 @@ namespace sycl::ext::oneapi::experimental { command_graph() = delete; }; - command_graph compile(const command_graph Graph); - } ---- @@ -176,7 +174,7 @@ Table 4. Member functions of the `command_graph` class. | [source,c++] ---- -node add_node(const std::vector& dep = {}); +node add(const std::vector& dep = {}); ---- |This creates an empty node which is associated to no task. 
Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. @@ -184,174 +182,39 @@ node add_node(const std::vector& dep = {}); [source,c++] ---- template - node add_node(T cgf, const std::vector& dep = {}); + node add(T cgf, const std::vector& dep = {}); ---- -|This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec. - -|=== - -Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed. - -Table 5. Member functions of the `command_graph` class (memory operations). -[cols="2a,a"] -|=== -|Member function|Description +|This function adds a command group function object to a graph. The function object can contain single or multiple commands such as a host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec. | [source,c++] ---- -node memcpy(void* dest, const void* src, size_t numBytes, const std::vector& dep = {}); +command_graph finalize(context &syclContext) const; ---- -|Adding a node that encapsulates a `memcpy` operation. +| This function creates an executable graph object with an immutable topology that can be executed on a queue that matches the given context. -| -[source,c++] ----- -template node -copy(const T* src, T* dest, size_t count, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `copy` operation. 
- -| -[source,c++] ----- -node memset(void* ptr, int value, size_t numBytes, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `memset` operation. - -| -[source,c++] ----- -template -node fill(void* ptr, const T& pattern, size_t count, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `fill` operation. - -| -[source,c++] ----- -node malloc(void *data, size_t numBytes, usm::alloc kind, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `malloc` operation. - -| -[source,c++] ----- -node malloc_shared(void *data, size_t numBytes, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `malloc` operation. - -| -[source,c++] ----- -node malloc_host(void *data, size_t numBytes, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `malloc` operation. - -| -[source,c++] ----- -node malloc_device(void *data, size_t numBytes, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `malloc` operation. - -| -[source,c++] ----- -node free(void *data, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `free` operation. - -|=== - -Table 6. Member functions of the `command_graph` class (convenience shortcuts). -[cols="2a,a"] -|=== -|Member function|Description - -| -[source,c++] ----- -template -node single_task(const KernelType &kernelFunc, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `single_task` operation. - -| -[source,c++] ----- -template -node parallel_for(range numWorkItems, Rest&& rest, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `parallel_for` operation. - -| -[source,c++] ----- -template -node parallel_for(nd_range executionRange, Rest&& rest, const std::vector& dep = {}); ----- -|Adding a node that encapsulates a `parallel_for` operation. - -// |=== -// -// Table 7. Helper functions of the `graph` class. 
-// [cols="a,a"] -// |=== -// |Function name|Description -// -// | -// [source,c++] -// ---- -// graph make_graph(); -// ---- -// |Creates a `graph` object in the `graph_state::modifiable` state. - -|=== - -=== Node member functions - -Table 8. Constructor of the `node` class. -[cols="a,a"] |=== -|Constructor|Description - -| -[source,c++] ----- -node(); ----- -|Creates an empty `node` object. That encapsulates no tasks and is not assigned to a graph. Prior to execution it has to be assigned to a graph. -|=== +Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed. -Table 9. Member functions of the `node` class. +Table 5. Member functions of the `command_graph` class (memory operations). [cols="2a,a"] |=== -|Function name|Description - -| -[source,c++] ----- -void set_graph(command_graph& Graph); ----- -|Assigns a `node` object to a `graph`. +|Member function|Description | [source,c++] ---- -template -void update(T cgf); +node add_malloc_device(void *&data, size_t numBytes, const std::vector& dep = {}); ---- -|Update a `node` object. +|Adding a node that encapsulates a `malloc` operation. | [source,c++] ---- -template -void update(T cgf, graph& Graph); +node add_free(void *data, const std::vector& dep = {}); ---- -|Update a `node` object and assign it to a task. +|Adding a node that encapsulates a `free` operation. 
|=== @@ -378,36 +241,38 @@ int main() { sycl::ext::oneapi::experimental::command_graph g; float *x , *y, *z; - - auto n_x = g.malloc_shared(x, n, q); - auto n_y = g.malloc_shared(y, n, q); - auto n_z = g.malloc_shared(z, n, q); - + float *dotp = sycl::malloc_shared(1, q); - /* init data by using usm shortcut */ - auto n_i = g.parallel_for(n, [=](sycl::id<1> it){ - const size_t i = it[0]; - x[i] = 1.0f; - y[i] = 2.0f; - z[i] = 3.0f; + auto n_x = g.add_malloc_device(x, n); + auto n_y = g.add_malloc_device(y, n); + auto n_z = g.add_malloc_device(z, n); + + /* init data on the device */ + auto n_i = g.add([&](sycl::handler &h) { + h.parallel_for(n, [=](sycl::id<1> it){ + const size_t i = it[0]; + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + }); }, {n_x, n_y, n_z}); - auto node_a = g.add_node([&](sycl::handler &h) { + auto node_a = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }, {n_i}); - auto node_b = g.add_node([&](sycl::handler &h) { + auto node_b = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); }, {n_i}); - auto node_c = g.add_node( + auto node_c = g.add( [&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), @@ -418,15 +283,15 @@ int main() { }, {node_a, node_b}); - auto node_f1 = g.free(x, {node_c}); - auto node_f1 = g.free(y, {node_b}); + auto node_f1 = g.add_free(x, {node_c}); + auto node_f2 = g.add_free(y, {node_b}); - auto exec = g.compile(q); + auto exec = g.finalize(q.get_context()); q.submit(exec).wait(); // memory can be freed inside or outside the graph - sycl::free(z, q); + sycl::free(z, q.get_context()); sycl::free(dotp, q); return 0; From 2984a97075eb4ac161fe8666210297b120b68e75 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 29 Sep 2022 16:11:06 -0500 Subject: [PATCH 19/82] Update 
sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc Co-authored-by: Ronan Keryell --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a9a9ce4251244..fb493b8d70222 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -67,7 +67,7 @@ Table 2. Terminology. [%header,cols="1,3"] |=== |Concept|Description -|graph| Class that stores structured work units and their dependencies +|graph| Class that stores structured work units and their dependencies. |node| The unit of work. Can have different attributes. |edge| Dependency between work units. Happens-before relation. |=== From 1fa2add55802da899e790286eb3f4afc5ffa59cf Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 29 Sep 2022 16:32:24 -0500 Subject: [PATCH 20/82] Update sycl_ext_oneapi_graph.asciidoc --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index fb493b8d70222..34355dde2caca 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -137,7 +137,7 @@ The following member functions are added to the queue class. namespace sycl { -event submit(const ext::oneapi::experimental::command_graph& my_graph); +event queue::submit(const ext::oneapi::experimental::command_graph& my_graph); } // namespace sycl From a97ace69521c594de90468506ffabc424f57d58d Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 11 Nov 2022 07:59:15 +0000 Subject: [PATCH 21/82] Unify with Codeplay Graph extension. 
(#4) Merge [SYCL_EXT_CODEPLAY_GRAPHS](https://github.com/codeplaysoftware/standards-proposals/pull/135) into SYCL_EXT_ONEAPI_GRAPH. This is a first cut at merging and follow-up changes to reconcile some differences will likely still to be made. --- .../command_graph-state.svg | 4 + .../sycl_ext_oneapi_graph/queue-state.svg | 4 + .../proposed/sycl_ext_oneapi_graph.asciidoc | 924 ++++++++++++++++-- 3 files changed, 831 insertions(+), 101 deletions(-) create mode 100644 sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg create mode 100644 sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg diff --git a/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg b/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg new file mode 100644 index 0000000000000..f3ed6a15a1f7d --- /dev/null +++ b/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg @@ -0,0 +1,4 @@ + + + +
Finalize
Finalize
Modifiable
Modifiable
Executable
Executable
Text is not SVG - cannot display
\ No newline at end of file diff --git a/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg b/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg new file mode 100644 index 0000000000000..d51956d613098 --- /dev/null +++ b/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg @@ -0,0 +1,4 @@ + + + +

Begin Recording

Begin Recording
Executing
Executing
End Recording
End Recording
Recording
Recording
Text is not SVG - cannot display
\ No newline at end of file diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 34355dde2caca..ed8f4f7075662 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -8,6 +8,7 @@ :toc: left :encoding: utf-8 :lang: en +:sectnums: :blank: pass:[ +] @@ -31,6 +32,23 @@ This extension is written against the SYCL 2020 revision 5 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision. +== Contributors + +Pablo Reble, Intel + +Julian Miller, Intel + +John Pennycook, Intel + +Guo Yejun, Intel + +Ewan Crawford, Codeplay + +Ben Tracy, Codeplay + +Duncan McBain, Codeplay + +Peter Žužek, Codeplay + +Ruyman Reyes, Codeplay + +Gordon Brown, Codeplay + +Erik Tomusk, Codeplay + +Bjoern Knafla, Codeplay + +Lukas Sommer, Codeplay + +Ronan Keryell, AMD + + == Status This is a proposed extension specification, intended to gather community @@ -41,10 +59,109 @@ not rely on APIs defined in this specification.* == Introduction -This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating -its definition and execution. - -== Feature test macro +Through the use of command groups SYCL is already able to create a dependency +graph (in the form of a directed acyclic graph) of kernel execution at runtime, +as a command group object defines a set of requisites (edges) which must be +satisfied for kernels (nodes) to be executed. However, because command-group +submission is tied to execution on the queue, without having a prior +construction step before starting execution, optimization opportunities are +missed from the runtime not knowing the complete dependency graph ahead of +execution. 
+ +The following benefits would become possible if the user could define a +dependency graph to the SYCL runtime prior to execution: + +* Reduction in runtime overhead by only submitting a single graph object, rather + than many individual commands. + +* Enable more work to be done offline, in particular producing a graph ahead of + time allows for improved performance at runtime from reduced overhead. + +* Unlock DMA hardware features through graph analysis by the runtime. + +* Whole graph optimizations become available, including but not limited to: +** Kernel fusion/fission. +** Inter-node memory reuse from data staying resident on device. +** Identification of the peak intermediate output memory requirement, used for + more optimal memory allocation. + +As well as benefits to the SYCL runtime, there are also advantages to the user +developing SYCL applications, as repetitive workloads no longer have to +redundantly issue the same sequence of commands. Instead, a graph is only +constructed once and submitted for execution as many times as is necessary, only +changing the data in input buffers or USM allocations. For applications from +specific domains, such as machine learning, where the same command group pattern +is run repeatedly for different inputs, this is particularly useful. + +=== Requirements + +In order to achieve the goals described in previous sections, the following +requirements were considered: + +1. Ability to update inputs/outputs of the graph between submissions, without + changing the overall graph structure. +2. Enable low effort porting of existing applications to use the extension. +3. Profiling, debugging, and tracing functionality at the granularity of graph + nodes. +4. Integrate sub-graphs (previously constructed graphs) when constructing a new + graph. +5. Support the USM model of memory as well as buffer model. +6. Compatible with other SYCL extensions and features, e.g. kernel fusion & + built-in kernels. +7. 
Ability to record a graph with commands submitted to different devices in the + same context. +8. Capability to serialize graphs to a binary format which can then be + de-serialized and executed. This is helpful for offline cases where a graph + can be created by an offline tool to be loaded and run without the end-user + incurring the overheads of graph creation. +9. Backend interoperability, the ability to retrieve a native graph object from + the graph and use that in a native backend API. + +To allow for prototype implementations of this extension to be developed +quickly for evaluation, the scope of this proposal was limited to a subset +of these requirements. In particular, the serialization functionality (8), +backend interoperability (9), and a profiling/debugging interface (3) were +omitted, as these are not easy to abstract over a number of backends without +significant investigation. It is also hoped these features can be exposed as +additive changes to the API, and thus introduced in future versions of the +extension. + +Another reason for deferring a serialize/deserialize API (8) is that its scope +could extend from emitting the graph in a binary format, to emitting a +standardized IR format that enables further device specific graph optimizations. + +Multi-device support (7) is something we are looking into introducing into +the extension, which may result in API changes. + +=== Graph Building Mechanisms + +This extension contains two different API mechanisms for constructing a graph +of commands: + +1. **Explicit graph building API** - Allows users to specify the exact nodes +and edges they want to add to the graph. + +2. **Queue recording API (aka "Record & Replay")** - Introduces state to a +`sycl::queue` such that rather than scheduling commands immediately for +execution, they are added to the graph object instead, with edges based on the +data dependencies of the command group. 
+ +Each of these mechanisms for constructing a graph has its own advantages, so +having both APIs available allows the user to pick the one which is most +suitable for them. The queue recording API allows quicker porting of existing +applications, and can capture external work that is submitted to a queue, for +example via library function calls, while the explicit API can better express +what data is internal to the graph for optimization, and dependencies don't need +to be inferred. + +It is valid to combine these two mechanisms sequentially when constructing a +graph, however it is not valid to use them concurrently. An error will be thrown +if a user attempts to use the explicit API to add a node to a graph which is +being recorded to by a queue. + +== Specification + +=== Feature test macro This extension provides a feature-test macro as described in the core SYCL specification section 6.3.3 "Feature test macros". Therefore, an @@ -61,97 +178,204 @@ Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. |1 |Initial extension version. Base features are supported. |=== -== SYCL Graph Terminology +=== SYCL Graph Terminology + +:explicit-memory-ops: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:explicitmemory Table 2. Terminology. [%header,cols="1,3"] |=== -|Concept|Description -|graph| Class that stores structured work units and their dependencies. -|node| The unit of work. Can have different attributes. -|edge| Dependency between work units. Happens-before relation. +| Concept | Description + +| Graph +| A directed and acyclic graph (DAG) of commands (nodes) and their dependencies +(edges), represented by the `command_graph` class. + +| Node +| A command, which can have different attributes. + +| Edge +| Dependency between commands as a happens-before relationship. + |=== -== Node +==== Explicit Graph Building API -Node is a class that encapsulates tasks like SYCL kernel functions or host tasks for deferred execution. 
-A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. +When using the explicit graph building API to construct a graph, nodes and +edges are captured as follows. -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { +Table 3. Explicit Graph Definition. +[%header,cols="1,3"] +|=== +| Concept | Description - class node{ - }; -} ----- +| Node +| In the explicit graph building API nodes are created by the user invoking +methods on a modifiable graph. Each node represents either a command-group +function, empty operation, or device memory allocation/free. -== Edge +| Edge +| In the explicit graph building API edges are defined by the user. This is +either through buffer accessors, the `make_edge()` function, or by passing +dependent nodes on creation of a new node. +|=== -A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs. +==== Queue Recording API -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { +When using the record & replay API to construct a graph by recording a queue, +nodes and edges are captured as follows. - // Adding dependency between two nodes. - void make_edge(node sender, node receiver); -} +Table 4. Recorded Graph Definition. +[%header,cols="1,3"] +|=== +| Concept | Description + +| Node +| Nodes in a queue recorded graph represent each of the command group +submissions of the program. Each submission encompasses either one or both of +a.) some data movement, b.) a single asynchronous kernel launch. Nodes cannot +define forward edges, only backwards (i.e. kernels can only create dependencies +on things that have already happened). This means that transparently a node can +depend on a previously recorded graph (sub-graph), which works by creating edges +to the individual nodes in the old graph. 
Explicit memory operations without +kernels, such as a memory copy, are still classed as nodes under this +definition, as the {explicit-memory-ops}[SYCL 2020 specification states] that +these can be seen as specialized kernels executing on the device. + +| Edge +| An edge in a queue recorded graph represents a data dependency between two +nodes. Data dependencies can naturally be expressed in user code through buffer +accessors. USM pointers also convey data dependencies, however offsets into +system allocations (`malloc`/`new`) are not supported. +|=== + +=== API Modifications + +[source, c++] +---- +namespace sycl { +namespace ext::oneapi::experimental { + +// State of a queue, returned by info::queue::state +enum class queue_state { + executing, + recording +}; + +class node {}; + +// State of a graph +enum class graph_state { + modifiable, + executable +}; + +// New object representing graph +template +class command_graph {}; + +template<> +class command_graph { +public: + command_graph(const property_list &propList = {}); + command_graph finalize(const context &syclContext) const; + + node add(const std::vector& dep = {}); + + template + node add(T cgf, const std::vector& dep = {}); + + node add_malloc_device(void *&data, size_t numBytes, const std::vector& dep = {}); + node add_free(void *data, const std::vector& dep = {}); + + void make_edge(node sender, node receiver); +}; + +template<> +class command_graph { +public: + command_graph() = delete; + void update(const command_graph &graph); +}; +} // namespace ext::oneapi::experimental + +// New methods added to the sycl::queue class +using namespace ext::oneapi::experimental; +class queue { +public: + bool begin_recording(command_graph &graph); + bool end_recording(); + event submit(command_graph graph); +}; +} // namespace sycl ---- -== Graph +=== Node -Graph is a class that represents a directed acyclic graph of nodes. 
-A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed. -Member functions as listed in Table 3 to 6 can be used to add nodes to a graph. +Node is a class that encapsulates tasks like SYCL kernel functions, device +memory allocations/frees, or host tasks for deferred execution. A graph has to +be created first, the structure of a graph is defined second by adding nodes and +edges. [source,c++] ---- namespace sycl::ext::oneapi::experimental { - - enum class graph_state{ - modifiable, - executable - }; - - template - class command_graph { - public: - operator command_graph(); - }; - - template<> - class command_graph{ - public: - command_graph() = delete; - }; - + class node {}; } - ---- -The following member functions are added to the queue class. +=== Graph -[source,c++] ----- +:crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics -namespace sycl { +This extension adds a new `command_graph` object which follows the +{crs}[common reference semantics] of other SYCL runtime objects. -event queue::submit(const ext::oneapi::experimental::command_graph& my_graph); +A `command_graph` represents a directed acyclic graph of nodes, where each node +represents a single command or a sub-graph. The execution of a graph completes +when all of its nodes have completed. -} // namespace sycl +A `command_graph` is built up by either recording queue submissions or +explicitly adding nodes, then once the user is happy that the graph is complete, +the graph instance is finalized into an executable variant which can have no +more nodes added to it. Finalization may be a computationally expensive +operation as the runtime is able to perform optimizations based on the graph +structure. 
After finalization the graph can be submitted for execution on a +queue one or more times with reduced overhead. ----- +==== Graph State + +An instance of a `command_graph` object can be in one of two states: + +* **Modifiable** - Graph is under construction and new nodes may be added to it. +* **Executable** - Graph topology is fixed after finalization and graph is ready to + be submitted for execution. + +A `command_graph` object is constructed in the _modifiable_ state and is made +_executable_ by the user invoking `command_graph::finalize()` to create a +new executable instance of the graph. An executable graph cannot be converted +to a modifiable graph. After finalizing a graph in the modifiable state it is +valid for a user to add additional nodes and finalize again to create subsequent +executable graphs. The state of a `command_graph` object is made explicit by +templating on state to make the class strongly typed, with the default template +argument being `graph_state::modifiable` to reduce code verbosity on +construction. + +.Graph State Diagram +image::images/sycl_ext_oneapi_graph/command_graph-state.svg[] -=== Executable Graph +==== Executable Graph Update -A `command_graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution. -The structure of such a `command_graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node. -Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. +A graph in the executable state can have each node's inputs & outputs updated +using the `command_graph::update()` method. This takes a graph in the +modifiable state and updates the executable graph to use the node inputs & +outputs of the modifiable graph, a technique called _Whole Graph Update_. 
The +modifiable graph must have the same topology as the graph originally used to +create the executable graphs, with the nodes added in the same order. -=== Graph member and helper functions +==== Graph Member Functions -Table 3. Constructor of the `command_graph` class. +Table 5. Constructor of the `command_graph` class. [cols="2a,a"] |=== |Constructor|Description @@ -159,14 +383,26 @@ Table 3. Constructor of the `command_graph` class. | [source,c++] ---- -/* available only when graph_state == modifiable */` -command_graph(); +using namespace ext::oneapi::experimental; +command_graph(const property_list &propList = {}); ---- -|Creates a `command_graph` object. +|Creates a SYCL `command_graph` object in the modifiable state. +Zero or more properties can be provided to the constructed SYCL `command_graph` +via an instance of `property_list`. + +Preconditions: + +* This constructor is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `propList` - Optional parameter for passing properties. No new properties are + defined by this extension. |=== -Table 4. Member functions of the `command_graph` class. +Table 6. Member functions of the `command_graph` class. [cols="2a,a"] |=== |Member function|Description @@ -174,30 +410,120 @@ Table 4. Member functions of the `command_graph` class. | [source,c++] ---- +using namespace ext::oneapi::experimental; node add(const std::vector& dep = {}); ---- -|This creates an empty node which is associated to no task. Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. +|This creates an empty node which is associated to no task. Its intended use is +either a connection point inside a graph between groups of nodes, and can +significantly reduce the number of edges ( O(n) vs. O(n^2) ). 
Another use-case +is building the structure of a graph first and adding tasks later. + +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `dep` - Nodes the created node will be dependent on. + +Returns: The empty node which has been added to the graph. + +Exceptions: + +* Throws synchronously with error code `invalid` if a queue is recording + commands to the graph. | [source,c++] ---- +using namespace ext::oneapi::experimental; template - node add(T cgf, const std::vector& dep = {}); +node add(T cgf, const std::vector& dep = {}); ---- -|This function adds a command group function object to a graph. The function object can contain single or multiple commands such as a host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec. +|This function adds a command group function object to a graph. The function +object can contain single or multiple commands such as a host task which is +scheduled by the SYCL runtime or a SYCL function for invoking kernels with all +restrictions that apply as described in the core specification. + +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `cgf` - Command group function object to be added as a node + +* `dep` - Nodes the created node will be dependent on. + +Returns: The command-group function object node which has been added to the graph. + +Exceptions: + +* Throws synchronously with error code `invalid` if a queue is recording + commands to the graph. + +| +[source,c++] +---- +using namespace ext::oneapi::experimental; +void make_edge(node sender, node receiver); +---- + +|Creates a dependency between two nodes representing a happens-before relationship. 
+ +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `sender` - Node which will be a dependency of `receiver`. + +* `receiver` - Node which will be dependent on `sender`. + +Exceptions: + +* Throws synchronously with error code `invalid` if a queue is recording + commands to the graph. + +* Throws synchronously with error code `invalid` if `sender` or `receiver` + are not valid nodes created from the graph. | [source,c++] ---- -command_graph finalize(context &syclContext) const; +using namespace ext::oneapi::experimental; +command_graph finalize(const context &syclContext) const; ---- -| This function creates an executable graph object with an immutable topology that can be executed on a queue that matches the given context. +|Synchronous operation that creates a graph in the executable state with a +fixed topology that can be submitted for execution on any queue sharing the +supplied context. It is valid to call this method multiple times to create +subsequent executable graphs. It is also valid to continue to add new nodes to +the modifiable graph instance after calling this function. It is valid to +finalize an empty graph instance with no recorded commands. + +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `syclContext` - The context associated with the queues to which the + executable graph will be able to be submitted. + +Returns: An executable graph object which can be submitted to a queue. |=== -Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed. +Memory that is allocated by the following functions is owned by the specific +graph. 
When freed inside the graph, the memory is only accessible before the +`free` node is executed and after the `malloc` node is executed. -Table 5. Member functions of the `command_graph` class (memory operations). +Table 7. Member functions of the `command_graph` class (memory operations). [cols="2a,a"] |=== |Member function|Description @@ -205,24 +531,310 @@ Table 5. Member functions of the `command_graph` class (memory operations). | [source,c++] ---- +using namespace ext::oneapi::experimental; node add_malloc_device(void *&data, size_t numBytes, const std::vector& dep = {}); ---- -|Adding a node that encapsulates a `malloc` operation. +|Adding a node that encapsulates a memory allocation operation. + +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `data` - Return parameter set to the address of memory allocated. + +* `numBytes` - Size in bytes to allocate. + +* `dep` - Nodes the created node will be dependent on. + +Returns: The memory allocation node which has been added to the graph + +Exceptions: + +* Throws synchronously with error code `invalid` if a queue is recording + commands to the graph. | [source,c++] ---- +using namespace ext::oneapi::experimental; node add_free(void *data, const std::vector& dep = {}); ---- -|Adding a node that encapsulates a `free` operation. +|Adding a node that encapsulates a memory free operation. + +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::modifiable`. + +Parameters: + +* `data` - Address of memory to free. + +* `dep` - Nodes the created node will be dependent on. + +Returns: The memory freeing node which has been added to the graph. + +Exceptions: + +* Throws synchronously with error code `invalid` if a queue is recording + commands to the graph. + +|=== + +Table 8. Member functions of the `command_graph` class (executable graph update). 
+[cols="2a,a"] +|=== +|Member function|Description + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +void command_graph<graph_state::executable>::update(const command_graph<graph_state::modifiable> &graph); +---- + +|Updates the executable graph node inputs & outputs from a topologically +identical modifiable graph. The effects of the update will be visible +on the next submission of the executable graph without the need for additional +user synchronization. + +Preconditions: + +* This member function is only available when the `command_graph` state is + `graph_state::executable`. + +Parameters: + +* `graph` - Modifiable graph object to update graph node inputs & outputs with. + This graph must have the same topology as the original graph used on + executable graph creation. + +Exceptions: + +* Throws synchronously with error code `invalid` if the topology of `graph` is + not the same as the existing graph topology, or if the nodes were not added in + the same order. +|=== + +=== Queue Class Modifications + +:queue-class: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class + +This extension modifies the {queue-class}[SYCL queue class] such that +<<queue-state, state>> is introduced to queue objects, allowing an instance to be +put into a mode where command-groups are recorded to a graph rather than +submitted immediately for execution. + +<<new-queue-member-functions, Three new member functions>> are also added to the +`sycl::queue` class with this extension. Two functions for selecting the state +of the queue, and another function for submitting a graph to the queue. + +==== Queue State + +:queue-info-table: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#table.queue.info + +The `sycl::queue` object can be in either of two states. 
The default +`queue_state::executing` state is where the queue has its normal semantics of +submitted command-groups being immediately scheduled for asynchronous execution. + +The alternative `queue_state::recording` state is used for graph construction. 
+Instead of being scheduled for execution, command-groups submitted to the queue +are recorded to a graph object as new nodes for each submission. After recording +has finished and the queue returns to the executing state, the recorded commands are +not then executed, they are transparent to any following queue operations. + +.Queue State +image::images/sycl_ext_oneapi_graph/queue-state.svg[] + +The state of a queue can be queried with `queue::get_info` using template +parameter `info::queue::state`. The following entry is added to the +{queue-info-table}[queue info table] to define this query: +Table 9. Queue info query +[cols="2a,a,a"] |=== +| Queue Descriptors | Return Type | Description + +| `info::queue::state` +| `ext::oneapi::experimental::queue_state` +| Returns the state of the queue + +|=== + +A default constructed event is returned when a user submits a command-group to +a queue in the recording state. These events have status +`info::event_command_status::complete` and a user waiting on them will return +immediately. + +==== Queue Properties + +:queue-properties: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:queue-properties + +There are {queue-properties}[two properties] defined by the core SYCL +specification that can be passed to a `sycl::queue` on construction via the +property list parameter. They interact with this extension in the following +ways: + +1. `property::queue::in_order` - When a queue is created with the in-order + property, recording its operations results in a straight-line graph, as each + operation has an implicit dependency on the previous operation. However, + a graph submitted to an in-order queue will keep its existing structure such + that the complete graph executes in-order with respect to the other + command-groups submitted to the queue. + +2. `property::queue::enable_profiling` - This property has no effect on graph + recording. 
When set on the queue a graph is submitted to however, it allows + profiling information to be obtained from the event returned by a graph + submission. + +For any other queue property that is defined by an extension, it is the +responsibility of the extension to define the relationship between that queue +property and this graph extension. + +==== New Queue Member Functions + +Table 8. Additional member functions of the `sycl::queue` class. +[cols="2a,a"] +|=== +|Member function|Description + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool queue::begin_recording(command_graph &graph) +---- + +|Synchronously changes the state of the queue to the `queue_state::recording` +state. + +Parameters: + +* `graph` - Graph object to start recording commands to. + +Returns: `true` if the queue was previously in the `queue_state::executing` +state, `false` otherwise. + +Exceptions: + +* Throws synchronously with error code `invalid` if the queue is already + recording to a different graph. + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool queue::end_recording() +---- + +|Synchronously changes the state of the queue to the `queue_state::executing` +state. + +Returns: `true` if the queue was previously in the `queue_state::recording` +state, `false` otherwise. + +| +[source,c++] +---- +using namespace ext::oneapi::experimental; +event queue::submit(command_graph graph) +---- + +|When invoked with the queue in the `queue_state::recording` state, a graph is +added as a subgraph node. When invoked with the queue in the default +`queue_state::executing` state, the graph is submitted for execution. Support +for submitting a graph for execution, before a previous execution has been +completed is backend specific. The runtime may throw an error. + +Parameters: + +* `graph` - Graph object to execute. 
+ +When the queue is in the execution state, an `event` object used to synchronize +graph submission with other command-groups is returned. Otherwise the queue is +in the recording state, and a default event is returned. +|=== + +=== Thread Safety + +The new functions in this extension are thread-safe, the same as member +functions of classes in the base SYCL specification. If user code does +not perform synchronisation between two threads accessing the same queue, +there is no strong ordering between events on that queue, and the kernel +submissions, recording and finalization will happen in an undefined order. + +In particular, when one thread ends recording on a queue while another +thread is submitting work, which kernels will be part of the subsequent +graph is undefined. If user code enforces a total order on the queue +events, then the behaviour is well-defined, and will match the observable +total order. + +The returned value from the `info::queue::state` should be considered +immediately stale in multi-threaded usage, as another thread could have +preemptively changed the state of the queue. + +=== Error Handling + +Errors are reported through exceptions, as usual in the SYCL API. For new APIs, +submitting a graph for execution can generate unspecified asynchronous errors, +while `command_graph::finalize()` may throw unspecified synchronous exceptions. +Synchronous exception error codes are defined for both +`queue::begin_recording()` and `command_graph::update()`. + +When a queue is in recording mode asynchronous exceptions will not be +generated, as no device execution is occurring. Synchronous errors specified as +being thrown in the default queue executing state, will still be thrown when a +queue is in the recording state. + +The `queue::begin_recording` and `queue::end_recording` entry-points return a +`bool` value informing the user whether a state change occurred. False is +returned rather than throwing an exception when state isn't changed. 
This design +is because the queue is already in the state the user desires, so if the +function threw an exception in this case, the application would likely swallow +it and then proceed. + +While a queue is in the recording state, methods performed on that queue which +are not command submissions behave as normal. This includes waits, throws, and +queries on the queue. These are all ignored by the graph system, as opposed to +throwing an exception when in queue recording mode. This is because otherwise +there would be no thread-safe way for a user to check they could call these +functions without throwing, as a query about the state of the queue may be +immediately stale. + +=== Storage Lifetimes + +The lifetime of any buffer recorded as part of a submission +to a command graph will be extended in keeping with the common reference +semantics and buffer synchronization rules in the SYCL specification. It will be +extended either for the lifetime of the graph (including both modifiable graphs +and the executable graphs created from them) or until the buffer is no longer +required by the graph (such as after being replaced through executable graph update). + +=== Host Tasks + +:host-task: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:interfaces.hosttasks + +A {host-task}[host task] is a native C++ callable, scheduled according to SYCL +dependency rules. It is valid to record a host task as part of a graph, though it +may lead to sub-optimal graph performance because a host task node may prevent +the SYCL runtime from submitting the whole graph to the device at once. + +Host tasks can be updated as part of <> +by replacing the whole node with the new callable. == Examples -// NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. 
The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document. +[NOTE] +==== +The examples below demonstrate intended usage of the extension, but may not be +compatible with the proof-of-concept implementation, as the proof-of-concept +implementation is currently under development. +==== -1. Dot product +=== Dot Product [source,c++] ---- @@ -241,36 +853,48 @@ int main() { sycl::ext::oneapi::experimental::command_graph g; float *x , *y, *z; - + float *dotp = sycl::malloc_shared(1, q); - auto n_x = g.add_malloc_device(x, n); - auto n_y = g.add_malloc_device(y, n); - auto n_z = g.add_malloc_device(z, n); + // Add commands to the graph to create the following topology. + // + // x y z + // \ | / + // i + // / \ + // a b + // \ / \ + // c fy + // | + // fx + + auto node_x = g.add_malloc_device(x, n * sizeof(float)); + auto node_y = g.add_malloc_device(y, n * sizeof(float)); + auto node_z = g.add_malloc_device(z, n * sizeof(float)); /* init data on the device */ - auto n_i = g.add([&](sycl::handler &h) { + auto node_i = g.add([&](sycl::handler &h) { h.parallel_for(n, [=](sycl::id<1> it){ const size_t i = it[0]; x[i] = 1.0f; y[i] = 2.0f; z[i] = 3.0f; }); - }, {n_x, n_y, n_z}); + }, {node_x, node_y, node_z}); auto node_a = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); - }, {n_i}); + }, {node_i}); auto node_b = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); - }, {n_i}); + }, {node_i}); auto node_c = g.add( [&](sycl::handler &h) { @@ -282,9 +906,9 @@ int main() { }); }, {node_a, node_b}); - - auto node_f1 = g.add_free(x, {node_c}); - auto node_f2 = g.add_free(y, {node_b}); + + auto node_fx = g.add_free(x, {node_c}); + auto node_fy = g.add_free(y, {node_b}); auto exec = 
g.finalize(q.get_context()); @@ -301,18 +925,115 @@ int main() { ... ---- -// == Issues for later investigations -// -// . Explicit memory movement can cause POC to stall. -// -// == Non-implemented features -// Please, note that the following features are not yet implemented: -// -// . Level Zero backend only -// . Memory operation nodes not implemented -// . Host node not implemented -// . Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. -// . `class graph` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead. +=== Diamond Dependency + +The following snippet of code shows how a SYCL `queue` can be put into a +recording state, which allows a `command_graph` object to be populated by the +command-groups submitted to the queue. Once the graph is complete, recording +finishes on the queue to put it back into the default executing state. The +graph is then finalized so that no more nodes can be added. Lastly, the graph is +submitted as a whole for execution via +`queue::submit(command_graph)`. + +[source, c++] +---- + queue q{default_selector{}}; + + // New object representing graph of command-groups + ext::oneapi::experimental::command_graph graph; + { + buffer bufferA{dataA.data(), range<1>{elements}}; + buffer bufferB{dataB.data(), range<1>{elements}}; + buffer bufferC{dataC.data(), range<1>{elements}}; + + // `q` will be put in the recording state where commands are recorded to + // `graph` rather than submitted for execution immediately. + q.begin_recording(graph); + + // Record commands to `graph` with the following topology. 
+ // + // increment_kernel + // / \ + // A->/ A->\ + // / \ + // add_kernel subtract_kernel + // \ / + // B->\ C->/ + // \ / + // decrement_kernel + + q.submit([&](handler &cgh) { + auto pData = bufferA.get_access(cgh); + cgh.parallel_for(range<1>(elements), + [=](item<1> id) { pData[id]++; }); + }); + + q.submit([&](handler &cgh) { + auto pData1 = bufferA.get_access(cgh); + auto pData2 = bufferB.get_access(cgh); + cgh.parallel_for(range<1>(elements), + [=](item<1> id) { pData2[id] += pData1[id]; }); + }); + + q.submit([&](handler &cgh) { + auto pData1 = bufferA.get_access(cgh); + auto pData2 = bufferC.get_access(cgh); + cgh.parallel_for( + range<1>(elements), [=](item<1> id) { pData2[id] -= pData1[id]; }); + }); + + q.submit([&](handler &cgh) { + auto pData1 = bufferB.get_access(cgh); + auto pData2 = bufferC.get_access(cgh); + cgh.parallel_for(range<1>(elements), [=](item<1> id) { + pData1[id]--; + pData2[id]--; + }); + }); + + // queue will be returned to the executing state where commands are + // submitted immediately for execution. + q.end_recording(); + } + + // Finalize the modifiable graph to create an executable graph that can be + // submitted for execution. + auto exec_graph = graph.finalize(q.get_context()); + + // Execute graph + q.submit(exec_graph); +---- + +== Issues + +=== Multi Device Graph + +Allow an executable graph to contain nodes targeting different devices. + +**Outcome:** Under consideration + +=== Record & Replay: Mark Internal Memory + +When a graph is created by recording a queue there is no way to tag memory +objects internal to the graph, which would enable optimizations on the internal +memory. Do we need an interface record & replay can use to identify buffers and +USM allocations not used outside of the graph? + +**Outcome:** Unresolved + +=== Executable Graph Update + +Is there an ML use case (e.g. a PyTorch workload) which justifies the inclusion of +this feature in the extension? 
+ +**Outcome:** Unresolved + +=== Graph Submission Synchronization + +Should we provide a mechanism for a graph submission to depend on other graph +submission events or any arbitrary sycl event? + +**Outcome:** Unresolved == Revision History @@ -325,4 +1046,5 @@ int main() { |2|2022-03-11|Pablo Reble|Incorporate feedback from PR |3|2022-05-25|Pablo Reble|Extend API and Example |4|2022-08-10|Pablo Reble|Adding USM shortcuts +|5|2022-10-21|Ewan Crawford|Merge in Codeplay vendor extension |======================================== From d5b2e41ca5a280f4c8391e9bdf856515619623d9 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 16 Nov 2022 21:07:57 -0600 Subject: [PATCH 22/82] Converting vector of node dependencies to properties --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index ed8f4f7075662..fb3d24f116015 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -262,6 +262,18 @@ enum class queue_state { recording }; +namespace property { +namespace node { + +class depends_on { + public: + template + depends_on(Ts... 
deps); +}; + +} // namespace node +} // namespace property + class node {}; // State of a graph @@ -280,13 +292,13 @@ public: command_graph(const property_list &propList = {}); command_graph finalize(const context &syclContext) const; - node add(const std::vector& dep = {}); + node add(const property_list& propList = {}); template - node add(T cgf, const std::vector& dep = {}); + node add(T cgf, const property_list& propList = {}); - node add_malloc_device(void *&data, size_t numBytes, const std::vector& dep = {}); - node add_free(void *data, const std::vector& dep = {}); + node add_malloc_device(void *&data, size_t numBytes, const property_list& propList = {}); + node add_free(void *data, const property_list& propList = {}); void make_edge(node sender, node receiver); }; @@ -411,7 +423,7 @@ Table 6. Member functions of the `command_graph` class. [source,c++] ---- using namespace ext::oneapi::experimental; -node add(const std::vector& dep = {}); +node add(const property_list& propList = {}); ---- |This creates an empty node which is associated to no task. Its intended use is either a connection point inside a graph between groups of nodes, and can @@ -425,7 +437,7 @@ Preconditions: Parameters: -* `dep` - Nodes the created node will be dependent on. +* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. Returns: The empty node which has been added to the graph. @@ -439,7 +451,7 @@ Exceptions: ---- using namespace ext::oneapi::experimental; template -node add(T cgf, const std::vector& dep = {}); +node add(T cgf, const property_list& propList = {}); ---- |This function adds a command group function object to a graph. The function object can contain single or multiple commands such as a host task which is @@ -455,7 +467,7 @@ Parameters: * `cgf` - Command group function object to be added as a node -* `dep` - Nodes the created node will be dependent on. 
+* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. Returns: The command-group function object node which has been added to the graph. @@ -532,7 +544,7 @@ Table 7. Member functions of the `command_graph` class (memory operations). [source,c++] ---- using namespace ext::oneapi::experimental; -node add_malloc_device(void *&data, size_t numBytes, const std::vector& dep = {}); +node add_malloc_device(void *&data, size_t numBytes, const property_list& propList = {}); ---- |Adding a node that encapsulates a memory allocation operation. @@ -547,7 +559,7 @@ Parameters: * `numBytes` - Size in bytes to allocate. -* `dep` - Nodes the created node will be dependent on. +* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. Returns: The memory allocation node which has been added to the graph @@ -560,7 +572,7 @@ Exceptions: [source,c++] ---- using namespace ext::oneapi::experimental; -node add_free(void *data, const std::vector& dep = {}); +node add_free(void *data, const property_list& propList = {}); ---- |Adding a node that encapsulates a memory free operation. @@ -573,7 +585,7 @@ Parameters: * `data` - Address of memory to free. -* `dep` - Nodes the created node will be dependent on. +* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. Returns: The memory freeing node which has been added to the graph. 
@@ -880,21 +892,21 @@ int main() { y[i] = 2.0f; z[i] = 3.0f; }); - }, {node_x, node_y, node_z}); + }, { sycl::ext::oneapi::experimental::property::node::depends_on(node_x, node_y, node_z)}); auto node_a = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); - }, {node_i}); + }, { sycl::ext::oneapi::experimental::property::node::depends_on(node_i)}); auto node_b = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); - }, {node_i}); + }, { sycl::ext::oneapi::experimental::property::node::depends_on(node_i)}); auto node_c = g.add( [&](sycl::handler &h) { @@ -905,10 +917,10 @@ int main() { sum += x[i] * z[i]; }); }, - {node_a, node_b}); + { sycl::ext::oneapi::experimental::property::node::depends_on(node_a, node_b)}); - auto node_fx = g.add_free(x, {node_c}); - auto node_fy = g.add_free(y, {node_b}); + auto node_fx = g.add_free(x, {sycl::ext::oneapi::experimental::property::node::depends_on(node_c)}); + auto node_fy = g.add_free(y, {sycl::ext::oneapi::experimental::property::node::depends_on(node_b)}); auto exec = g.finalize(q.get_context()); From ee6159db9112911a2e3298f305ebb8bf87d95232 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 17 Nov 2022 21:41:19 -0600 Subject: [PATCH 23/82] Update sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc Co-authored-by: Ben Tracy --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index fb3d24f116015..0231bbf0d5a8b 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -267,8 +267,8 @@ namespace node { class depends_on { public: - template - 
depends_on(Ts... deps); + template + depends_on(NodeTN... nodes); }; } // namespace node From baeb7011a08993dce1e9f1171f20e65f26ccb510 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 17 Nov 2022 21:52:38 -0600 Subject: [PATCH 24/82] Update sycl_ext_oneapi_graph.asciidoc --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 0231bbf0d5a8b..0e696adf60383 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -437,7 +437,8 @@ Preconditions: Parameters: -* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. +* `propList` - Zero or more properties can be provided to the constructed node + via an instance of `property_list`. Returns: The empty node which has been added to the graph. @@ -467,7 +468,8 @@ Parameters: * `cgf` - Command group function object to be added as a node -* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. +* `propList` - Zero or more properties can be provided to the constructed node + via an instance of `property_list`. Returns: The command-group function object node which has been added to the graph. @@ -559,7 +561,8 @@ Parameters: * `numBytes` - Size in bytes to allocate. -* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. +* `propList` - Zero or more properties can be provided to the constructed node + via an instance of `property_list`. Returns: The memory allocation node which has been added to the graph @@ -585,7 +588,8 @@ Parameters: * `data` - Address of memory to free. 
-* `propList` - Zero or more properties can be provided to the constructed node via an instance of property_list. +* `propList` - Zero or more properties can be provided to the constructed node + via an instance of `property_list`. Returns: The memory freeing node which has been added to the graph. From 4ecdd3f6ab090d071eb17cb617864d0b0ecb03ee Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 22 Nov 2022 21:30:48 -0600 Subject: [PATCH 25/82] Update sycl_ext_oneapi_graph.asciidoc --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 0e696adf60383..16a29cee5f756 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -859,6 +859,8 @@ implementation is currently under development. #include int main() { + using namespace sycl::ext::oneapi::experimental; + const size_t n = 10; float alpha = 1.0f; float beta = 2.0f; @@ -866,7 +868,7 @@ int main() { sycl::queue q; - sycl::ext::oneapi::experimental::command_graph g; + command_graph g; float *x , *y, *z; @@ -896,21 +898,21 @@ int main() { y[i] = 2.0f; z[i] = 3.0f; }); - }, { sycl::ext::oneapi::experimental::property::node::depends_on(node_x, node_y, node_z)}); + }, { property::node::depends_on(node_x, node_y, node_z)}); auto node_a = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); - }, { sycl::ext::oneapi::experimental::property::node::depends_on(node_i)}); + }, { property::node::depends_on(node_i)}); auto node_b = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); - }, { 
sycl::ext::oneapi::experimental::property::node::depends_on(node_i)}); + }, { property::node::depends_on(node_i)}); auto node_c = g.add( [&](sycl::handler &h) { @@ -921,10 +923,10 @@ int main() { sum += x[i] * z[i]; }); }, - { sycl::ext::oneapi::experimental::property::node::depends_on(node_a, node_b)}); + { property::node::depends_on(node_a, node_b)}); - auto node_fx = g.add_free(x, {sycl::ext::oneapi::experimental::property::node::depends_on(node_c)}); - auto node_fy = g.add_free(y, {sycl::ext::oneapi::experimental::property::node::depends_on(node_b)}); + auto node_fx = g.add_free(x, {property::node::depends_on(node_c)}); + auto node_fy = g.add_free(y, {property::node::depends_on(node_b)}); auto exec = g.finalize(q.get_context()); From 695ea0b1982d4957f11a7e4bcdce51548530afd5 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 29 Nov 2022 11:33:47 +0000 Subject: [PATCH 26/82] Language clarifications (#35) * Address feedback around the language used in the specification. There should are no intended changes to extension behaviour. * Give the `depends_on` property its own subsection. --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 81 +++++++++++++------ 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 16a29cee5f756..de84cd61390e7 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -38,6 +38,7 @@ Pablo Reble, Intel + Julian Miller, Intel + John Pennycook, Intel + Guo Yejun, Intel + +Dan Holmes, Intel + Ewan Crawford, Codeplay + Ben Tracy, Codeplay + Duncan McBain, Codeplay + @@ -65,21 +66,23 @@ as a command group object defines a set of requisites (edges) which must be satisfied for kernels (nodes) to be executed. 
However, because command-group submission is tied to execution on the queue, without having a prior construction step before starting execution, optimization opportunities are -missed from the runtime not knowing the complete dependency graph ahead of -execution. +missed from the runtime not being made aware of a defined dependency graph ahead +of execution. The following benefits would become possible if the user could define a dependency graph to the SYCL runtime prior to execution: * Reduction in runtime overhead by only submitting a single graph object, rather - than many individual commands. + than many individual command groups. -* Enable more work to be done offline, in particular producing a graph ahead of - time allows for improved performance at runtime from reduced overhead. +* Enable more work to be done ahead of time to improve runtime performance. This + early work could be done in a setup phase of the program prior to repeated + executions of the graph. Alternatively, a future offline AOT compiler in a different + process could be run prior to the execution of the application. * Unlock DMA hardware features through graph analysis by the runtime. -* Whole graph optimizations become available, including but not limited to: +* Graph optimizations become available, including but not limited to: ** Kernel fusion/fission. ** Inter-node memory reuse from data staying resident on device. ** Identification of the peak intermediate output memory requirement, used for @@ -111,7 +114,7 @@ requirements were considered: 7. Ability to record a graph with commands submitted to different devices in the same context. 8. Capability to serialize graphs to a binary format which can then be - de-serialized and executed. This is helpful for offline cases where a graph + de-serialized and executed. This is helpful for AOT cases where a graph can be created by an offline tool to be loaded and run without the end-user incurring the overheads of graph creation. 9. 
Backend interoperability, the ability to retrieve a native graph object from @@ -234,13 +237,14 @@ Table 4. Recorded Graph Definition. | Nodes in a queue recorded graph represent each of the command group submissions of the program. Each submission encompasses either one or both of a.) some data movement, b.) a single asynchronous kernel launch. Nodes cannot -define forward edges, only backwards (i.e. kernels can only create dependencies -on things that have already happened). This means that transparently a node can -depend on a previously recorded graph (sub-graph), which works by creating edges -to the individual nodes in the old graph. Explicit memory operations without -kernels, such as a memory copy, are still classed as nodes under this -definition, as the {explicit-memory-ops}[SYCL 2020 specification states] that -these can be seen as specialized kernels executing on the device. +define forward edges, only backwards. That is, kernels can only create +dependencies on command-groups that have already been submitted. This means that +transparently a node can depend on a previously recorded graph (sub-graph), +which works by creating edges to the individual nodes in the old graph. Explicit +memory operations without kernels, such as a memory copy, are still classed as +nodes under this definition, as the +{explicit-memory-ops}[SYCL 2020 specification states] that these can be seen as +specialized kernels executing on the device. | Edge | An edge in a queue recorded graph represents a data dependency between two @@ -336,6 +340,26 @@ namespace sycl::ext::oneapi::experimental { } ---- +==== Depends-On Property + +The API for explicitly adding nodes to a `command_graph` includes a +`property_list` parameter. This extension defines the `depends_on` property to +be passed here. `depends_on` defines any `node` objects for the created node to +be dependent on, and therefore form an edge with. 
These nodes are in addition to +the dependent nodes identified from the command-group requisites of the created +node. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental::property::node { +class depends_on { + public: + template<typename... NodeTN> + depends_on(NodeTN... nodes); +}; +} +---- + === Graph :crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics @@ -409,8 +433,8 @@ Preconditions: Parameters: -* `propList` - Optional parameter for passing properties. No new properties are - defined by this extension. +* `propList` - Optional parameter for passing properties. No `command_graph` + constructor properties are defined by this extension. |=== @@ -437,7 +461,7 @@ Preconditions: Parameters: -* `propList` - Zero or more properties can be provided to the constructed node +* `propList` - Zero or more properties can be provided to the constructed node via an instance of `property_list`. Returns: The empty node which has been added to the graph. @@ -455,9 +479,11 @@ template node add(T cgf, const property_list& propList = {}); ---- |This function adds a command group function object to a graph. The function -object can contain single or multiple commands such as a host task which is -scheduled by the SYCL runtime or a SYCL function for invoking kernels with all -restrictions that apply as described in the core specification. +object statically contains a group of commands, of which a single command is +executed at runtime. A function object can be a host task which is scheduled by +the SYCL runtime, or a SYCL function for invoking kernels with all restrictions +that apply as described in the core specification. The requisites of `cgf` will +be used to identify any dependent nodes in the graph to form edges with. Preconditions: @@ -466,9 +492,9 @@ Preconditions: Parameters: -* `cgf` - Command group function object to be added as a node +* `cgf` - Command group function object to be added as a node. 
+ -* `propList` - Zero or more properties can be provided to the constructed node +* `propList` - Zero or more properties can be provided to the constructed node via an instance of `property_list`. Returns: The command-group function object node which has been added to the graph. @@ -615,7 +641,8 @@ void command_graph update(const command_graph> by replacing the whole node with the new callable. @@ -850,6 +878,9 @@ compatible with the proof-of-concept implementation, as the proof-of-concept implementation is currently under development. ==== +Examples are for demonstrative purposes only, and may leave out details such as how +input data is set. + === Dot Product [source,c++] @@ -950,7 +981,7 @@ recording state, which allows a `command_graph` object to be populated by the command-groups submitted to the queue. Once the graph is complete, recording finishes on the queue to put it back into the default executing state. The graph is then finalized so that no more nodes can be added. Lastly, the graph is -submitted as a whole for execution via +submitted in its entirety for execution via `queue::submit(command_graph)`. [source, c++] From 40b6ee49cf0e0d7d1f21f17c78a86475b9c5250e Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 17 Nov 2022 20:24:29 +0000 Subject: [PATCH 27/82] State that node class has common reference semantics SYCL [common reference semantics](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics) define copy, move, destruction, and equality requirements for runtime classes. 
Issue https://github.com/reble/llvm/issues/22 --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index de84cd61390e7..7059bc617ef05 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -328,11 +328,15 @@ public: === Node +:crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics + Node is a class that encapsulates tasks like SYCL kernel functions, device memory allocations/frees, or host tasks for deferred execution. A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. +The `node` class provides the {crs}[common reference semantics]. + [source,c++] ---- namespace sycl::ext::oneapi::experimental { @@ -362,8 +366,6 @@ class depends_on { === Graph -:crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics - This extension adds a new `command_graph` object which follows the {crs}[common reference semantics] of other SYCL runtime objects. From f044be73b1a22bb513805ceac642d947f2b352da Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 29 Nov 2022 14:02:23 +0000 Subject: [PATCH 28/82] Rename make_edge() parameters Rename the parameters to `make_edge()` from `sender` & `receiver` to `src` & `dest` based on feedback. Additionally, specify an error if these are the same node. 
--- .../proposed/sycl_ext_oneapi_graph.asciidoc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 7059bc617ef05..a65273a766978 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -304,7 +304,7 @@ public: node add_malloc_device(void *&data, size_t numBytes, const property_list& propList = {}); node add_free(void *data, const property_list& propList = {}); - void make_edge(node sender, node receiver); + void make_edge(node src, node dest); }; template<> @@ -510,7 +510,7 @@ Exceptions: [source,c++] ---- using namespace ext::oneapi::experimental; -void make_edge(node sender, node receiver); +void make_edge(node src, node dest); ---- |Creates a dependency between two nodes representing a happens-before relationship. @@ -522,17 +522,20 @@ Preconditions: Parameters: -* `sender` - Node which will be a dependency of `receiver`. +* `src` - Node which will be a dependency of `dest`. -* `receiver` - Node which will be dependent on `sender`. +* `dest` - Node which will be dependent on `src`. Exceptions: * Throws synchronously with error code `invalid` if a queue is recording - commands to the graph. + commands to the graph object. + +* Throws synchronously with error code `invalid` if `src` or `dest` + are not valid nodes assigned to the graph object. -* Throws synchronously with error code `invalid` if `sender` or `receiver` - are not valid nodes created from the graph. +* Throws synchronously with error code `invalid` if `src` and `dest` + are the same node. 
| [source,c++] From e93396d05b780ca75f29ca7471187816ec910a99 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 14 Nov 2022 10:46:08 +0000 Subject: [PATCH 29/82] Execute graph using handler Change the `queue::submit(command_graph)` API for launching an executable graph to `handler::exec_graph(command_graph)`. See Issue https://github.com/reble/llvm/issues/21 Using the handler is more in keeping with the existing SYCL API and allows the execution of a graph to depend on an arbitrary event through `handler::depends_on`. This design makes it easier for users to write the following code without having to block on the host for waits. ```cpp auto ev1 = q.submit([&](handler& cgh){ cgh.exec_graph(g); }); // dest is some input to graph `g` auto ev2 = q.memcpy(dest, src, numBytes, ev1); auto ev3 = q.submit([&](handler& cgh){ cgh.depends_on(ev2); cgh.exec_graph(g); }); ``` Queue shortcut functions are also included, as is the case in the core SYCL spec for other handler functionality. This change should also enable the explicit API to capture nested sub-graph executions, which is not currently possible in the explicit API but is possible in the record & replay API. See issue https://github.com/reble/llvm/issues/23 For example, a user can now do: ```cpp command_graph executable_graph; auto node = recordable_graph.add([&](handler& cgh){ cgh.exec_graph(executable_graph); }); ``` --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 91 +++++++++++++++---- 1 file changed, 71 insertions(+), 20 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a65273a766978..7f88daac17766 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -253,6 +253,12 @@ accessors. USM pointers also convey data dependencies, however offsets into system allocations (`malloc`/`new`) are not supported. 
|=== +==== Sub-Graph + +A node in a graph can take the form of a nested sub-graph. This occurs when +a command-group submission that invokes `handler::exec_graph()` with an +executable graph object is added to the graph as a node. + === API Modifications [source, c++] @@ -321,8 +327,22 @@ class queue { public: bool begin_recording(command_graph &graph); bool end_recording(); - event submit(command_graph graph); + + /* -- graph convenience shortcuts -- */ + + event exec_graph(command_graph graph); + event exec_graph(command_graph graph, + event depEvent); + event exec_graph(command_graph graph, + const std::vector &depEvents); }; + +// New methods added to the sycl::handler class +class handler { +public: + void exec_graph(command_graph graph); +} + } // namespace sycl ---- @@ -788,22 +808,55 @@ state, `false` otherwise. [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::submit(command_graph graph) +event queue::exec_graph(command_graph graph) +---- + +|Queue shortcut function that is equivalent to submitting a command-group +containing `handler::exec_graph(graph)`. + +| +[source,c++] +---- +using namespace ext::oneapi::experimental; +event queue::exec_graph(command_graph graph, + event depEvent); +---- + +|Queue shortcut function that is equivalent to submitting a command-group +containing `handler::depends_on(depEvent)` and `handler::exec_graph(graph)`. + +| +[source,c++] +---- +using namespace ext::oneapi::experimental; +event queue::exec_graph(command_graph graph, + const std::vector &depEvents); ---- -|When invoked with the queue in the `queue_state::recording` state, a graph is -added as a subgraph node. When invoked with the queue in the default -`queue_state::executing` state, the graph is submitted for execution. Support -for submitting a graph for execution, before a previous execution has been -completed is backend specific. The runtime may throw an error. 
+|Queue shortcut function that is equivalent to submitting a command-group +containing `handler::depends_on(depEvents)` and `handler::exec_graph(graph)`. +|=== + +==== New Handler Member Functions + +Table 10. Additional member functions of the `sycl::handler` class. +[cols="2a,a"] +|=== +|Member function|Description +[source,c++] +---- +using namespace ext::oneapi::experimental; +void handler::exec_graph(command_graph graph) +---- + +|Invokes the execution of a graph. Support for invoking an executable graph, +before a previous execution of the same graph has been completed is backend +specific. The runtime may throw an error. Parameters: * `graph` - Graph object to execute. -When the queue is in the execution state, an `event` object used to synchronize -graph submission with other command-groups is returned. Otherwise the queue is -in the recording state, and a default event is returned. |=== === Thread Safety @@ -966,7 +1019,8 @@ int main() { auto exec = g.finalize(q.get_context()); - q.submit(exec).wait(); + // use queue shortcut for graph submission + q.exec_graph(exec).wait(); // memory can be freed inside or outside the graph sycl::free(z, q.get_context()); @@ -987,7 +1041,7 @@ command-groups submitted to the queue. Once the graph is complete, recording finishes on the queue to put it back into the default executing state. The graph is then finalized so that no more nodes can be added. Lastly, the graph is submitted in its entirety for execution via -`queue::submit(command_graph)`. +`handler::exec_graph(command_graph)`. [source, c++] ---- @@ -1055,7 +1109,10 @@ submitted in its entirety for execution via auto exec_graph = graph.finalize(q.get_context()); // Execute graph - q.submit(exec_graph); + q.submit([&](handler &cgh) { + cgh.exec_graph(exec_graph); + }); + ---- == Issues @@ -1082,13 +1139,6 @@ this feature in the extension. 
**Outcome:** Unresolved -=== Graph Submission Synchronization - -Should we provide a mechanism for a graph submission to depend on other graph -submission events or any arbitrary sycl event? - -**Outcome:** Unresolved - == Revision History [cols="5,15,15,70"] @@ -1101,4 +1151,5 @@ submission events or any arbitrary sycl event? |3|2022-05-25|Pablo Reble|Extend API and Example |4|2022-08-10|Pablo Reble|Adding USM shortcuts |5|2022-10-21|Ewan Crawford|Merge in Codeplay vendor extension +|6|2022-11-14|Ewan Crawford|Change graph execution to be a function on the handler |======================================== From 0dcb1d7213db2d6f09290c46c2386616a9b58e68 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 15 Nov 2022 10:07:53 +0000 Subject: [PATCH 30/82] Specify error on finalize if a graph has a cycle We define a graph as a DAG, and therefore it is invalid for a graph to contain a cycle. However, it is possible in the API for a user to introduce a cycle via a call to `make_edge` that introduces a forward dependency. Checking for cycles on `make_edge` itself may have negative performance implications, as the function can be called frequently and on larger graphs a cycle may not be cheap to check for. Instead, I've specified an error on finalize because this entry-point is already intended to be where the costly work is done. Additionally, this is where the runtime translates the graph to a backend API, and it is unlikely that the backend API will provide a mechanism for defining forward edges. Therefore, the runtime will naturally hit an obstacle in using the backend API and be able to throw an exception at that point. 
Issue https://github.com/reble/llvm/issues/12 --- .../extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 7f88daac17766..a71b1d28c53d8 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -578,10 +578,17 @@ Preconditions: Parameters: -* `syclContext` - The context asscociated with the queues to which the +* `syclContext` - The context associated with the queues to which the executable graph will be able to be submitted. Returns: An executable graph object which can be submitted to a queue. + +Exceptions: + +* Throws synchronously with error code `invalid` if the graph contains a cycle. + A cycle may be introduced to the graph via a call to `make_edge()` that + creates a forward dependency. + |=== Memory that is allocated by the following functions is owned by the specific From 2f871eb87c9ba8ea593af9ddf4cc76da3219928b Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 5 Dec 2022 13:40:09 -0600 Subject: [PATCH 31/82] Update explicit API example --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a71b1d28c53d8..61be6d264bfef 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -955,7 +955,7 @@ input data is set. 
#include int main() { - using namespace sycl::ext::oneapi::experimental; + namespace sycl_ext = sycl::ext::oneapi::experimental; const size_t n = 10; float alpha = 1.0f; @@ -964,7 +964,7 @@ int main() { sycl::queue q; - command_graph g; + sycl_ext::command_graph g; float *x , *y, *z; @@ -994,21 +994,21 @@ int main() { y[i] = 2.0f; z[i] = 3.0f; }); - }, { property::node::depends_on(node_x, node_y, node_z)}); + }, { sycl_ext::property::node::depends_on(node_x, node_y, node_z)}); auto node_a = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); - }, { property::node::depends_on(node_i)}); + }, { sycl_ext::property::node::depends_on(node_i)}); auto node_b = g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); - }, { property::node::depends_on(node_i)}); + }, { sycl_ext::property::node::depends_on(node_i)}); auto node_c = g.add( [&](sycl::handler &h) { @@ -1019,10 +1019,10 @@ int main() { sum += x[i] * z[i]; }); }, - { property::node::depends_on(node_a, node_b)}); + { sycl_ext::property::node::depends_on(node_a, node_b)}); - auto node_fx = g.add_free(x, {property::node::depends_on(node_c)}); - auto node_fy = g.add_free(y, {property::node::depends_on(node_b)}); + auto node_fx = g.add_free(x, {sycl_ext::property::node::depends_on(node_c)}); + auto node_fy = g.add_free(y, {sycl_ext::property::node::depends_on(node_b)}); auto exec = g.finalize(q.get_context()); From 4b301c0c6b05d67a0946d40382fef8dd20a5dda9 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 9 Dec 2022 14:37:26 +0000 Subject: [PATCH 32/82] Use Mermaid Diagrams instead of SVG Rather than adding separate SVG image files for state diagrams that need to be embedded in the spec, use mermaid diagrams directly in spec source --- .../sycl_ext_oneapi_graph/command_graph-state.svg | 4 ---- 
.../images/sycl_ext_oneapi_graph/queue-state.svg | 4 ---- .../proposed/sycl_ext_oneapi_graph.asciidoc | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 11 deletions(-) delete mode 100644 sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg delete mode 100644 sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg diff --git a/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg b/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg deleted file mode 100644 index f3ed6a15a1f7d..0000000000000 --- a/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/command_graph-state.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
Finalize
Finalize
Modifiable
Modifiable
Executable
Executable
Text is not SVG - cannot display
\ No newline at end of file diff --git a/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg b/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg deleted file mode 100644 index d51956d613098..0000000000000 --- a/sycl/doc/extensions/proposed/images/sycl_ext_oneapi_graph/queue-state.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -

Begin Recording

Begin Recording
Executing
Executing
End Recording
End Recording
Recording
Recording
Text is not SVG - cannot display
\ No newline at end of file diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a71b1d28c53d8..d9d4b3480961d 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -420,7 +420,11 @@ argument being `graph_state::modifiable` to reduce code verbosity on construction. .Graph State Diagram -image::images/sycl_ext_oneapi_graph/command_graph-state.svg[] +[source, mermaid] +.... +graph LR + Modifiable -->|Finalize| Executable +.... ==== Executable Graph Update @@ -721,8 +725,13 @@ are recorded to a graph object as new nodes for each submission. After recording has finished and the queue returns to the executing state, the recorded commands are not then executed, they are transparent to any following queue operations. -.Queue State -image::images/sycl_ext_oneapi_graph/queue-state.svg[] +.Queue State Diagram +[source, mermaid] +.... +graph LR + Executing -->|Begin Recording| Recording + Recording -->|End Recording| Executing +.... The state of a queue can be queried with `queue::get_info` using template parameter `info::queue::state`. The following entry is added to the From e26b7a92dc93c116eae163eb8ec8604dcf20aeea Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 16 Dec 2022 09:42:33 +0000 Subject: [PATCH 33/82] Define behaviour of handler::depends_on() Supporting `handler::depends_on()` to track edges would make it easier for users to express USM dependencies in the record & replay model. This could be implemented by the runtime mapping default constructed `sycl::event`s returned by queue recording to internal nodes. When one of these events is later passed into `handler::depends_on()`, the runtime can check which node in the graph the event is associated with, and error if it is not an event returned from a queue recording. 
This addressed spec feedback at https://github.com/intel/llvm/pull/5626#discussion_r1020209752 Closes https://github.com/reble/llvm/issues/34 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 0637d375fd4b3..8536005bd9e55 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -146,8 +146,8 @@ and edges they want to add to the graph. 2. **Queue recording API (aka "Record & Replay")** - Introduces state to a `sycl::queue` such that rather than scheduling commands immediately for -execution, they are added to the graph object instead, with edges based on the -data dependencies of the command group. +execution, they are added to the graph object instead, with edges captured from +the dependencies of the command group. Each of these mechanisms for constructing a graph have their own advantages, so having both APIs available allows the user to pick the one which is most @@ -218,9 +218,16 @@ methods on a modifiable graph. Each node represent either a command-group function, empty operation, or device memory allocation/free. | Edge -| In the explicit graph building API edges are defined by the user. This is -either through buffer accessors, the `make_edge()` function, or by passing -dependent nodes on creation of a new node. +| In the explicit graph building API edges are primarily defined by the user +through newly added interfaces. This is either using the `make_edge()` function +to define an edge between existing nodes, or using a +`property::node::depends_on` property list when adding a new node to the graph. + +Edges can also be created when explicitly adding nodes to the graph through +existing SYCL mechanisms for expressing dependencies. 
Data dependencies from +buffer accessors to existing nodes in the graph are captured as an edge. Using +`handler::depends_on()` will also create a graph edge when passed an event +returned from a queue submission captured by a queue recording to the same graph. |=== ==== Queue Recording API @@ -247,10 +254,16 @@ nodes under this definition, as the specialized kernels executing on the device. | Edge -| An edge in a queue recorded graph represents a data dependency between two -nodes. Data dependencies can naturally be expressed in user code through buffer -accessors. USM pointers also convey data dependencies, however offsets into -system allocations (`malloc`/`new`) are not supported. +| An edge in a queue recorded graph is expressed through command group +dependencies in one of two ways. Firstly, through buffer accessors that +represent data dependencies between two command groups captured as nodes. +Secondly, by using the `handler::depends_on()` mechanism inside a command group +captured as a node. However, for an event passed to `handler::depends_on()` to +create an edge, it must be a default constructed event returned from a queue +submission captured by the same graph. Otherwise, the event will be ignored and +no dependency edge will be created in the graph. `handler::depends_on()` can be +used to express edges when a user is working with USM memory rather than SYCL +buffers. |=== ==== Sub-Graph From c3bdf1297d6a74efe061483481570020a7ce415f Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 19 Dec 2022 11:07:57 +0000 Subject: [PATCH 34/82] Left justify pointer/reference symbols The core SYCL spec always left justifies reference and pointer symbols in declarations. We are currently inconsistent, so this (very nitpicky) change updates the spec to be consistently left justified. 
--- .../proposed/sycl_ext_oneapi_graph.asciidoc | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 8536005bd9e55..6e5a96bc6e666 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -312,16 +312,16 @@ class command_graph {}; template<> class command_graph { public: - command_graph(const property_list &propList = {}); - command_graph finalize(const context &syclContext) const; + command_graph(const property_list& propList = {}); + command_graph finalize(const context& syclContext) const; node add(const property_list& propList = {}); template node add(T cgf, const property_list& propList = {}); - node add_malloc_device(void *&data, size_t numBytes, const property_list& propList = {}); - node add_free(void *data, const property_list& propList = {}); + node add_malloc_device(void*& data, size_t numBytes, const property_list& propList = {}); + node add_free(void* data, const property_list& propList = {}); void make_edge(node src, node dest); }; @@ -330,7 +330,7 @@ template<> class command_graph { public: command_graph() = delete; - void update(const command_graph &graph); + void update(const command_graph& graph); }; } // namespace ext::oneapi::experimental @@ -338,7 +338,7 @@ public: using namespace ext::oneapi::experimental; class queue { public: - bool begin_recording(command_graph &graph); + bool begin_recording(command_graph& graph); bool end_recording(); /* -- graph convenience shortcuts -- */ @@ -347,7 +347,7 @@ public: event exec_graph(command_graph graph, event depEvent); event exec_graph(command_graph graph, - const std::vector &depEvents); + const std::vector& depEvents); }; // New methods added to the sycl::handler class @@ -459,7 +459,7 @@ Table 5. Constructor of the `command_graph` class. 
[source,c++] ---- using namespace ext::oneapi::experimental; -command_graph(const property_list &propList = {}); +command_graph(const property_list& propList = {}); ---- |Creates a SYCL `command_graph` object in the modifiable state. Zero or more properties can be provided to the constructed SYCL `command_graph` @@ -578,7 +578,7 @@ Exceptions: [source,c++] ---- using namespace ext::oneapi::experimental; -command_graph finalize(const context &syclContext) const; +command_graph finalize(const context& syclContext) const; ---- |Synchronous operation that creates a graph in the executable state with a @@ -621,7 +621,7 @@ Table 7. Member functions of the `command_graph` class (memory operations). [source,c++] ---- using namespace ext::oneapi::experimental; -node add_malloc_device(void *&data, size_t numBytes, const property_list& propList = {}); +node add_malloc_device(void*& data, size_t numBytes, const property_list& propList = {}); ---- |Adding a node that encapsulates a memory allocation operation. @@ -650,7 +650,7 @@ Exceptions: [source,c++] ---- using namespace ext::oneapi::experimental; -node add_free(void *data, const property_list& propList = {}); +node add_free(void* data, const property_list& propList = {}); ---- |Adding a node that encapsulates a memory free operation. @@ -684,7 +684,7 @@ Table 8. Member functions of the `command_graph` class (executable graph update) [source, c++] ---- using namespace ext::oneapi::experimental; -void command_graph update(const command_graph &graph); +void command_graph update(const command_graph& graph); ---- |Updates the executable graph node inputs & outputs from a topologically @@ -802,7 +802,7 @@ Table 8. Additional member functions of the `sycl::queue` class. 
[source, c++] ---- using namespace ext::oneapi::experimental; -bool queue::begin_recording(command_graph &graph) +bool queue::begin_recording(command_graph& graph) ---- |Synchronously changes the state of the queue to the `queue_state::recording` @@ -859,7 +859,7 @@ containing `handler::depends_on(depEvent)` and `handler::exec_graph(graph)`. ---- using namespace ext::oneapi::experimental; event queue::exec_graph(command_graph graph, - const std::vector &depEvents); + const std::vector& depEvents); ---- |Queue shortcut function that is equivalent to submitting a command-group @@ -1009,7 +1009,7 @@ int main() { auto node_z = g.add_malloc_device(z, n * sizeof(float)); /* init data on the device */ - auto node_i = g.add([&](sycl::handler &h) { + auto node_i = g.add([&](sycl::handler& h) { h.parallel_for(n, [=](sycl::id<1> it){ const size_t i = it[0]; x[i] = 1.0f; @@ -1018,14 +1018,14 @@ int main() { }); }, { sycl_ext::property::node::depends_on(node_x, node_y, node_z)}); - auto node_a = g.add([&](sycl::handler &h) { + auto node_a = g.add([&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }, { sycl_ext::property::node::depends_on(node_i)}); - auto node_b = g.add([&](sycl::handler &h) { + auto node_b = g.add([&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; @@ -1033,7 +1033,7 @@ int main() { }, { sycl_ext::property::node::depends_on(node_i)}); auto node_c = g.add( - [&](sycl::handler &h) { + [&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { @@ -1074,6 +1074,7 @@ submitted in its entirety for execution via [source, c++] ---- + using namespace sycl; queue q{default_selector{}}; // New object representing graph of command-groups @@ -1099,27 +1100,27 @@ submitted in its entirety for execution via // \ / // 
decrement_kernel - q.submit([&](handler &cgh) { + q.submit([&](handler& cgh) { auto pData = bufferA.get_access(cgh); cgh.parallel_for(range<1>(elements), [=](item<1> id) { pData[id]++; }); }); - q.submit([&](handler &cgh) { + q.submit([&](handler& cgh) { auto pData1 = bufferA.get_access(cgh); auto pData2 = bufferB.get_access(cgh); cgh.parallel_for(range<1>(elements), [=](item<1> id) { pData2[id] += pData1[id]; }); }); - q.submit([&](handler &cgh) { + q.submit([&](handler& cgh) { auto pData1 = bufferA.get_access(cgh); auto pData2 = bufferC.get_access(cgh); cgh.parallel_for( range<1>(elements), [=](item<1> id) { pData2[id] -= pData1[id]; }); }); - q.submit([&](handler &cgh) { + q.submit([&](handler& cgh) { auto pData1 = bufferB.get_access(cgh); auto pData2 = bufferC.get_access(cgh); cgh.parallel_for(range<1>(elements), [=](item<1> id) { @@ -1138,7 +1139,7 @@ submitted in its entirety for execution via auto exec_graph = graph.finalize(q.get_context()); // Execute graph - q.submit([&](handler &cgh) { + q.submit([&](handler& cgh) { cgh.exec_graph(exec_graph); }); From 3b804ba4a673c2bab3726b955cb3b0111da6537a Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 21 Dec 2022 10:38:25 +0000 Subject: [PATCH 35/82] Minor spec clarifications * [SYCL] Rename exec_graph to graph - Rename the exec_graph functions of the handler and queue to graph * [SYCL] Revise wording around finalize - Revised wording around finalize to make it clear a new executable graph is being created * [SYCL] Clarify mixed graph API usage - Clarify the wording around mixing explicit and record and replay APIs - Included a small illustrative code example. 
--- .../proposed/sycl_ext_oneapi_graph.asciidoc | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 6e5a96bc6e666..35e41c73d5483 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -157,10 +157,16 @@ example via library function calls. While the explicit API can better express what data is internal to the graph for optimization, and dependencies don't need to be inferred. -It is valid to combine these two mechanisms sequentially when constructing a -graph, however it is not valid to use them concurrently. An error will be thrown -if a user attempts to use the explicit API to add a node to a graph which is -being recorded to by a queue. +It is valid to combine these two mechanisms, however it is invalid to modify +a graph using the explicit API while that graph is currently being recorded to, +for example: + +[source, c++] +---- +queue.begin_recording(graph); +graph.add(/*command group*/); // Invalid as graph is being recorded to +queue.end_recording(); +---- == Specification @@ -269,7 +275,7 @@ buffers. ==== Sub-Graph A node in a graph can take the form of a nested sub-graph. This occurs when -a command-group submission that invokes `handler::exec_graph()` with an +a command-group submission that invokes `handler::graph()` with an executable graph object is added to the graph as a node. 
=== API Modifications @@ -343,17 +349,17 @@ public: /* -- graph convenience shortcuts -- */ - event exec_graph(command_graph graph); - event exec_graph(command_graph graph, + event graph(command_graph graph); + event graph(command_graph graph, event depEvent); - event exec_graph(command_graph graph, + event graph(command_graph graph, const std::vector& depEvents); }; // New methods added to the sycl::handler class class handler { public: - void exec_graph(command_graph graph); + void graph(command_graph graph); } } // namespace sycl @@ -581,7 +587,7 @@ using namespace ext::oneapi::experimental; command_graph finalize(const context& syclContext) const; ---- -|Synchronous operation that creates a graph in the executable state with a +|Synchronous operation that creates a new graph in the executable state with a fixed topology that can be submitted for execution on any queue sharing the supplied context. It is valid to call this method multiple times to create subsequent executable graphs. It is also valid to continue to add new nodes to @@ -598,7 +604,7 @@ Parameters: * `syclContext` - The context associated with the queues to which the executable graph will be able to be submitted. -Returns: An executable graph object which can be submitted to a queue. +Returns: A new executable graph object which can be submitted to a queue. Exceptions: @@ -837,33 +843,33 @@ state, `false` otherwise. [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::exec_graph(command_graph graph) +event queue::graph(command_graph graph) ---- |Queue shortcut function that is equivalent to submitting a command-group -containing `handler::exec_graph(graph)`. +containing `handler::graph(graph)`. 
| [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::exec_graph(command_graph graph, +event queue::graph(command_graph graph, event depEvent); ---- |Queue shortcut function that is equivalent to submitting a command-group -containing `handler::depends_on(depEvent)` and `handler::exec_graph(graph)`. +containing `handler::depends_on(depEvent)` and `handler::graph(graph)`. | [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::exec_graph(command_graph graph, +event queue::graph(command_graph graph, const std::vector& depEvents); ---- |Queue shortcut function that is equivalent to submitting a command-group -containing `handler::depends_on(depEvents)` and `handler::exec_graph(graph)`. +containing `handler::depends_on(depEvents)` and `handler::graph(graph)`. |=== ==== New Handler Member Functions @@ -875,7 +881,7 @@ Table 10. Additional member functions of the `sycl::handler` class. [source,c++] ---- using namespace ext::oneapi::experimental; -void handler::exec_graph(command_graph graph) +void handler::graph(command_graph graph) ---- |Invokes the execution of a graph. Support for invoking an executable graph, @@ -1049,7 +1055,7 @@ int main() { auto exec = g.finalize(q.get_context()); // use queue shortcut for graph submission - q.exec_graph(exec).wait(); + q.graph(exec).wait(); // memory can be freed inside or outside the graph sycl::free(z, q.get_context()); @@ -1070,7 +1076,7 @@ command-groups submitted to the queue. Once the graph is complete, recording finishes on the queue to put it back into the default executing state. The graph is then finalized so that no more nodes can be added. Lastly, the graph is submitted in its entirety for execution via -`handler::exec_graph(command_graph)`. +`handler::graph(command_graph)`. 
[source, c++] ---- @@ -1140,7 +1146,7 @@ submitted in its entirety for execution via // Execute graph q.submit([&](handler& cgh) { - cgh.exec_graph(exec_graph); + cgh.graph(exec_graph); }); ---- From 3d685559c6ca71c0e00772372232bb14b10f8194 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 3 Jan 2023 12:33:55 -0600 Subject: [PATCH 36/82] Apply suggestions from code review Co-authored-by: Greg Lueck --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 35e41c73d5483..a35902dea4153 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1,4 +1,4 @@ -= SYCL_EXT_ONEAPI_GRAPH += sycl_ext_oneapi_graph :source-highlighter: coderay :coderay-linenums-mode: table @@ -21,16 +21,16 @@ Copyright (c) 2022 Intel Corporation. All rights reserved. -IMPORTANT: This specification is a draft. - NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. -This extension is written against the SYCL 2020 revision 5 specification. All -references below to the "core SYCL specification" or to section numbers in the -SYCL specification refer to that revision. +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues == Contributors @@ -50,6 +50,12 @@ Bjoern Knafla, Codeplay + Lukas Sommer, Codeplay + Ronan Keryell, AMD + +== Dependencies + +This extension is written against the SYCL 2020 revision 6 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. 
+ == Status This is a proposed extension specification, intended to gather community @@ -494,7 +500,7 @@ Table 6. Member functions of the `command_graph` class. using namespace ext::oneapi::experimental; node add(const property_list& propList = {}); ---- -|This creates an empty node which is associated to no task. Its intended use is +|This creates an empty node which contains no command. Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. From 2c0c63312a388ab2c9de30e99c686c1d2754ecd3 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 9 Jan 2023 11:08:33 +0000 Subject: [PATCH 37/82] [SYCL] Change `handler::graph()` to `handler::ext_oneapi_graph()` This is based on [feedback](https://github.com/intel/llvm/pull/5626#discussion_r1055830885) which points out that [Section 6.3.2](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_names_for_extensions_to_existing_classes_or_enumerations) of the SYCL spec says to use a vendor prefix for new functions to existing classes. I've not updated the `queue::begin_recording` and `queue::end_recording` entry points with this convention, as they will be removed in PR https://github.com/reble/llvm/pull/58 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a35902dea4153..a8995efa53a02 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -281,7 +281,7 @@ buffers. ==== Sub-Graph A node in a graph can take the form of a nested sub-graph. 
This occurs when -a command-group submission that invokes `handler::graph()` with an +a command-group submission that invokes `handler::ext_oneapi_graph()` with an executable graph object is added to the graph as a node. === API Modifications @@ -355,17 +355,17 @@ public: /* -- graph convenience shortcuts -- */ - event graph(command_graph graph); - event graph(command_graph graph, + event ext_oneapi_graph(command_graph graph); + event ext_oneapi_graph(command_graph graph, event depEvent); - event graph(command_graph graph, + event ext_oneapi_graph(command_graph graph, const std::vector& depEvents); }; // New methods added to the sycl::handler class class handler { public: - void graph(command_graph graph); + void ext_oneapi_graph(command_graph graph); } } // namespace sycl @@ -849,33 +849,35 @@ state, `false` otherwise. [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::graph(command_graph graph) +event queue::ext_oneapi_graph(command_graph graph) ---- |Queue shortcut function that is equivalent to submitting a command-group -containing `handler::graph(graph)`. +containing `handler::ext_oneapi_graph(graph)`. | [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::graph(command_graph graph, +event queue::ext_oneapi_graph(command_graph graph, event depEvent); ---- |Queue shortcut function that is equivalent to submitting a command-group -containing `handler::depends_on(depEvent)` and `handler::graph(graph)`. +containing `handler::depends_on(depEvent)` and +`handler::ext_oneapi_graph(graph)`. | [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::graph(command_graph graph, +event queue::ext_oneapi_graph(command_graph graph, const std::vector& depEvents); ---- |Queue shortcut function that is equivalent to submitting a command-group -containing `handler::depends_on(depEvents)` and `handler::graph(graph)`. +containing `handler::depends_on(depEvents)` and +`handler::ext_oneapi_graph(graph)`. 
|=== ==== New Handler Member Functions @@ -887,7 +889,7 @@ Table 10. Additional member functions of the `sycl::handler` class. [source,c++] ---- using namespace ext::oneapi::experimental; -void handler::graph(command_graph graph) +void handler::ext_oneapi_graph(command_graph graph) ---- |Invokes the execution of a graph. Support for invoking an executable graph, @@ -1061,7 +1063,7 @@ int main() { auto exec = g.finalize(q.get_context()); // use queue shortcut for graph submission - q.graph(exec).wait(); + q.ext_oneapi_graph(exec).wait(); // memory can be freed inside or outside the graph sycl::free(z, q.get_context()); @@ -1082,7 +1084,7 @@ command-groups submitted to the queue. Once the graph is complete, recording finishes on the queue to put it back into the default executing state. The graph is then finalized so that no more nodes can be added. Lastly, the graph is submitted in its entirety for execution via -`handler::graph(command_graph)`. +`handler::ext_oneapi_graph(command_graph)`. [source, c++] ---- @@ -1152,7 +1154,7 @@ submitted in its entirety for execution via // Execute graph q.submit([&](handler& cgh) { - cgh.graph(exec_graph); + cgh.ext_oneapi_graph(exec_graph); }); ---- From d49571e331c6b15311da69299c0a01a253645257 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 10 Jan 2023 11:26:03 +0000 Subject: [PATCH 38/82] [SYCL] Change record & replay relationship between queue / graph Better aligns the queue record graph creation mechanism with the [kernel fusion extension](https://github.com/intel/llvm/pull/7098) ```cpp ext::codeplay::experimental::fusion_wrapper w{q}; w.start_fusion(); // 'q' submissions w.complete_fusion() ``` By changing the relationship between a queue and a graph so that recording starts and finishes on a graph we better match kernel fusion. This design is also more exception safe as `end_recording()` can be called in a RAII approach when a graph is destroyed. 
As a result a graph is now created from queue recording like: ```cpp ext::oneapi::experimental::command_graph graph; graph.begin_recording({q}); // 'q' submissions graph.end_recording(); ``` Addresses Issue https://github.com/reble/llvm/issues/53 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 211 +++++++++++++----- 1 file changed, 153 insertions(+), 58 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a8995efa53a02..07057217ba058 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -169,9 +169,9 @@ for example: [source, c++] ---- -queue.begin_recording(graph); +graph.begin_recording(queue); graph.add(/*command group*/); // Invalid as graph is being recorded to -queue.end_recording(); +graph.end_recording(); ---- == Specification @@ -327,6 +327,13 @@ public: command_graph(const property_list& propList = {}); command_graph finalize(const context& syclContext) const; + bool begin_recording(queue recordingQueue); + bool begin_recording(const std::vector& recordingQueues); + + bool end_recording(); + bool end_recording(queue recordingQueue); + bool end_recording(const std::vector& recordingQueues); + node add(const property_list& propList = {}); template @@ -350,9 +357,6 @@ public: using namespace ext::oneapi::experimental; class queue { public: - bool begin_recording(command_graph& graph); - bool end_recording(); - /* -- graph convenience shortcuts -- */ event ext_oneapi_graph(command_graph graph); @@ -687,7 +691,120 @@ Exceptions: |=== -Table 8. Member functions of the `command_graph` class (executable graph update). +Table 8. Member functions of the `command_graph` class for queue recording. 
+[cols="2a,a"] +|=== +|Member function|Description + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool begin_recording(queue recordingQueue) +---- + +|Synchronously changes the state of `recordingQueue` to the +`queue_state::recording` state. + +Parameters: + +* `recordingQueue` - A `sycl::queue` object to change to the + `queue_state::recording` state and start recording commands to the graph + instance. + +Returns: `true` if `recordingQueue` has its state changed from +`queue_state::executing` to `queue_state::recording`, `false` otherwise. + +Exceptions: + +* Throws synchronously with error code `invalid` if `recordingQueue` is + already recording to a different graph. + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool begin_recording(const std::vector& recordingQueues) +---- + +|Synchronously changes the state of each queue in `recordingQueues` to the +`queue_state::recording` state. + +Parameters: + +* `recordingQueues` - List of `sycl::queue` objects to change to the + `queue_state::recording` state and start recording commands to the graph + instance. + +Returns: `true` if any queue in `recordingQueues` has its state changed from +`queue_state::executing` to `queue_state::recording`, `false` otherwise. + +Exceptions: + +* Throws synchronously with error code `invalid` if any queue in + `recordingQueues` is already recording to a different graph. + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool end_recording() +---- + +|Synchronously finishes recording on all queues that are recording to the +graph and sets their state to `queue_state::executing`. + +Returns: `true` if any queue recording to the graph has its state changed from +`queue_state::recording` to `queue_state::executing`, `false` otherwise. 
+ +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool end_recording(queue recordingQueue) +---- + +|Synchronously changes the state of `recordingQueue` to the +`queue_state::executing` state. + +Parameters: + +* `recordingQueue` - A `sycl::queue` object to change to the executing state. + +Returns: `true` if `recordingQueue` has its state changed from +`queue_state::recording` to `queue_state::executing`, `false` otherwise. + +Exceptions: + +* Throws synchronously with error code `invalid` if `recordingQueue` is + recording to a different graph. + +| +[source, c++] +---- +using namespace ext::oneapi::experimental; +bool end_recording(const std::vector& recordingQueues) +---- + +|Synchronously changes the state of each queue in `recordingQueues` to the +`queue_state::executing` state. + +Parameters: + +* `recordingQueues` - List of `sycl::queue` objects to change to the executing + state. + +Returns: `true` if any queue in `recordingQueues` has its state changed from +`queue_state::recording` to `queue_state::executing`, `false` otherwise. + +Exceptions: + +* Throws synchronously with error code `invalid` if any queue in + `recordingQueues` is recording to a different graph. + +|=== + +Table 9. Member functions of the `command_graph` class (executable graph update). [cols="2a,a"] |=== |Member function|Description @@ -733,8 +850,7 @@ put into a mode where command-groups are recorded to a graph rather than submitted immediately for execution. <> are also added to the -`sycl::queue` class with this extension. Two functions for selecting the state -of the queue, and another function for submitting a graph to the queue. +`sycl::queue` class in this extension as queue shortcuts for `handler::graph()`. ==== Queue State @@ -762,7 +878,7 @@ The state of a queue can be queried with `queue::get_info` using template parameter `info::queue::state`. The following entry is added to the {queue-info-table}[queue info table] to define this query: -Table 9. 
Queue info query +Table 10. Queue info query [cols="2a,a,a"] |=== | Queue Descriptors | Return Type | Description @@ -805,46 +921,11 @@ property and this graph extension. ==== New Queue Member Functions -Table 8. Additional member functions of the `sycl::queue` class. +Table 11. Additional member functions of the `sycl::queue` class. [cols="2a,a"] |=== |Member function|Description -| -[source, c++] ----- -using namespace ext::oneapi::experimental; -bool queue::begin_recording(command_graph& graph) ----- - -|Synchronously changes the state of the queue to the `queue_state::recording` -state. - -Parameters: - -* `graph` - Graph object to start recording commands to. - -Returns: `true` if the queue was previously in the `queue_state::executing` -state, `false` otherwise. - -Exceptions: - -* Throws synchronously with error code `invalid` if the queue is already - recording to a different graph. - -| -[source, c++] ----- -using namespace ext::oneapi::experimental; -bool queue::end_recording() ----- - -|Synchronously changes the state of the queue to the `queue_state::executing` -state. - -Returns: `true` if the queue was previously in the `queue_state::recording` -state, `false` otherwise. - | [source,c++] ---- @@ -882,7 +963,7 @@ containing `handler::depends_on(depEvents)` and ==== New Handler Member Functions -Table 10. Additional member functions of the `sycl::handler` class. +Table 12. Additional member functions of the `sycl::handler` class. [cols="2a,a"] |=== |Member function|Description @@ -920,25 +1001,38 @@ The returned value from the `info::queue::state` should be considered immediately stale in multi-threaded usage, as another thread could have preemptively changed the state of the queue. 
+=== Exception Safety + +In addition to the destruction semantics provided by the SYCL +{crs}[common reference semantics], when a modifiable `command_graph` is +destroyed recording is ended on any queues that are recording to that +graph, equivalent to `this->end_recording()`. + +As a result users don't need to manually wrap queue recording code in a +`try` / `catch` block to reset the state of recording queues on an exception +back to the executing state. Instead, an uncaught exception destroying the +modifiable graph will perform this action, useful in RAII pattern usage. + === Error Handling Errors are reported through exceptions, as usual in the SYCL API. For new APIs, submitting a graph for execution can generate unspecified asynchronous errors, while `command_graph::finalize()` may throw unspecified synchronous exceptions. -Synchronous exception errors codes are defined for both -`queue::begin_recording()` and `command_graph::update()`. +Synchronous exception errors codes are defined for all of +`command_graph::begin_recording()`, `command_graph::end_recording()` and +`command_graph::update()`. When a queue is in recording mode asynchronous exceptions will not be -generated, as no device execution is occuring. Synchronous errors specified as +generated, as no device execution is occurring. Synchronous errors specified as being thrown in the default queue executing state, will still be thrown when a queue is in the recording state. -The `queue::begin_recording` and `queue::end_recording` entry-points return a -`bool` value informing the user whether a state change occurred. False is -returned rather than throwing an exception when state isn't changed. This design -is because the queue is already in the state the user desires, so if the -function threw an exception in this case, the application would likely swallow -it and then proceed. 
+The `command_graph::begin_recording` and `command_graph::end_recording` +entry-points return a `bool` value informing the user whether a related queue +state change occurred. False is returned rather than throwing an exception when +no queue state is changed. This design is because the queues are already in +the state the user desires, so if the function threw an exception in this case, +the application would likely swallow it and then proceed. While a queue is in the recording state, methods performed on that queue which are not command submissions behave as normal. This includes waits, throws, and @@ -1100,7 +1194,7 @@ submitted in its entirety for execution via // `q` will be put in the recording state where commands are recorded to // `graph` rather than submitted for execution immediately. - q.begin_recording(graph); + graph.begin_recording(q); // Record commands to `graph` with the following topology. // @@ -1143,9 +1237,9 @@ submitted in its entirety for execution via }); }); - // queue will be returned to the executing state where commands are + // queue `q` will be returned to the executing state where commands are // submitted immediately for extension. - q.end_recording(); + graph.end_recording(); } // Finalize the modifiable graph to create an executable graph that can be @@ -1196,4 +1290,5 @@ this feature in the extension. |4|2022-08-10|Pablo Reble|Adding USM shortcuts |5|2022-10-21|Ewan Crawford|Merge in Codeplay vendor extension |6|2022-11-14|Ewan Crawford|Change graph execution to be a function on the handler +|7|2022-12-15|Ewan Crawford|Change record & replay relationship between graph and queue. |======================================== From fe7cea3a1da37930a87f7b3ef53cdc47560abe16 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 12 Jan 2023 15:11:00 +0000 Subject: [PATCH 39/82] [SYCL] Add property_list parameter to finalize Introduces an optional `property_list` parameter when creating an executable graph with finalize. 
No properties are defined that could be passed here, but the kernel fusion API `fusion_wrapper::complete_fusion()` takes a property list, which could be relevant in future. Additionally, CUDA has a `cudaGraphInstantiateWithParams()` entry-point that this property list would provide equivalent coverage for. Using a `property_list` in this way is analogous to how [kernel_bundle](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interfaces.bundles.overview.synopsis) `compile()`/`link()`/`build()` take a property list and return a new object with different state. --- .../extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 07057217ba058..27dbd9f1fe81d 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -325,7 +325,9 @@ template<> class command_graph { public: command_graph(const property_list& propList = {}); - command_graph finalize(const context& syclContext) const; + + command_graph + finalize(const context& syclContext, const property_list& propList = {}) const; bool begin_recording(queue recordingQueue); bool begin_recording(const std::vector& recordingQueues); @@ -594,7 +596,8 @@ Exceptions: [source,c++] ---- using namespace ext::oneapi::experimental; -command_graph finalize(const context& syclContext) const; +command_graph +finalize(const context& syclContext, const property_list& propList = {}) const; ---- |Synchronous operation that creates a new graph in the executable state with a @@ -614,6 +617,9 @@ Parameters: * `syclContext` - The context associated with the queues to which the executable graph will be able to be submitted. +* `propList` - Optional parameter for passing properties. No finalization + properties are defined by this extension. 
+ Returns: A new executable graph object which can be submitted to a queue. Exceptions: From 0952f6a0e53641005b2226afe72548dd33a2fddc Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 15 Feb 2023 10:16:41 +0000 Subject: [PATCH 40/82] [SYCL] Use counter attribute for table numbers (#76) - Use the ASCIIDOC counter attribute to generate table numbers --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 27dbd9f1fe81d..40fb86fe55459 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -186,7 +186,7 @@ Applications can test for the existence of this macro to determine if the implementation supports this feature, or applications can test the macro's value to determine which of the extension's APIs the implementation supports. -Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. +Table {counter: tableNumber}. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. [%header,cols="1,5"] |=== |Value |Description @@ -197,7 +197,7 @@ Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. :explicit-memory-ops: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:explicitmemory -Table 2. Terminology. +Table {counter: tableNumber}. Terminology. [%header,cols="1,3"] |=== | Concept | Description @@ -219,7 +219,7 @@ Table 2. Terminology. When using the explicit graph building API to construct a graph, nodes and edges are captured as follows. -Table 3. Explicit Graph Definition. +Table {counter: tableNumber}. Explicit Graph Definition. 
[%header,cols="1,3"] |=== | Concept | Description @@ -247,7 +247,7 @@ returned from a queue submission captured by a queue recording to the same graph When using the record & replay API to construct a graph by recording a queue, nodes and edges are captured as follows. -Table 4. Recorded Graph Definition. +Table {counter: tableNumber}. Recorded Graph Definition. [%header,cols="1,3"] |=== | Concept | Description @@ -468,7 +468,7 @@ create the executable graphs, with the nodes added in the same order. ==== Graph Member Functions -Table 5. Constructor of the `command_graph` class. +Table {counter: tableNumber}. Constructor of the `command_graph` class. [cols="2a,a"] |=== |Constructor|Description @@ -495,7 +495,7 @@ Parameters: |=== -Table 6. Member functions of the `command_graph` class. +Table {counter: tableNumber}. Member functions of the `command_graph` class. [cols="2a,a"] |=== |Member function|Description @@ -634,7 +634,7 @@ Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed. -Table 7. Member functions of the `command_graph` class (memory operations). +Table {counter: tableNumber}. Member functions of the `command_graph` class (memory operations). [cols="2a,a"] |=== |Member function|Description @@ -697,7 +697,7 @@ Exceptions: |=== -Table 8. Member functions of the `command_graph` class for queue recording. +Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. [cols="2a,a"] |=== |Member function|Description @@ -810,7 +810,7 @@ Exceptions: |=== -Table 9. Member functions of the `command_graph` class (executable graph update). +Table {counter: tableNumber}. Member functions of the `command_graph` class (executable graph update). 
[cols="2a,a"] |=== |Member function|Description @@ -884,7 +884,7 @@ The state of a queue can be queried with `queue::get_info` using template parameter `info::queue::state`. The following entry is added to the {queue-info-table}[queue info table] to define this query: -Table 10. Queue info query +Table {counter: tableNumber}. Queue info query [cols="2a,a,a"] |=== | Queue Descriptors | Return Type | Description @@ -927,7 +927,7 @@ property and this graph extension. ==== New Queue Member Functions -Table 11. Additional member functions of the `sycl::queue` class. +Table {counter: tableNumber}. Additional member functions of the `sycl::queue` class. [cols="2a,a"] |=== |Member function|Description @@ -969,7 +969,7 @@ containing `handler::depends_on(depEvents)` and ==== New Handler Member Functions -Table 12. Additional member functions of the `sycl::handler` class. +Table {counter: tableNumber}. Additional member functions of the `sycl::handler` class. [cols="2a,a"] |=== |Member function|Description From 08a426399c01353c1b0c610db564ef8b9f821d87 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 15 Feb 2023 12:26:03 +0000 Subject: [PATCH 41/82] [SYCL] Add note on CGF evaluation (#74) - Add a note on CGF evaluation behaviour. 
--- .../proposed/sycl_ext_oneapi_graph.asciidoc | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 40fb86fe55459..bc723ab387f6a 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1060,6 +1060,7 @@ required by the graph (such as after being replaced through executable graph upd === Host Tasks :host-task: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:interfaces.hosttasks +:cg-scope: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:command.group.scope A {host-task}[host task] is a native C++ callable, scheduled according to SYCL dependency rules. It is valid to record a host task as part of graph, though it @@ -1070,6 +1071,30 @@ device at once. Host tasks can be updated as part of <> by replacing the whole node with the new callable. +=== Command Group Function Evaluation + +Host code within a command group function object is evaluated when the command +group is added to a graph. This is either before the return of the call to +`command_graph::add()` when using the explicit API or before the return of the call to +`queue::submit()` when submitting a command group to a queue that is recording to a graph. +This behaviour is in keeping with the existing {cg-scope}[command group] behaviour but may have +implications for command group functions containing arbitrary host code. This could +affect the behaviour of captured code due to the delayed execution of commands. + +This does not apply to code within a {host-task}[host task] which is +evaluated as normal during command graph execution. 
+ +[source,c++] +---- +using namespace ext::oneapi::experimental; +auto node = graph.add([&](sycl::handler& cgh){ +// Host code here is evaluated during the call to add() +cgh.host_task([=](){ + // Code here is evaluated as part of executing the command graph node +}); +}); +---- + == Examples [NOTE] From f6773340b7afcdfc177a675b44dcfad4da4f50fc Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 15 Feb 2023 14:08:00 +0000 Subject: [PATCH 42/82] [SYCL] Add section about buffer limitations (#73) - Adds section to the spec on buffer limitations around the copy-back mechanism. --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index bc723ab387f6a..b325a4a1a6d6e 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1057,6 +1057,29 @@ extended either for the lifetime of the graph (including both modifiable graphs and the executable graphs created from them) or until the buffer is no longer required by the graph (such as after being replaced through executable graph update). +Because of the extension of storage lifetimes, users should avoid the use of the +buffer copy-back on destruction mechanism. If used in code intended to be +executed as part of a graph, it may not perform as expected. + +=== Buffer Limitations for Record & Replay API + +Because of the delayed execution of a recorded graph it is not possible to support +captured code which relies on the copy-back on destruction behaviour of buffers. +Typically applications would rely on this behaviour to do work on the host which +cannot inherently be captured inside a command graph. Thus when recording to a graph +it is an error to submit a command which has an accessor on a buffer which would +cause a write-back to happen. 
Using an incompatible buffer in this case will result +in a synchronous error being thrown with error code `invalid`. + +The copy-back mechanism can be disabled explicitly for buffers with attached host +storage using either `buffer::set_final_data(nullptr)` or +`buffer::set_copy_back(false)`. + +It is also an error to create a host accessor to a buffer which is used in +commands which are currently being recorded to a command graph. Attempting to +construct a host accessor to an incompatible buffer will result in a +synchronous error being thrown with error code `invalid`. + === Host Tasks :host-task: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:interfaces.hosttasks From e866b31e48afc0d90017b925da075e2664e9ff12 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 27 Feb 2023 08:30:04 +0000 Subject: [PATCH 43/82] [SYCL] Clarify event/queue wait behaviour - Update wording around event waits to explicitly disallow waiting on events or a queue returned from a graph submission. - Update error handling section on queue operations - Specify status of events returned from recording submissions --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index b325a4a1a6d6e..aa184c8b2ee29 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -271,9 +271,9 @@ dependencies in one of two ways. Firstly, through buffer accessors that represent data dependencies between two command groups captured as nodes. Secondly, by using the `handler::depends_on()` mechanism inside a command group captured as a node. 
However, for an event passed to `handler::depends_on()` to -create an edge, it must be a default constructed event returned from a queue -submission captured by the same graph. Otherwise, the event will be ignored and -no dependency edge will be created in the graph. `handler::depends_on()` can be +create an edge, it must be an event returned from a queue +submission captured by the same graph. Otherwise, a synchronous error will be +thrown with error code `invalid`. `handler::depends_on()` can be used to express edges when a user is working with USM memory rather than SYCL buffers. |=== @@ -895,10 +895,17 @@ Table {counter: tableNumber}. Queue info query |=== -A default constructed event is returned when a user submits a command-group to -a queue in the recording state. These events have status -`info::event_command_status::complete` and a user waiting on them will return -immediately. +Events returned from queue submissions when a queue is in the recording state +may only be used as parameters to `handler::depends_on()` or as dependent +events for queue shortcuts like `queue::parallel_for()` for submissions which +are being recorded to the same modifiable `command_graph`. These events have +status `info::event_command_status::complete`. + +Waiting on an event returned from a queue submission recorded to a graph +will throw synchronously with error code `invalid`. + +Calling `queue::wait()` on a queue in the recording state is an error and +will throw synchronously with error code `invalid`. ==== Queue Properties @@ -1040,13 +1047,14 @@ no queue state is changed. This design is because the queues are already in the state the user desires, so if the function threw an exception in this case, the application would likely swallow it and then proceed. -While a queue is in the recording state, methods performed on that queue which -are not command submissions behave as normal. This includes waits, throws, and -queries on the queue. 
These are all ignored by the graph system, as opposed to -throwing an exception when in queue recording mode. This is because otherwise -there would be no thread safe way for a user to check they could call these -functions without throwing, as a query about the state of the queue may be -immediately stale. +While a queue is in the recording state, methods peformed on that queue which +are not command submissions behave as normal except for waits. Waiting on a +queue in the recording state is an error and will throw a synchronous +exception. Other methods are ignored by the graph system as opposed to +throwing in recording mode. As any query about the state of the queue may +be immediately stale, any code which relies on queue waits should take care +to ensure waits are not performed on queues in recording mode. For example by +using separate queues for graph recording and normal queue operations. === Storage Lifetimes From 7ceb00561278e82697431a76741ae51590d63115 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 16 Mar 2023 09:10:17 +0000 Subject: [PATCH 44/82] [SYCL] Expand on graphs profiling semantics Elaborates on what the semantics are of queries on events returned by executable graph submission. The event enters the running state when the first node starts executing on device, and completes once the last node finishes. Since a graph could be divided up into multiple PI command-buffers that may have work scheduled in-between, this could give a pessimistic view of graph execution time; however, I don't think we can guarantee better. In the future we could add explicit graph profiling nodes for a more fine-grained view of performance, but that's out of scope for now. 
Closes Issue https://github.com/reble/llvm/issues/52 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index aa184c8b2ee29..e16c748e11c89 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -899,7 +899,11 @@ Events returned from queue submissions when a queue is in the recording state may only be used as parameters to `handler::depends_on()` or as dependent events for queue shortcuts like `queue::parallel_for()` for submissions which are being recorded to the same modifiable `command_graph`. These events have -status `info::event_command_status::complete`. +status `info::event_command_status::complete`. The event status of an event +returned from an executable graph submission will have +`info::event_command_status::running` once any command group node starts +executing on a device, and status `info::event_command_status::complete` +once all the nodes have finished execution. Waiting on an event returned from a queue submission recorded to a graph will throw synchronously with error code `invalid`. @@ -926,7 +930,18 @@ ways: 2. `property::queue::enable_profiling` - This property has no effect on graph recording. When set on the queue a graph is submitted to however, it allows profiling information to be obtained from the event returned by a graph - submission. + submission. As it is not defined how a submitted graph will be split up for + scheduling at runtime, the `uint64_t` timestamp reported from a profiling + query on a graph execution event has the following semantics, which may be + pessimistic about execution time on device. + + * `info::event_profiling::command_submit` - Timestamp when the graph is + submitted to the queue. 
+ * `info::event_profiling::command_start` - Timestamp when the first + command-group node begins running. + * `info::event_profiling::command_end` - Timestamp when the last + command-group node completes execution. + For any other queue property that is defined by an extension, it is the responsibility of the extension to define the relationship between that queue From 8cef0c2b7dcfe84384b6c7098ca1917e05e7636a Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 16 Mar 2023 13:33:21 +0000 Subject: [PATCH 45/82] [SYCL] Remove memory allocation/free nodes There are [concerns with the currently specified approach](https://github.com/intel/llvm/pull/5626#discussion_r1084170330) so remove the API and advise users to allocate memory before submitting the graph. --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 157 +++++++----------- 1 file changed, 59 insertions(+), 98 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index e16c748e11c89..03a1adc7bb799 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -227,7 +227,7 @@ Table {counter: tableNumber}. Explicit Graph Definition. | Node | In the explicit graph building API nodes are created by the user invoking methods on a modifiable graph. Each node represent either a command-group -function, empty operation, or device memory allocation/free. +function or an empty operation. 
| Edge | In the explicit graph building API edges are primarily defined by the user @@ -341,9 +341,6 @@ public: template node add(T cgf, const property_list& propList = {}); - node add_malloc_device(void*& data, size_t numBytes, const property_list& propList = {}); - node add_free(void* data, const property_list& propList = {}); - void make_edge(node src, node dest); }; @@ -381,8 +378,8 @@ public: :crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics -Node is a class that encapsulates tasks like SYCL kernel functions, device -memory allocations/frees, or host tasks for deferred execution. A graph has to +Node is a class that encapsulates tasks like SYCL kernel functions, memory +operations, or host tasks for deferred execution. A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. @@ -630,73 +627,6 @@ Exceptions: |=== -Memory that is allocated by the following functions is owned by the specific -graph. When freed inside the graph, the memory is only accessible before the -`free` node is executed and after the `malloc` node is executed. - -Table {counter: tableNumber}. Member functions of the `command_graph` class (memory operations). -[cols="2a,a"] -|=== -|Member function|Description - -| -[source,c++] ----- -using namespace ext::oneapi::experimental; -node add_malloc_device(void*& data, size_t numBytes, const property_list& propList = {}); ----- -|Adding a node that encapsulates a memory allocation operation. - -Preconditions: - -* This member function is only available when the `command_graph` state is - `graph_state::modifiable`. - -Parameters: - -* `data` - Return parameter set to the address of memory allocated. - -* `numBytes` - Size in bytes to allocate. - -* `propList` - Zero or more properties can be provided to the constructed node - via an instance of `property_list`. 
- -Returns: The memory allocation node which has been added to the graph - -Exceptions: - -* Throws synchronously with error code `invalid` if a queue is recording - commands to the graph. - -| -[source,c++] ----- -using namespace ext::oneapi::experimental; -node add_free(void* data, const property_list& propList = {}); ----- -|Adding a node that encapsulates a memory free operation. - -Preconditions: - -* This member function is only available when the `command_graph` state is - `graph_state::modifiable`. - -Parameters: - -* `data` - Address of memory to free. - -* `propList` - Zero or more properties can be provided to the constructed node - via an instance of `property_list`. - -Returns: The memory freeing node which has been added to the graph. - -Exceptions: - -* Throws synchronously with error code `invalid` if a queue is recording - commands to the graph. - -|=== - Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. [cols="2a,a"] |=== @@ -1120,11 +1050,11 @@ by replacing the whole node with the new callable. === Command Group Function Evaluation Host code within a command group function object is evaluated when the command -group is added to a graph. This is either before the return of the call to +group is added to a graph. This is either before the return of the call to `command_graph::add()` when using the explicit API or before the return of the call to `queue::submit()` when submitting a command group to a queue that is recording to a graph. This behaviour is in keeping with the existing {cg-scope}[command group] behaviour but may have -implications for command group functions containing arbitrary host code. This could +implications for command group functions containing arbitrary host code. This could affect the behaviour of captured code due to the delayed execution of commands. 
This does not apply to code within a {host-task}[host task] which is @@ -1141,6 +1071,44 @@ cgh.host_task([=](){ }); ---- +=== Memory Allocation Nodes + +There is no provided interface for users to define a USM allocation/free +operation belonging to the scope of the graph. It would be error prone and +non-performant to allocate or free memory as a node executed during graph +submission. Instead, such a memory allocation API needs to provide a way to +return a pointer which won't be valid until the allocation is made on graph +finalization, as allocating at finalization is the only way to benefit from +the known graph scope for optimal memory allocation, and even optimize to +eliminate some allocations entirely. + +Such a deferred allocation strategy presents challenges however, and as a result +we recommend instead that prior to graph construction users perform core SYCL +USM allocations to be used in the graph submission. Before coming to this +recommendation we considered the following explicit graph building interfaces +for adding a memory allocation owned by the graph: + +1. Allocation function returning a reference to the raw pointer, i.e. `void*&`, + which will be instantiated on graph finalization with the location of the + allocated USM memory. + +2. Allocation function returning a handle to the allocation. Applications use + the handle in node command-group functions to access memory when allocated. + +3. Allocation function returning a pointer to a virtual allocation, only backed + with an actual allocation when graph is finalized or submitted. + +Design 1) has the drawback of forcing users to keep the user pointer variable +alive so that the reference is valid, which is unintuitive and is likely to +result in bugs. + +Design 2) introduces a handle object which has the advantage of being a less +error prone way to provide the pointer to the deferred allocation. 
However, it +requires kernel changes and introduces an overhead above the raw pointers that +are the advantage of USM. + +Design 3) needs specific backend support for deferred allocation. + == Examples [NOTE] @@ -1173,25 +1141,18 @@ int main() { sycl_ext::command_graph g; - float *x , *y, *z; - float *dotp = sycl::malloc_shared(1, q); + float *x = sycl::malloc_device(n, q); + float *y = sycl::malloc_device(n, q); + float *z = sycl::malloc_device(n, q); // Add commands to the graph to create the following topology. // - // x y z - // \ | / // i // / \ // a b - // \ / \ - // c fy - // | - // fx - - auto node_x = g.add_malloc_device(x, n * sizeof(float)); - auto node_y = g.add_malloc_device(y, n * sizeof(float)); - auto node_z = g.add_malloc_device(z, n * sizeof(float)); + // \ / + // c /* init data on the device */ auto node_i = g.add([&](sycl::handler& h) { @@ -1201,7 +1162,7 @@ int main() { y[i] = 2.0f; z[i] = 3.0f; }); - }, { sycl_ext::property::node::depends_on(node_x, node_y, node_z)}); + }); auto node_a = g.add([&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { @@ -1228,16 +1189,15 @@ int main() { }, { sycl_ext::property::node::depends_on(node_a, node_b)}); - auto node_fx = g.add_free(x, {sycl_ext::property::node::depends_on(node_c)}); - auto node_fy = g.add_free(y, {sycl_ext::property::node::depends_on(node_b)}); - auto exec = g.finalize(q.get_context()); // use queue shortcut for graph submission q.ext_oneapi_graph(exec).wait(); // memory can be freed inside or outside the graph - sycl::free(z, q.get_context()); + sycl::free(x, q); + sycl::free(y, q); + sycl::free(z, q); sycl::free(dotp, q); return 0; @@ -1338,14 +1298,15 @@ Allow an executable graph to contain nodes targeting different devices. 
**Outcome:** Under consideration -=== Record & Replay: Mark Internal Memory +=== Memory Allocation API -When a graph is created by recording a queue there is no way to tag memory -objects internal to the graph, which would enable optimizations on the internal -memory. Do we need an interface record & replay can use to identify buffers and -USM allocations not used outside of the graph? +We would like to provide an API that allows graph scope memory to be +allocated and used in nodes, such that optimizations can be done on +the allocation. No mechanism is currently provided, but see the +section on <> for +some designs being considered. -**Outcome:** Unresolved +**Outcome:** Designs under consideration === Executable Graph Update From 9f6f8fb9b0b1d11b5d0fecaa5ea540f5e99fbbf4 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 20 Mar 2023 09:18:15 -0500 Subject: [PATCH 46/82] [SYCL][DOC] Improve wording (#103) --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 03a1adc7bb799..af5c0feeeb0cf 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -66,7 +66,7 @@ not rely on APIs defined in this specification.* == Introduction -Through the use of command groups SYCL is already able to create a dependency +With command groups SYCL is already able to create a dependency graph (in the form of a directed acyclic graph) of kernel execution at runtime, as a command group object defines a set of requisites (edges) which must be satisfied for kernels (nodes) to be executed. 
However, because command-group @@ -130,7 +130,7 @@ To allow for prototype implementations of this extension to be developed quickly for evaluation the scope of this proposal was limited to a subset of these requirements. In particular, the serialization functionality (8), backend interoperability (9), and a profiling/debugging interface (3) were -omitted. As these are not easy to abstract over a number of backends without +omitted. As these are not easy to abstract over several backends without significant investigation. It is also hoped these features can be exposed as additive changes to the API, and thus introduced in future versions of the extension. @@ -226,7 +226,7 @@ Table {counter: tableNumber}. Explicit Graph Definition. | Node | In the explicit graph building API nodes are created by the user invoking -methods on a modifiable graph. Each node represent either a command-group +methods on a modifiable graph. Each node represents either a command-group function or an empty operation. | Edge @@ -256,7 +256,7 @@ Table {counter: tableNumber}. Recorded Graph Definition. | Nodes in a queue recorded graph represent each of the command group submissions of the program. Each submission encompasses either one or both of a.) some data movement, b.) a single asynchronous kernel launch. Nodes cannot -define forward edges, only backwards. This is, kernels can only create +define forward edges, only backwards. That is, kernels can only create dependencies on command-groups that have already been submitted. This means that transparently a node can depend on a previously recorded graph (sub-graph), which works by creating edges to the individual nodes in the old graph. Explicit @@ -379,7 +379,7 @@ public: :crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics Node is a class that encapsulates tasks like SYCL kernel functions, memory -operations, or host tasks for deferred execution. 
A graph has to +operations, or host tasks for deferred execution. A graph must be created first, the structure of a graph is defined second by adding nodes and edges. @@ -419,13 +419,13 @@ This extension adds a new `command_graph` object which follows the A `command_graph` represents a directed acyclic graph of nodes, where each node represents a single command or a sub-graph. The execution of a graph completes -when all of its nodes have completed. +when all its nodes have completed. A `command_graph` is built up by either recording queue submissions or explicitly adding nodes, then once the user is happy that the graph is complete, the graph instance is finalized into an executable variant which can have no more nodes added to it. Finalization may be a computationally expensive -operation as the runtime is able to perform optimizations based on the graph +operation as the runtime can perform optimizations based on the graph structure. After finalization the graph can be submitted for execution on a queue one or more times with reduced overhead. @@ -440,7 +440,7 @@ An instance of a `command_graph` object can be in one of two states: A `command_graph` object is constructed in the _recording_ state and is made _executable_ by the user invoking `command_graph::finalize()` to create a new executable instance of the graph. An executable graph cannot be converted -to a modifiable graph. After finalizing a graph in the modifiable state it is +to a modifiable graph. After finalizing a graph in the modifiable state, it is valid for a user to add additional nodes and finalize again to create subsequent executable graphs. The state of a `command_graph` object is made explicit by templating on state to make the class strongly typed, with the default template @@ -945,14 +945,14 @@ Parameters: The new functions in this extension are thread-safe, the same as member functions of classes in the base SYCL specification. 
If user code does -not perform synchronisation between two threads accessing the same queue, +not perform synchronization between two threads accessing the same queue, there is no strong ordering between events on that queue, and the kernel submissions, recording and finalization will happen in an undefined order. -In particular, when one thread ends recording on a queue while another +When one thread ends recording on a queue while another thread is submitting work, which kernels will be part of the subsequent graph is undefined. If user code enforces a total order on the queue -events, then the behaviour is well-defined, and will match the observable +events, then the behavior is well-defined, and will match the observable total order. The returned value from the `info::queue::state` should be considered @@ -966,7 +966,7 @@ In addition to the destruction semantics provided by the SYCL destroyed recording is ended on any queues that are recording to that graph, equivalent to `this->end_recording()`. -As a result users don't need to manually wrap queue recording code in a +As a result, users don't need to manually wrap queue recording code in a `try` / `catch` block to reset the state of recording queues on an exception back to the executing state. Instead, an uncaught exception destroying the modifiable graph will perform this action, useful in RAII pattern usage. @@ -992,13 +992,13 @@ no queue state is changed. This design is because the queues are already in the state the user desires, so if the function threw an exception in this case, the application would likely swallow it and then proceed. -While a queue is in the recording state, methods peformed on that queue which +While a queue is in the recording state, methods performed on that queue which are not command submissions behave as normal except for waits. Waiting on a queue in the recording state is an error and will throw a synchronous exception. 
Other methods are ignored by the graph system as opposed to throwing in recording mode. As any query about the state of the queue may be immediately stale, any code which relies on queue waits should take care -to ensure waits are not performed on queues in recording mode. For example by +to ensure waits are not performed on queues in recording mode. For example, by using separate queues for graph recording and normal queue operations. === Storage Lifetimes @@ -1014,12 +1014,12 @@ Because of the extension of storage lifetimes, users should avoid the use of the buffer copy-back on destruction mechanism. If used in code intended to be executed as part of a graph, it may not perform as expected. -=== Buffer Limtations for Record & Replay API +=== Buffer Limitations for Record & Replay API -Because of the delayed execution of a recorded graph it is not possible to support -captured code which relies on the copy-back on destruction behaviour of buffers. -Typically applications would rely on this behaviour to do work on the host which -cannot inherently be captured inside a command graph. Thus when recording to a graph +Because of the delayed execution of a recorded graph, it is not possible to support +captured code which relies on the copy-back on destruction behavior of buffers. +Typically, applications would rely on this behavior to do work on the host which +cannot inherently be captured inside a command graph. Thus, when recording to a graph it is an error to submit a command which has an accessor on a buffer which would cause a write-back to happen. Using an incompatible buffer in this case will result in a synchronous error being thrown with error code `invalid`. 
From 28acfa82f71262416d5d41f4928ca70fc516846f Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 24 Mar 2023 08:29:09 +0000 Subject: [PATCH 47/82] [SYCL] Squash revision history and remove resolved issue For our first beta release we should just be at revision 1, then later updates to a merged spec can bump this when we feel it is appropriate. This patch also removes the issue about whether executable graph update is valuable to users, as we've found evidence of the equivalent CUDA functionality in PyTorch, TensorFlow, and GROMACS. Removing this open issue closes https://github.com/reble/llvm/issues/8 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index af5c0feeeb0cf..aec359ec925ec 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1308,13 +1308,6 @@ some designs being considered. **Outcome:** Designs under consideration -=== Executable Graph Update - -Is there a ML usecase (e.g pytorch workload) which justifies the inclusion of -this feature in the extension. - -**Outcome:** Unresolved - == Revision History [cols="5,15,15,70"] @@ -1322,11 +1315,8 @@ this feature in the extension. [options="header"] |======================================== |Rev|Date|Author|Changes -|1|2022-02-11|Pablo Reble|Initial public working draft -|2|2022-03-11|Pablo Reble|Incorporate feedback from PR -|3|2022-05-25|Pablo Reble|Extend API and Example -|4|2022-08-10|Pablo Reble|Adding USM shortcuts -|5|2022-10-21|Ewan Crawford|Merge in Codeplay vendor extension -|6|2022-11-14|Ewan Crawford|Change graph execution to be a function on the handler -|7|2022-12-15|Ewan Crawford|Change record & replay relationship between graph and queue. 
+ +|1|2023-03-23|Pablo Reble, Ewan Crawford, Ben Tracy, Julian Miller +|Initial public working draft + |======================================== From 61926472cacd0788e62bc61ce5afb3fa4f091f38 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 24 Mar 2023 10:23:05 +0000 Subject: [PATCH 48/82] [SYCL] Specify a modifiable graph as having device specific nodes Modifies the API and definition of a command_graph to define a modifiable command_graph as having nodes targeting specific devices, rather than being device-agnostic. This change will allow a multi-device graph in the future by only removing some error conditions. To enable this, the explicit API has a `syclDevice` parameter introduced to `add()` for use as the device to process `cgf`. See https://github.com/reble/llvm/issues/7 for discussion. Co-authored-by: Pablo Reble --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 124 ++++++++++++++---- 1 file changed, 99 insertions(+), 25 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index aec359ec925ec..11906e10ab2aa 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -139,8 +139,11 @@ Another reason for deferring a serialize/deserialize API (8) is that its scope could extend from emitting the graph in a binary format, to emitting a standardized IR format that enables further device specific graph optimizations. -Multi-device support (7) is something we are looking into introducing into -the extension, which may result in API changes. +Multi-device support (7) is something that has been designed for in this +extension, with the definition of a graph node being device specific. However, +the ability for a user to define a single graph with nodes targeting different +devices is still disallowed until such a time as this feature can be backed up +by implementation coverage. 
=== Graph Building Mechanisms @@ -207,7 +210,7 @@ Table {counter: tableNumber}. Terminology. (edges), represented by the `command_graph` class. | Node -| A command, which can have different attributes. +| A command, which can have different attributes, targeting a specific device. | Edge | Dependency between commands as a happens-before relationship. @@ -253,15 +256,15 @@ Table {counter: tableNumber}. Recorded Graph Definition. | Concept | Description | Node -| Nodes in a queue recorded graph represent each of the command group -submissions of the program. Each submission encompasses either one or both of -a.) some data movement, b.) a single asynchronous kernel launch. Nodes cannot -define forward edges, only backwards. That is, kernels can only create -dependencies on command-groups that have already been submitted. This means that -transparently a node can depend on a previously recorded graph (sub-graph), -which works by creating edges to the individual nodes in the old graph. Explicit -memory operations without kernels, such as a memory copy, are still classed as -nodes under this definition, as the +| A node in a queue recorded graph represents a command group submission to the +device associated with the queue being recorded. Each submission encompasses +either one or both of a.) some data movement, b.) a single asynchronous kernel +launch. Nodes cannot define forward edges, only backwards. That is, kernels can +only create dependencies on command-groups that have already been submitted. +This means that transparently a node can depend on a previously recorded graph +(sub-graph), which works by creating edges to the individual nodes in the old +graph. Explicit memory operations without kernels, such as a memory copy, are +still classed as nodes under this definition, as the {explicit-memory-ops}[SYCL 2020 specification states] that these can be seen as specialized kernels executing on the device. 
@@ -339,7 +342,7 @@ public: node add(const property_list& propList = {}); template - node add(T cgf, const property_list& propList = {}); + node add(const device& syclDevice, T cgf, const property_list& propList = {}); void make_edge(node src, node dest); }; @@ -418,7 +421,8 @@ This extension adds a new `command_graph` object which follows the {crs}[common reference semantics] of other SYCL runtime objects. A `command_graph` represents a directed acyclic graph of nodes, where each node -represents a single command or a sub-graph. The execution of a graph completes +represents a single command for a specific device or a sub-graph. A graph may be +made up of nodes targeting different devices. The execution of a graph completes when all its nodes have completed. A `command_graph` is built up by either recording queue submissions or @@ -461,7 +465,8 @@ using the `command_graph::update()` method. This takes a graph in the modifiable state and updates the executable graph to use the node input & outputs of the modifiable graph, a technique called _Whole Graph Update_. The modifiable graph must have the same topology as the graph originally used to -create the executable graphs, with the nodes added in the same order. +create the executable graphs, with the nodes targeting the same devices and +added in the same order. ==== Graph Member Functions @@ -530,7 +535,7 @@ Exceptions: ---- using namespace ext::oneapi::experimental; template -node add(T cgf, const property_list& propList = {}); +node add(const device& syclDevice, T cgf, const property_list& propList = {}); ---- |This function adds a command group function object to a graph. The function object statically contains a group of commands, of which a single command is @@ -546,6 +551,13 @@ Preconditions: Parameters: +* `syclDevice` - Device to process `cgf` with to create node command-group. 
+ Commands are not required to execute on this device if the behaviour is + consistent with how they would not execute on the device associated with a + `sycl::queue` during regular queue submission. In particular host tasks, + sub-graphs with nodes targeting other devices, and memory copies may not + execute on `syclDevice`. + * `cgf` - Command group function object to be added as a node. * `propList` - Zero or more properties can be provided to the constructed node @@ -558,6 +570,9 @@ Exceptions: * Throws synchronously with error code `invalid` if a queue is recording commands to the graph. +* Throws synchronously with error code `invalid` if `syclDevice` is a different + device from the device targeted by the existing nodes of the graph. + | [source,c++] ---- @@ -625,6 +640,9 @@ Exceptions: A cycle may be introduced to the graph via a call to `make_edge()` that creates a forward dependency. +* Throws synchronously with error code `invalid` if the graph contains a + node which targets a device not present in `syclContext`. + |=== Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. @@ -656,6 +674,10 @@ Exceptions: * Throws synchronously with error code `invalid` if `recordingQueue` is already recording to a different graph. +* Throws synchronously with error code `invalid` if `recordingQueue` is + associated with a device which is different from the device already targeted + by the nodes of the graph. + | [source, c++] ---- @@ -939,6 +961,11 @@ Parameters: * `graph` - Graph object to execute. +Exceptions: + +* Throws synchronously with error code `invalid` if the handler is submitted + to a queue which doesn't have a SYCL context which matches the context of + the executable graph. |=== === Thread Safety @@ -977,13 +1004,20 @@ Errors are reported through exceptions, as usual in the SYCL API. 
For new APIs, submitting a graph for execution can generate unspecified asynchronous errors, while `command_graph::finalize()` may throw unspecified synchronous exceptions. Synchronous exception errors codes are defined for all of -`command_graph::begin_recording()`, `command_graph::end_recording()` and -`command_graph::update()`. +`command_graph::add()`, `command_graph::make_edge()`, `command_graph::update()`, +`command_graph::begin_recording()`, and `command_graph::end_recording()`. + +Submitting an executable graph using `handler::ext_oneapi_graph()` to +a queue with a different SYCL context than that of the executable graph will +result in a synchronous exception. When a queue is in recording mode asynchronous exceptions will not be generated, as no device execution is occurring. Synchronous errors specified as being thrown in the default queue executing state, will still be thrown when a -queue is in the recording state. +queue is in the recording state. A synchronous error with error code `invalid` +will also be thrown if a queue in recording mode tries to record a command to a +graph which already has nodes that target a device different from the device +associated with the recording queue. The `command_graph::begin_recording` and `command_graph::end_recording` entry-points return a `bool` value informing the user whether a related queue @@ -1063,7 +1097,7 @@ evaluated as normal during command graph execution. [source,c++] ---- using namespace ext::oneapi::experimental; -auto node = graph.add([&](sycl::handler& cgh){ +auto node = graph.add(device, [&](sycl::handler& cgh){ // Host code here is evaluated during the call to add() cgh.host_task([=](){ // Code here is evaluated as part of executing the command graph node @@ -1109,6 +1143,42 @@ are the advantage of USM. Design 3) needs specific backend support for deferred allocation. 
+=== Device Specific Graph + +A modifiable state `command_graph` contains nodes targeting specific devices, +rather than being a device agnostic representation only tied to devices on +finalization. This allows the implementation to process nodes which require +device information when the command group function is evaluated. For example, +a SYCL reduction implementation may desire the work-group/sub-group size, which +is normally gathered by the runtime from the device associated with the queue. + +This design also enables the future capability for a user to compose a graph +with nodes targeting different devices, allowing the benefits of defining an +execution graph ahead of submission to be extended to multi-device platforms. +Without this capability a user currently has to submit individual single-device +graphs and use events for dependencies, which is a usage model this extension is +aiming to optimize. Automatic load balancing of commands across devices is not a +problem this extension currently aims to solve, it is the responsibility of the +user to decide the device each command will be processed for, not the SYCL +runtime. + +A drawback of this design is that it is less convenient for the use-case where +a user would like to run the same graph on N devices. Rather than finalizing a +single modifiable graph N times for N devices in this scenario, the user now has +to record N modifiable graphs and then as finalize each of them. If this use +case does become a usability issue, we could provide a specific API to support +it. For example, an update API for the modifiable `command_graph`, similar to +parameters, so that the same modifiable graph can be updated to new devices then +re-finalized. There may need to be limitations on what devices a graph can be +updated to however, as device specific processing might not be possible to roll +back or defer. 
As such, it may be only possible to update to identical physical +devices, rather than re-target a GPU constructed graph to a CPU. + +The queue an executable graph is submitted to needs to have the same context as +the executable graph, however execution of a multi-device graph is not limited +to the device associated to the queue. Instead, the queue is used as the +mechanism to submit the graph and express execution dependencies. + == Examples [NOTE] @@ -1138,6 +1208,7 @@ int main() { float gamma = 3.0f; sycl::queue q; + sycl::device device = q.get_device(); sycl_ext::command_graph g; @@ -1155,7 +1226,7 @@ int main() { // c /* init data on the device */ - auto node_i = g.add([&](sycl::handler& h) { + auto node_i = g.add(device, [&](sycl::handler& h) { h.parallel_for(n, [=](sycl::id<1> it){ const size_t i = it[0]; x[i] = 1.0f; @@ -1164,21 +1235,21 @@ int main() { }); }); - auto node_a = g.add([&](sycl::handler& h) { + auto node_a = g.add(device, [&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }, { sycl_ext::property::node::depends_on(node_i)}); - auto node_b = g.add([&](sycl::handler& h) { + auto node_b = g.add(device, [&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); }, { sycl_ext::property::node::depends_on(node_i)}); - auto node_c = g.add( + auto node_c = g.add(device, [&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), @@ -1296,7 +1367,10 @@ submitted in its entirety for execution via Allow an executable graph to contain nodes targeting different devices. -**Outcome:** Under consideration +**Outcome:** This feature has been designed for with the definition of a graph +node being device specific. 
However, the ability for a user to define a single +graph with nodes targeting different devices is still disallowed until such a +time as this feature can be backed up by implementation coverage. === Memory Allocation API From 59a35223c85ae6b9ef0691c36a7dbd40e9ac38f6 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 29 Mar 2023 13:19:41 -0500 Subject: [PATCH 49/82] Apply suggestions from code review Co-authored-by: Greg Lueck --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 11906e10ab2aa..2e45dee22ab2f 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -39,6 +39,7 @@ Julian Miller, Intel + John Pennycook, Intel + Guo Yejun, Intel + Dan Holmes, Intel + +Greg Lueck, Intel + Ewan Crawford, Codeplay + Ben Tracy, Codeplay + Duncan McBain, Codeplay + @@ -509,7 +510,7 @@ using namespace ext::oneapi::experimental; node add(const property_list& propList = {}); ---- |This creates an empty node which contains no command. Its intended use is -either a connection point inside a graph between groups of nodes, and can +to make a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. @@ -991,7 +992,7 @@ preemptively changed the state of the queue. In addition to the destruction semantics provided by the SYCL {crs}[common reference semantics], when a modifiable `command_graph` is destroyed recording is ended on any queues that are recording to that -graph, equivalent to `this->end_recording()`. +graph, equivalent to `+this->end_recording()+`. 
As a result, users don't need to manually wrap queue recording code in a `try` / `catch` block to reset the state of recording queues on an exception @@ -1098,10 +1099,10 @@ evaluated as normal during command graph execution. ---- using namespace ext::oneapi::experimental; auto node = graph.add(device, [&](sycl::handler& cgh){ -// Host code here is evaluated during the call to add() -cgh.host_task([=](){ - // Code here is evaluated as part of executing the command graph node -}); + // Host code here is evaluated during the call to add() + cgh.host_task([=](){ + // Code here is evaluated as part of executing the command graph node + }); }); ---- From c7a689b7ecf02b6b1219b18c3f163eadfccffa9e Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 30 Mar 2023 08:26:58 +0100 Subject: [PATCH 50/82] [SYCL][Doc] Pass a device to graph constructor (#113) Based on feedback from the Working Group it is clearer to represent the current single device characteristic of a graph by passing a device to the graph constructor that all nodes will target. 
Then remove the `device` parameter from the explicit API, as it can be added as an overload in a future revision due to the experimental nature of the extension. A diff of these changes against the commit prior to when the [original multi-device PR](https://github.com/reble/llvm/pull/83) went in can be seen [here](https://github.com/reble/llvm/compare/28acfa82f71262416d5d41f4928ca70fc516846f...reble:llvm:ewan/single_device) --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 92 ++++++------------- 1 file changed, 30 insertions(+), 62 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 2e45dee22ab2f..1268a499ace2a 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -140,11 +140,11 @@ Another reason for deferring a serialize/deserialize API (8) is that its scope could extend from emitting the graph in a binary format, to emitting a standardized IR format that enables further device specific graph optimizations. -Multi-device support (7) is something that has been designed for in this -extension, with the definition of a graph node being device specific. However, -the ability for a user to define a single graph with nodes targeting different -devices is still disallowed until such a time as this feature can be backed up -by implementation coverage. +Multi-device support (7) is something that we are considering introducing into +the extension in later revisions, which may result in API changes. It has been +planned for to the extent that the definition of a graph node is device +specific, however currently all nodes in a graph must target the same device +provided to the graph constructor. 
=== Graph Building Mechanisms @@ -328,7 +328,7 @@ class command_graph {}; template<> class command_graph { public: - command_graph(const property_list& propList = {}); + command_graph(const device& syclDevice, const property_list& propList = {}); command_graph finalize(const context& syclContext, const property_list& propList = {}) const; @@ -343,7 +343,7 @@ public: node add(const property_list& propList = {}); template - node add(const device& syclDevice, T cgf, const property_list& propList = {}); + node add(T cgf, const property_list& propList = {}); void make_edge(node src, node dest); }; @@ -422,9 +422,8 @@ This extension adds a new `command_graph` object which follows the {crs}[common reference semantics] of other SYCL runtime objects. A `command_graph` represents a directed acyclic graph of nodes, where each node -represents a single command for a specific device or a sub-graph. A graph may be -made up of nodes targeting different devices. The execution of a graph completes -when all its nodes have completed. +represents a single command for a specific device or a sub-graph. The execution +of a graph completes when all its nodes have completed. A `command_graph` is built up by either recording queue submissions or explicitly adding nodes, then once the user is happy that the graph is complete, @@ -480,11 +479,11 @@ Table {counter: tableNumber}. Constructor of the `command_graph` class. [source,c++] ---- using namespace ext::oneapi::experimental; -command_graph(const property_list& propList = {}); +command_graph(const device& syclDevice, const property_list& propList = {}); ---- -|Creates a SYCL `command_graph` object in the modifiable state. -Zero or more properties can be provided to the constructed SYCL `command_graph` -via an instance of `property_list`. +|Creates a SYCL `command_graph` object in the modifiable state for device +`syclDevice`. 
Zero or more properties can be provided to the constructed SYCL +`command_graph` via an instance of `property_list`. Preconditions: @@ -493,6 +492,9 @@ Preconditions: Parameters: +* `syclDevice` - Device that all nodes added to the graph will target, + an immutable characteristic of the graph. + * `propList` - Optional parameter for passing properties. No `command_graph` constructor properties are defined by this extension. @@ -536,7 +538,7 @@ Exceptions: ---- using namespace ext::oneapi::experimental; template -node add(const device& syclDevice, T cgf, const property_list& propList = {}); +node add(T cgf, const property_list& propList = {}); ---- |This function adds a command group function object to a graph. The function object statically contains a group of commands, of which a single command is @@ -552,13 +554,6 @@ Preconditions: Parameters: -* `syclDevice` - Device to process `cgf` with to create node command-group. - Commands are not required to execute on this device if the behaviour is - consistent with how they would not execute on the device associated with a - `sycl::queue` during regular queue submission. In particular host tasks, - sub-graphs with nodes targeting other devices, and memory copies may not - execute on `syclDevice`. - * `cgf` - Command group function object to be added as a node. * `propList` - Zero or more properties can be provided to the constructed node @@ -571,9 +566,6 @@ Exceptions: * Throws synchronously with error code `invalid` if a queue is recording commands to the graph. -* Throws synchronously with error code `invalid` if `syclDevice` is a different - device from the device targeted by the existing nodes of the graph. - | [source,c++] ---- @@ -676,9 +668,8 @@ Exceptions: already recording to a different graph. * Throws synchronously with error code `invalid` if `recordingQueue` is - associated with a device which is different from the device already targeted - by the nodes of the graph. 
- + associated with a device that is different from the device used on creation + of the graph. | [source, c++] ---- @@ -1015,10 +1006,7 @@ result in a synchronous exception. When a queue is in recording mode asynchronous exceptions will not be generated, as no device execution is occurring. Synchronous errors specified as being thrown in the default queue executing state, will still be thrown when a -queue is in the recording state. A synchronous error with error code `invalid` -will also be thrown if a queue in recording mode tries to record a command to a -graph which already has nodes that target a device different from the device -associated with the recording queue. +queue is in the recording state. The `command_graph::begin_recording` and `command_graph::end_recording` entry-points return a `bool` value informing the user whether a related queue @@ -1098,7 +1086,7 @@ evaluated as normal during command graph execution. [source,c++] ---- using namespace ext::oneapi::experimental; -auto node = graph.add(device, [&](sycl::handler& cgh){ +auto node = graph.add([&](sycl::handler& cgh){ // Host code here is evaluated during the call to add() cgh.host_task([=](){ // Code here is evaluated as part of executing the command graph node @@ -1163,23 +1151,6 @@ problem this extension currently aims to solve, it is the responsibility of the user to decide the device each command will be processed for, not the SYCL runtime. -A drawback of this design is that it is less convenient for the use-case where -a user would like to run the same graph on N devices. Rather than finalizing a -single modifiable graph N times for N devices in this scenario, the user now has -to record N modifiable graphs and then as finalize each of them. If this use -case does become a usability issue, we could provide a specific API to support -it. 
For example, an update API for the modifiable `command_graph`, similar to -parameters, so that the same modifiable graph can be updated to new devices then -re-finalized. There may need to be limitations on what devices a graph can be -updated to however, as device specific processing might not be possible to roll -back or defer. As such, it may be only possible to update to identical physical -devices, rather than re-target a GPU constructed graph to a CPU. - -The queue an executable graph is submitted to needs to have the same context as -the executable graph, however execution of a multi-device graph is not limited -to the device associated to the queue. Instead, the queue is used as the -mechanism to submit the graph and express execution dependencies. - == Examples [NOTE] @@ -1209,9 +1180,7 @@ int main() { float gamma = 3.0f; sycl::queue q; - sycl::device device = q.get_device(); - - sycl_ext::command_graph g; + sycl_ext::command_graph g(q.get_device()); float *dotp = sycl::malloc_shared(1, q); float *x = sycl::malloc_device(n, q); @@ -1227,7 +1196,7 @@ int main() { // c /* init data on the device */ - auto node_i = g.add(device, [&](sycl::handler& h) { + auto node_i = g.add([&](sycl::handler& h) { h.parallel_for(n, [=](sycl::id<1> it){ const size_t i = it[0]; x[i] = 1.0f; @@ -1236,21 +1205,21 @@ int main() { }); }); - auto node_a = g.add(device, [&](sycl::handler& h) { + auto node_a = g.add([&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }, { sycl_ext::property::node::depends_on(node_i)}); - auto node_b = g.add(device, [&](sycl::handler& h) { + auto node_b = g.add([&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); }, { sycl_ext::property::node::depends_on(node_i)}); - auto node_c = g.add(device, + auto node_c = g.add( [&](sycl::handler& h) { h.parallel_for(sycl::range<1>{n}, 
sycl::reduction(dotp, 0.0f, std::plus()), @@ -1295,7 +1264,7 @@ submitted in its entirety for execution via queue q{default_selector{}}; // New object representing graph of command-groups - ext::oneapi::experimental::command_graph graph; + ext::oneapi::experimental::command_graph graph(q.get_device()); { buffer bufferA{dataA.data(), range<1>{elements}}; buffer bufferB{dataB.data(), range<1>{elements}}; @@ -1368,10 +1337,9 @@ submitted in its entirety for execution via Allow an executable graph to contain nodes targeting different devices. -**Outcome:** This feature has been designed for with the definition of a graph -node being device specific. However, the ability for a user to define a single -graph with nodes targeting different devices is still disallowed until such a -time as this feature can be backed up by implementation coverage. +**Outcome:** This feature is something that we are considering introducing into +the extension in later revisions. It has been planned for to the extent that the +definition of a graph node is device specific. === Memory Allocation API From 807645568031a97126b325c01ae0f58aed9c582b Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 30 Mar 2023 08:32:02 +0100 Subject: [PATCH 51/82] [SYCL][DOC] Remove wording on adding tasks after defining graph (#114) Removes wording from `graph.add()` about being able to add tasks after defining the graph structure. Actions feedback https://github.com/intel/llvm/pull/5626#discussion_r1055852971 Also use [asciidoc superscript syntax](https://docs.asciidoctor.org/asciidoc/latest/text/subscript-and-superscript/) for power-of-two. 
--- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 1268a499ace2a..c1e8dcfa4e892 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -513,8 +513,7 @@ node add(const property_list& propList = {}); ---- |This creates an empty node which contains no command. Its intended use is to make a connection point inside a graph between groups of nodes, and can -significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case -is building the structure of a graph first and adding tasks later. +significantly reduce the number of edges ( O(n) vs. O(n^2^) ). Preconditions: From 5d32404d3d05efc4dddf77fe93aa4b6e0615cbcc Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 30 Mar 2023 17:19:02 +0100 Subject: [PATCH 52/82] [SYCL][DOC] Pass SYCL objects by reference to API Addresses feedback from https://github.com/intel/llvm/pull/5626#discussion_r1150619959 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index c1e8dcfa4e892..d96a04fe13370 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -333,11 +333,11 @@ public: command_graph finalize(const context& syclContext, const property_list& propList = {}) const; - bool begin_recording(queue recordingQueue); + bool begin_recording(queue& recordingQueue); bool begin_recording(const std::vector& recordingQueues); bool end_recording(); - bool end_recording(queue recordingQueue); + bool end_recording(queue& recordingQueue); bool 
end_recording(const std::vector& recordingQueues); node add(const property_list& propList = {}); @@ -345,7 +345,7 @@ public: template node add(T cgf, const property_list& propList = {}); - void make_edge(node src, node dest); + void make_edge(node& src, node& dest); }; template<> @@ -362,17 +362,17 @@ class queue { public: /* -- graph convenience shortcuts -- */ - event ext_oneapi_graph(command_graph graph); - event ext_oneapi_graph(command_graph graph, + event ext_oneapi_graph(command_graph& graph); + event ext_oneapi_graph(command_graph& graph, event depEvent); - event ext_oneapi_graph(command_graph graph, + event ext_oneapi_graph(command_graph& graph, const std::vector& depEvents); }; // New methods added to the sycl::handler class class handler { public: - void ext_oneapi_graph(command_graph graph); + void ext_oneapi_graph(command_graph& graph); } } // namespace sycl @@ -569,7 +569,7 @@ Exceptions: [source,c++] ---- using namespace ext::oneapi::experimental; -void make_edge(node src, node dest); +void make_edge(node& src, node& dest); ---- |Creates a dependency between two nodes representing a happens-before relationship. @@ -646,7 +646,7 @@ Table {counter: tableNumber}. Member functions of the `command_graph` class for [source, c++] ---- using namespace ext::oneapi::experimental; -bool begin_recording(queue recordingQueue) +bool begin_recording(queue& recordingQueue) ---- |Synchronously changes the state of `recordingQueue` to the @@ -710,7 +710,7 @@ Returns: `true` if any queue recording to the graph has its state changed from [source, c++] ---- using namespace ext::oneapi::experimental; -bool end_recording(queue recordingQueue) +bool end_recording(queue& recordingQueue) ---- |Synchronously changes the state of `recordingQueue` to the @@ -901,7 +901,7 @@ Table {counter: tableNumber}. 
Additional member functions of the `sycl::queue` c [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::ext_oneapi_graph(command_graph graph) +event queue::ext_oneapi_graph(command_graph& graph) ---- |Queue shortcut function that is equivalent to submitting a command-group @@ -911,7 +911,7 @@ containing `handler::ext_oneapi_graph(graph)`. [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::ext_oneapi_graph(command_graph graph, +event queue::ext_oneapi_graph(command_graph& graph, event depEvent); ---- @@ -923,7 +923,7 @@ containing `handler::depends_on(depEvent)` and [source,c++] ---- using namespace ext::oneapi::experimental; -event queue::ext_oneapi_graph(command_graph graph, +event queue::ext_oneapi_graph(command_graph& graph, const std::vector& depEvents); ---- @@ -941,7 +941,7 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::handler` [source,c++] ---- using namespace ext::oneapi::experimental; -void handler::ext_oneapi_graph(command_graph graph) +void handler::ext_oneapi_graph(command_graph& graph) ---- |Invokes the execution of a graph. 
Support for invoking an executable graph, From cbeb1765aa96ea4dabfc42d90e51cab19f5a1e19 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 3 Apr 2023 07:38:34 +0100 Subject: [PATCH 53/82] [SYCL][DOC] Editorial cleanup to table entries Actions feedback: * https://github.com/intel/llvm/pull/5626#discussion_r1150592166 that this experimental namespace is already implicit to the reader * https://github.com/intel/llvm/pull/5626#discussion_r1150767062 to fix render of new handler table --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index d96a04fe13370..b489b228dfbca 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -478,7 +478,6 @@ Table {counter: tableNumber}. Constructor of the `command_graph` class. | [source,c++] ---- -using namespace ext::oneapi::experimental; command_graph(const device& syclDevice, const property_list& propList = {}); ---- |Creates a SYCL `command_graph` object in the modifiable state for device @@ -508,7 +507,6 @@ Table {counter: tableNumber}. Member functions of the `command_graph` class. | [source,c++] ---- -using namespace ext::oneapi::experimental; node add(const property_list& propList = {}); ---- |This creates an empty node which contains no command. 
Its intended use is @@ -535,7 +533,6 @@ Exceptions: | [source,c++] ---- -using namespace ext::oneapi::experimental; template node add(T cgf, const property_list& propList = {}); ---- @@ -568,7 +565,6 @@ Exceptions: | [source,c++] ---- -using namespace ext::oneapi::experimental; void make_edge(node& src, node& dest); ---- @@ -599,7 +595,6 @@ Exceptions: | [source,c++] ---- -using namespace ext::oneapi::experimental; command_graph finalize(const context& syclContext, const property_list& propList = {}) const; ---- @@ -645,7 +640,6 @@ Table {counter: tableNumber}. Member functions of the `command_graph` class for | [source, c++] ---- -using namespace ext::oneapi::experimental; bool begin_recording(queue& recordingQueue) ---- @@ -672,7 +666,6 @@ Exceptions: | [source, c++] ---- -using namespace ext::oneapi::experimental; bool begin_recording(const std::vector& recordingQueues) ---- @@ -696,7 +689,6 @@ Exceptions: | [source, c++] ---- -using namespace ext::oneapi::experimental; bool end_recording() ---- @@ -709,7 +701,6 @@ Returns: `true` if any queue recording to the graph has its state changed from | [source, c++] ---- -using namespace ext::oneapi::experimental; bool end_recording(queue& recordingQueue) ---- @@ -731,7 +722,6 @@ Exceptions: | [source, c++] ---- -using namespace ext::oneapi::experimental; bool end_recording(const std::vector& recordingQueues) ---- @@ -761,7 +751,6 @@ Table {counter: tableNumber}. Member functions of the `command_graph` class (exe | [source, c++] ---- -using namespace ext::oneapi::experimental; void command_graph update(const command_graph& graph); ---- @@ -900,7 +889,6 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::queue` c | [source,c++] ---- -using namespace ext::oneapi::experimental; event queue::ext_oneapi_graph(command_graph& graph) ---- @@ -910,7 +898,6 @@ containing `handler::ext_oneapi_graph(graph)`. 
| [source,c++] ---- -using namespace ext::oneapi::experimental; event queue::ext_oneapi_graph(command_graph& graph, event depEvent); ---- @@ -922,7 +909,6 @@ containing `handler::depends_on(depEvent)` and | [source,c++] ---- -using namespace ext::oneapi::experimental; event queue::ext_oneapi_graph(command_graph& graph, const std::vector& depEvents); ---- @@ -938,9 +924,10 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::handler` [cols="2a,a"] |=== |Member function|Description + +| [source,c++] ---- -using namespace ext::oneapi::experimental; void handler::ext_oneapi_graph(command_graph& graph) ---- @@ -1084,7 +1071,6 @@ evaluated as normal during command graph execution. [source,c++] ---- -using namespace ext::oneapi::experimental; auto node = graph.add([&](sycl::handler& cgh){ // Host code here is evaluated during the call to add() cgh.host_task([=](){ From 29801860785f457ca5ee455133cd6dd5043c7644 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 3 Apr 2023 11:02:12 +0100 Subject: [PATCH 54/82] [SYCL] Remove text about cgf host code behaviour (#125) - Remove wording about host code behaviour in command group functions --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index b489b228dfbca..f70a4ca8ee896 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1045,7 +1045,6 @@ synchronous error being thrown with error code `invalid`. === Host Tasks :host-task: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:interfaces.hosttasks -:cg-scope: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:command.group.scope A {host-task}[host task] is a native C++ callable, scheduled according to SYCL dependency rules. 
It is valid to record a host task as part of graph, though it @@ -1062,9 +1061,6 @@ Host code within a command group function object is evaluated when the command group is added to a graph. This is either before the return of the call to `command_graph::add()` when using the explicit API or before the return of the call to `queue::submit()` when submitting a command group to a queue that is recording to a graph. -This behaviour is in keeping with the existing {cg-scope}[command group] behaviour but may have -implications for command group functions containing arbitrary host code. This could -affect the behaviour of captured code due to the delayed execution of commands. This does not apply to code within a {host-task}[host task] which is evaluated as normal during command graph execution. From bddc5b9c75745a0cb4a50b68b0318e4e204906ed Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 4 Apr 2023 14:29:23 -0500 Subject: [PATCH 55/82] [SYCL][DOC] Apply suggestions: Avoid implicitly disallow memory operations (#117) --- .../extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index f70a4ca8ee896..fd45246404daa 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -538,10 +538,11 @@ node add(T cgf, const property_list& propList = {}); ---- |This function adds a command group function object to a graph. The function object statically contains a group of commands, of which a single command is -executed at runtime. A function object can be a host task which is scheduled by -the SYCL runtime, or a SYCL function for invoking kernels with all restrictions -that apply as described in the core specification. 
The requisites of `cgf` will -be used to identify any dependent nodes in the graph to form edges with. +executed at runtime. A command group function object may submit any command +as defined by the core SYCL specification, and any SYCL extensions unless explicitly +stated otherwise. +The requisites of `cgf` will be used to identify any dependent nodes in the graph +to form edges with. Preconditions: From 445f5816615bf780e23dfdb24d56046e61f1c82c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 5 Apr 2023 09:49:13 +0100 Subject: [PATCH 56/82] [SYCL][DOC] Error on simultaneous graph submission Actions [feedback] (https://github.com/intel/llvm/pull/5626#discussion_r1150765202) that we should error when a graph is submitted before a previous execution has completed, rather than make it conditional on the backend when we have no query. Added an issue to the bottom of the spec, since I think for revision 2 we could possibly do better and serialize the graph executions in the runtime. Closes https://github.com/reble/llvm/issues/122 Co-authored-by: Pablo Reble --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index fd45246404daa..842e293dc22b7 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -932,9 +932,9 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::handler` void handler::ext_oneapi_graph(command_graph& graph) ---- -|Invokes the execution of a graph. Support for invoking an executable graph, -before a previous execution of the same graph has been completed is backend -specific. The runtime may throw an error. +|Invokes the execution of a graph. Only one instance of `graph` may be executing, +or pending execution, at any time. 
Concurrent graph execution can be achieved by +finalizing a graph in modifiable state into multiple graphs in executable state. Parameters: @@ -945,6 +945,9 @@ Exceptions: * Throws synchronously with error code `invalid` if the handler is submitted to a queue which doesn't have a SYCL context which matches the context of the executable graph. + +* Throws synchronously with error code `invalid` if a previous submission of + `graph` has yet to complete execution. |=== === Thread Safety @@ -1315,6 +1318,14 @@ submitted in its entirety for execution via == Issues +=== Simultaneous Graph Submission + +Enable an instance of a graph in executable state to be submitted for execution +when a previous submission of the same graph has yet to complete execution. + +**Outcome:** Backend support for this is inconsistent, but the runtime could +schedule the submissions sequentially for backends which don't support it. + === Multi Device Graph Allow an executable graph to contain nodes targeting different devices. From 1ba5f87bed415c17ba48da838ccb9b315a9b9e8a Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 7 Apr 2023 10:40:03 +0100 Subject: [PATCH 57/82] [SYCL][DOC] Define "topologically identical" This phrase is used to specify when `update()` can successfully be used on an executable graph, but it is currently not well defined. 
Closes https://github.com/reble/llvm/issues/121 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 842e293dc22b7..efaab1bff84dc 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -756,10 +756,15 @@ void command_graph update(const command_graph Date: Fri, 7 Apr 2023 17:15:33 +0100 Subject: [PATCH 58/82] [SYCL][DOC] Add wording about buffer host data copies (#127) * [SYCL] Add wording about buffer host data copies - Introduces new property to disable copy behaviour - Adds property list to begin_recording() Co-authored-by: Ewan Crawford --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 64 ++++++++++++++++--- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index efaab1bff84dc..2bce146ff3ef8 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -321,6 +321,17 @@ enum class graph_state { executable }; +namespace property { +namespace graph { + +class no_host_copy { +public: + no_host_copy() = default; +}; + +} // namespace graph +} // namespace property + // New object representing graph template class command_graph {}; @@ -333,8 +344,8 @@ public: command_graph finalize(const context& syclContext, const property_list& propList = {}) const; - bool begin_recording(queue& recordingQueue); - bool begin_recording(const std::vector& recordingQueues); + bool begin_recording(queue& recordingQueue, const property_list& propList = {}); + bool begin_recording(const std::vector& recordingQueues, const property_list& propList = {}); bool end_recording(); bool 
end_recording(queue& recordingQueue); @@ -458,6 +469,21 @@ graph LR Modifiable -->|Finalize| Executable .... +==== Graph Properties [[graph-properties]] + +===== No-Host-Copy Property + +The `no_host_copy` property is defined by this extension and can be passed to +either the `command_graph` constructor or the `command_graph::begin_recording()` +member function. This property will disable the host data copy that may +occur as detailed in the <> section of +this specification. + +Passing this property represents a promise from the user that host data +associated with a buffer that was created using a host data pointer will +outlive any executable graphs created from a modifiable graph which uses +that buffer. + ==== Executable Graph Update A graph in the executable state can have each nodes inputs & outputs updated @@ -494,8 +520,8 @@ Parameters: * `syclDevice` - Device that all nodes added to the graph will target, an immutable characteristic of the graph. -* `propList` - Optional parameter for passing properties. No `command_graph` - constructor properties are defined by this extension. +* `propList` - Optional parameter for passing properties. Properties for + the `command_graph` class are defined in <>. |=== @@ -641,7 +667,7 @@ Table {counter: tableNumber}. Member functions of the `command_graph` class for | [source, c++] ---- -bool begin_recording(queue& recordingQueue) +bool begin_recording(queue& recordingQueue, const property_list& propList = {}) ---- |Synchronously changes the state of `recordingQueue` to the @@ -653,6 +679,9 @@ Parameters: `queue_state::recording` state and start recording commands to the graph instance. +* `propList` - Optional parameter for passing properties. Properties for + the `command_graph` class are defined in <>. + Returns: `true` if `recordingQueue` has its state changed from `queue_state::executing` to `queue_state::recording`, `false` otherwise. 
@@ -667,7 +696,8 @@ Exceptions: | [source, c++] ---- -bool begin_recording(const std::vector& recordingQueues) +bool begin_recording(const std::vector& recordingQueues, + const property_list& propList = {}) ---- |Synchronously changes the state of each queue in `recordingQueues` to the @@ -679,6 +709,9 @@ Parameters: `queue_state::recording` state and start recording commands to the graph instance. +* `propList` - Optional parameter for passing properties. Properties for + the `command_graph` class are defined in <>. + Returns: `true` if any queue in `recordingQueues` has its state changed from `queue_state::executing` to `queue_state::recording`, `false` otherwise. @@ -1025,7 +1058,7 @@ be immediately stale, any code which relies on queue waits should take care to ensure waits are not performed on queues in recording mode. For example, by using separate queues for graph recording and normal queue operations. -=== Storage Lifetimes +=== Storage Lifetimes [[storage-lifetimes]] The lifetime of any buffer recorded as part of a submission to a command graph will be extended in keeping with the common reference @@ -1034,9 +1067,20 @@ extended either for the lifetime of the graph (including both modifiable graphs and the executable graphs created from them) or until the buffer is no longer required by the graph (such as after being replaced through executable graph update). -Because of the extension of storage lifetimes, users should avoid the use of the -buffer copy-back on destruction mechanism. If used in code intended to be -executed as part of a graph, it may not perform as expected. +If a buffer created with a host data pointer is recorded as part of a submission to +a command graph, the lifetime of that host data will also be extended by taking a +copy of that data inside the buffer. This copy will occur during the call to +`queue::submit()`. 
Users can opt-out of this behaviour by passing the property +`graph::no_host_copy` to `command_graph::begin_recording()` or the constructor +of the `command_graph` the commands will be recorded to. Passing the property to +`begin_recording()` will prevent host copies only for commands recorded before +`end_recording()` is called for a given queue. Passing the property to the +`command_graph` constructor will prevent host copies for all commands recorded to +the graph. + +The copy will not occur if the buffer is accessed inside the submitted command +group function with an accessor with `access_mode::write` or the `no_init` +property. === Buffer Limitations for Record & Replay API From 42457689079789f2c0091dc9b4524b2a94dd1d6a Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 10 Apr 2023 10:48:39 +0100 Subject: [PATCH 59/82] [SYCL] Clarify wording around graph contexts (#130) - Graph now takes a context on construction - Removed context param from finalize - Updated error wording for begin_recording - Update example code - Add error for updating with incorrect context --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 2bce146ff3ef8..38b72fd0262c1 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -339,10 +339,11 @@ class command_graph {}; template<> class command_graph { public: - command_graph(const device& syclDevice, const property_list& propList = {}); + command_graph(const context& syclContext, const device& syclDevice, + const property_list& propList = {}); command_graph - finalize(const context& syclContext, const property_list& propList = {}) const; + finalize(const property_list& propList = {}) const; bool begin_recording(queue& recordingQueue, const 
property_list& propList = {}); bool begin_recording(const std::vector& recordingQueues, const property_list& propList = {}); @@ -504,11 +505,12 @@ Table {counter: tableNumber}. Constructor of the `command_graph` class. | [source,c++] ---- -command_graph(const device& syclDevice, const property_list& propList = {}); +command_graph(const context& syclContext, const device& syclDevice, + const property_list& propList = {}); ---- -|Creates a SYCL `command_graph` object in the modifiable state for device -`syclDevice`. Zero or more properties can be provided to the constructed SYCL -`command_graph` via an instance of `property_list`. +|Creates a SYCL `command_graph` object in the modifiable state for context +`syclContext` and device `syclDevice`. Zero or more properties can be provided +to the constructed SYCL `command_graph` via an instance of `property_list`. Preconditions: @@ -517,12 +519,21 @@ Preconditions: Parameters: +* `syclContext` - Context which will be associated with this graph and all + nodes within it. This is an immutable characteristic of the graph. + * `syclDevice` - Device that all nodes added to the graph will target, - an immutable characteristic of the graph. + an immutable characteristic of the graph. Must be associated with + `syclContext`. * `propList` - Optional parameter for passing properties. Properties for the `command_graph` class are defined in <>. +Exceptions: + +* Throws synchronously with error code `invalid` if `syclDevice` is not +associated with `syclContext`. + |=== Table {counter: tableNumber}. Member functions of the `command_graph` class. @@ -623,15 +634,15 @@ Exceptions: [source,c++] ---- command_graph -finalize(const context& syclContext, const property_list& propList = {}) const; +finalize(const property_list& propList = {}) const; ---- |Synchronous operation that creates a new graph in the executable state with a fixed topology that can be submitted for execution on any queue sharing the -supplied context. 
It is valid to call this method multiple times to create -subsequent executable graphs. It is also valid to continue to add new nodes to -the modifiable graph instance after calling this function. It is valid to -finalize an empty graph instance with no recorded commands. +context associated with the graph. It is valid to call this method multiple times +to create subsequent executable graphs. It is also valid to continue to add new +nodes to the modifiable graph instance after calling this function. It is valid +to finalize an empty graph instance with no recorded commands. Preconditions: @@ -640,9 +651,6 @@ Preconditions: Parameters: -* `syclContext` - The context associated with the queues to which the - executable graph will be able to be submitted. - * `propList` - Optional parameter for passing properties. No finalization properties are defined by this extension. @@ -654,9 +662,6 @@ Exceptions: A cycle may be introduced to the graph via a call to `make_edge()` that creates a forward dependency. -* Throws synchronously with error code `invalid` if the graph contains a - node which targets a device not present in `syclContext`. - |=== Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. @@ -691,8 +696,8 @@ Exceptions: already recording to a different graph. * Throws synchronously with error code `invalid` if `recordingQueue` is - associated with a device that is different from the device used on creation - of the graph. + associated with a device or context that is different from the device + and context used on creation of the graph. | [source, c++] ---- @@ -720,6 +725,10 @@ Exceptions: * Throws synchronously with error code `invalid` if the any queue in `recordingQueues` is already recording to a different graph. +* Throws synchronously with error code `invalid` if any of `recordingQueues` + is associated with a device or context that is different from the device + and context used on creation of the graph. 
+ | [source, c++] ---- @@ -821,6 +830,9 @@ Exceptions: * Throws synchronously with error code `invalid` if `graph` contains any node which is not a kernel command or host-task, e.g. {handler-copy-functions}[memory operations]. + +* Throws synchronously with error code `invalid` if the context associated with + `graph` does not match that of the `command_graph` being updated. |=== === Queue Class Modifications @@ -1220,7 +1232,7 @@ int main() { float gamma = 3.0f; sycl::queue q; - sycl_ext::command_graph g(q.get_device()); + sycl_ext::command_graph g(q.get_context(), q.get_device()); float *dotp = sycl::malloc_shared(1, q); float *x = sycl::malloc_device(n, q); @@ -1270,7 +1282,7 @@ int main() { }, { sycl_ext::property::node::depends_on(node_a, node_b)}); - auto exec = g.finalize(q.get_context()); + auto exec = g.finalize(); // use queue shortcut for graph submission q.ext_oneapi_graph(exec).wait(); @@ -1304,7 +1316,7 @@ submitted in its entirety for execution via queue q{default_selector{}}; // New object representing graph of command-groups - ext::oneapi::experimental::command_graph graph(q.get_device()); + ext::oneapi::experimental::command_graph graph(q.get_context(), q.get_device()); { buffer bufferA{dataA.data(), range<1>{elements}}; buffer bufferB{dataB.data(), range<1>{elements}}; @@ -1362,7 +1374,7 @@ submitted in its entirety for execution via // Finalize the modifiable graph to create an executable graph that can be // submitted for execution. 
- auto exec_graph = graph.finalize(q.get_context()); + auto exec_graph = graph.finalize(); // Execute graph q.submit([&](handler& cgh) { From 6b5f96e2a0613c69e684bc4cba5c000a36d88960 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 11 Apr 2023 08:18:54 +0100 Subject: [PATCH 60/82] [SYCL][Doc] Simplify section on error handling Address feedback that it's not necessary to duplicate error information already specified https://github.com/intel/llvm/pull/5626#discussion_r1150846349 This includes removing wording about unspecified errors on finalize https://github.com/intel/llvm/pull/5626#discussion_r1150845124 And clarifying queue queries which behave as normal during recording https://github.com/intel/llvm/pull/5626#discussion_r1150851343 There is also the feedback that we could better group queue recording error behaviour into its own section, but I've left that larger change out the scope of the PR. Use suggestion from https://github.com/intel/llvm/pull/5626#discussion_r1150829677 about "last copy" of modifiable graph. * Update sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc Co-authored-by: Ben Tracy --------- Co-authored-by: Ben Tracy --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 29 ++++--------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 38b72fd0262c1..e6cba87260d55 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1027,9 +1027,9 @@ preemptively changed the state of the queue. === Exception Safety In addition to the destruction semantics provided by the SYCL -{crs}[common reference semantics], when a modifiable `command_graph` is -destroyed recording is ended on any queues that are recording to that -graph, equivalent to `+this->end_recording()+`. 
+{crs}[common reference semantics], when the last copy of a modifiable +`command_graph` is destroyed recording is ended on any queues that are recording +to that graph, equivalent to `+this->end_recording()+`. As a result, users don't need to manually wrap queue recording code in a `try` / `catch` block to reset the state of recording queues on an exception @@ -1038,21 +1038,11 @@ modifiable graph will perform this action, useful in RAII pattern usage. === Error Handling -Errors are reported through exceptions, as usual in the SYCL API. For new APIs, -submitting a graph for execution can generate unspecified asynchronous errors, -while `command_graph::finalize()` may throw unspecified synchronous exceptions. -Synchronous exception errors codes are defined for all of -`command_graph::add()`, `command_graph::make_edge()`, `command_graph::update()`, -`command_graph::begin_recording()`, and `command_graph::end_recording()`. - -Submitting an executable graph using `handler::ext_oneapi_graph()` to -a queue with a different SYCL context than that of the executable graph will -result in a synchronous exception. - When a queue is in recording mode asynchronous exceptions will not be generated, as no device execution is occurring. Synchronous errors specified as being thrown in the default queue executing state, will still be thrown when a -queue is in the recording state. +queue is in the recording state. Queue query methods operate as usual in +recording mode, as opposed to throwing. The `command_graph::begin_recording` and `command_graph::end_recording` entry-points return a `bool` value informing the user whether a related queue @@ -1061,15 +1051,6 @@ no queue state is changed. This design is because the queues are already in the state the user desires, so if the function threw an exception in this case, the application would likely swallow it and then proceed. 
-While a queue is in the recording state, methods performed on that queue which -are not command submissions behave as normal except for waits. Waiting on a -queue in the recording state is an error and will throw a synchronous -exception. Other methods are ignored by the graph system as opposed to -throwing in recording mode. As any query about the state of the queue may -be immediately stale, any code which relies on queue waits should take care -to ensure waits are not performed on queues in recording mode. For example, by -using separate queues for graph recording and normal queue operations. - === Storage Lifetimes [[storage-lifetimes]] The lifetime of any buffer recorded as part of a submission From 49c46a14cc125febd2a0a6e1fbc331ec62bebe2a Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 11 Apr 2023 10:51:29 -0500 Subject: [PATCH 61/82] [SYCL][DOC] Adding graph property to switch off error checks for cycles (#128) --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index e6cba87260d55..ae24d00096000 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -302,6 +302,16 @@ enum class queue_state { }; namespace property { + +namespace graph { + +class no_cycle_check { + public: + no_cycle_check() = default; +}; + +} // namespace graph + namespace node { class depends_on { @@ -526,8 +536,8 @@ Parameters: an immutable characteristic of the graph. Must be associated with `syclContext`. -* `propList` - Optional parameter for passing properties. Properties for - the `command_graph` class are defined in <>. +* `propList` - Optional parameter for passing properties. Valid `command_graph` + constructor properties are listed in Section <>. 
Exceptions: @@ -629,6 +639,9 @@ Exceptions: * Throws synchronously with error code `invalid` if `src` and `dest` are the same node. + +* Throws synchronously with error code `invalid` if the resulting dependency would + lead to a cycle. This error is omitted when `property::graph::no_cycle_check` is set. | [source,c++] @@ -658,6 +671,9 @@ Returns: A new executable graph object which can be submitted to a queue. Exceptions: +* Throws synchronously with error code `invalid` if the graph contains a + node which targets a device not present in `syclContext`. + * Throws synchronously with error code `invalid` if the graph contains a cycle. A cycle may be introduced to the graph via a call to `make_edge()` that creates a forward dependency. @@ -835,6 +851,19 @@ Exceptions: `graph` does not match that of the `command_graph` being updated. |=== +==== Graph Properties + +The following property is defined and can be passed to a `command_graph` +on construction via the property list parameter. + +1. `property::graph::no_cycle_check` - This property disables the check for whether + a newly added dependency would lead to a cycle in a specific `command_graph`. + As a result, no errors are reported when a function tries to create a cyclic + dependency. Thus, it's the user's responsibility to create an acyclic graph + for execution when this property is set. Creating a `command_graph` in + executable state through `finalize` from a graph with cyclic dependencies + is not allowed and results in undefined behavior. 
+ === Queue Class Modifications :queue-class: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class From 1a75cbd3c318a26600976927092b6567da05ac02 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 12 Apr 2023 09:38:17 +0100 Subject: [PATCH 62/82] [SYCL][Doc] Remove cycle error from finalize --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index ae24d00096000..f58ecf5c126a8 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -674,10 +674,6 @@ Exceptions: * Throws synchronously with error code `invalid` if the graph contains a node which targets a device not present in `syclContext`. -* Throws synchronously with error code `invalid` if the graph contains a cycle. - A cycle may be introduced to the graph via a call to `make_edge()` that - creates a forward dependency. - |=== Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. From 5d6a58103643a1dd011b059d03eac9f697fdb20c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 12 Apr 2023 09:39:49 +0100 Subject: [PATCH 63/82] [SYCL][Doc] Error on recorded event query. Specify that it is an error to query an event returned by a queue recording for either its command status or profiling info. Addresses feedback https://github.com/intel/llvm/pull/5626#discussion_r1150688098 Additionally, the description on the meaning of command status for a graph submissions is moved next to the description of graph submission shortcuts. 
--- .../proposed/sycl_ext_oneapi_graph.asciidoc | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index f58ecf5c126a8..9dfd7291dfe0e 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -912,18 +912,17 @@ Table {counter: tableNumber}. Queue info query Events returned from queue submissions when a queue is in the recording state may only be used as parameters to `handler::depends_on()` or as dependent events for queue shortcuts like `queue::parallel_for()` for submissions which -are being recorded to the same modifiable `command_graph`. These events have -status `info::event_command_status::complete`. The event status of an event -returned from an executable graph submission will have -`info::event_command_status::running` once any command group node starts -executing on a device, and status `info::event_command_status::complete` -once all the nodes have finished execution. +are being recorded to the same modifiable `command_graph`. + +Calling `event::get_info()` or +`event::get_profiling_info()` on an event returned from a queue submission +recorded to a graph will throw synchronously with error code `invalid`. Waiting on an event returned from a queue submission recorded to a graph will throw synchronously with error code `invalid`. -Calling `queue::wait()` on a queue in the recording state is an error and -will throw synchronously with error code `invalid`. +Waiting on a queue in the recording state is an error and will throw +synchronously with error code `invalid`. ==== Queue Properties @@ -977,6 +976,10 @@ event queue::ext_oneapi_graph(command_graph& graph) |Queue shortcut function that is equivalent to submitting a command-group containing `handler::ext_oneapi_graph(graph)`. 
+The command status of the event returned will be +`info::event_command_status::running` once any command group node starts +executing on a device, and status `info::event_command_status::complete` once +all the nodes have finished execution. | [source,c++] ---- @@ -988,6 +991,10 @@ event queue::ext_oneapi_graph(command_graph& graph, containing `handler::depends_on(depEvent)` and `handler::ext_oneapi_graph(graph)`. +The command status of the event returned will be +`info::event_command_status::running` once any command group node starts +executing on a device, and status `info::event_command_status::complete` once +all the nodes have finished execution. | [source,c++] ---- @@ -998,6 +1005,11 @@ event queue::ext_oneapi_graph(command_graph& graph, |Queue shortcut function that is equivalent to submitting a command-group containing `handler::depends_on(depEvents)` and `handler::ext_oneapi_graph(graph)`. + +The command status of the event returned will be +`info::event_command_status::running` once any command group node starts +executing on a device, and status `info::event_command_status::complete` once +all the nodes have finished execution. |=== ==== New Handler Member Functions From 213385e7bbaed451040dd817873fbc447526f2e1 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 17 Apr 2023 10:29:31 +0100 Subject: [PATCH 64/82] [SYCL][Doc] Define interaction with other extensions Defines the interaction of `sycl_ext_oneapi_graph` with other extensions that define queue properties or new queue methods, as these need clarified as to how they relate to record & replay. Also ban the use of new handler methods, and queue shortcuts, in graph nodes. These could be possible to support, but I don't think the scope of implementing and testing them is feasible for 2024.0 timelines. 
Closes Issue https://github.com/reble/llvm/issues/135 Co-authored-by: Pablo Reble Co-authored-by: Ben Tracy --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 110 +++++++++++++++++- 1 file changed, 104 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 9dfd7291dfe0e..67fb68d5d5d80 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -587,7 +587,7 @@ node add(T cgf, const property_list& propList = {}); object statically contains a group of commands, of which a single command is executed at runtime. A command group function object may submit any command as defined by the core SYCL specification, and any SYCL extensions unless explicitly -stated otherwise. +stated otherwise in <>. The requisites of `cgf` will be used to identify any dependent nodes in the graph to form edges with. @@ -955,11 +955,6 @@ ways: * `info::event_profiling::command_end` - Timestamp when the last command-group node completes execution. - -For any other queue property that is defined by an extension, it is the -responsibility of the extension to define the relationship between that queue -property and this graph extension. - ==== New Queue Member Functions Table {counter: tableNumber}. Additional member functions of the `sycl::queue` class. @@ -1221,6 +1216,109 @@ problem this extension currently aims to solve, it is the responsibility of the user to decide the device each command will be processed for, not the SYCL runtime. +=== Interaction With Other Extensions [[extension-interaction]] + +This section defines the interaction of `sycl_ext_oneapi_graph` with other +extensions. 
+ +==== sycl_ext_oneapi_discard_queue_events + +When recording a `sycl::queue` which has been created with the +`ext::oneapi::property::queue::discard_events` property, it is invalid to +use these events returned from queue submissions to create graph edges. This is +in keeping with the +link:../supported/sycl_ext_oneapi_discard_queue_events.asciidoc[sycl_ext_oneapi_discard_queue_events] +specification wording that `handler::depends_on()` throws an exception when +passed an invalid event. + +==== sycl_ext_oneapi_enqueue_barrier + +The new handler methods, and queue shortcuts, defined by +link:../supported/sycl_ext_oneapi_enqueue_barrier.asciidoc[sycl_ext_oneapi_enqueue_barrier] +cannot be used in graph nodes. A synchronous exception will be thrown with +error code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + +==== sycl_ext_oneapi_memcpy2d + +The new handler methods, and queue shortcuts, defined by +link:../supported/sycl_ext_oneapi_memcpy2d.asciidoc[sycl_ext_oneapi_memcpy2d] +cannot be used in graph nodes. A synchronous exception will be thrown with +error code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + +==== sycl_ext_oneapi_queue_priority + +The queue priority property defined by +link:../supported/sycl_ext_oneapi_queue_priority.asciidoc[sycl_ext_oneapi_queue_priority] +is ignored during queue recording. + +==== sycl_ext_oneapi_queue_empty + +The `queue::ext_oneapi_empty()` query defined by the +link:../supported/sycl_ext_oneapi_queue_empty.asciidoc[sycl_ext_oneapi_queue_empty] +extension behaves as normal during queue recording and is not captured to the graph. +Recorded commands are not counted as submitted for the purposes of this query.
+ +==== sycl_ext_intel_queue_index + +The compute index queue property defined by +link:../supported/sycl_ext_intel_queue_index.asciidoc[sycl_ext_intel_queue_index] +is ignored during queue recording. + +Using this information is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + +==== sycl_ext_codeplay_kernel_fusion + +As the +link:../experimental/sycl_ext_codeplay_kernel_fusion.asciidoc[sycl_ext_codeplay_kernel_fusion] +extension also introduces state to a `sycl::queue`, there are restrictions on +its usage when combined with `sycl_ext_oneapi_graph`. Exceptions with error code +`invalid` are thrown in the following cases: + +* `fusion_wrapper::start_fusion()` is called when its associated queue + is in the recording state. +* `command_graph::begin_recording()` is called passing a queue in fusion mode. + +The `sycl::ext::codeplay::experimental::property::queue::enable_fusion` property +defined by the extension is ignored by queue recording. + +To enable kernel fusion in a `command_graph` see the +https://github.com/sommerlukas/llvm/blob/proposal/graph-fusion/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph_fusion.asciidoc[sycl_ext_oneapi_graph_fusion extension proposal] +which is layered on top of `sycl_ext_oneapi_graph`. + +==== sycl_ext_oneapi_kernel_properties + +The new handler methods, and queue shortcuts, defined by +link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + +==== sycl_ext_oneapi_prod + +The new `sycl::queue::ext_oneapi_prod()` method added by +link:../proposed/sycl_ext_oneapi_prod.asciidoc[sycl_ext_oneapi_prod] +behaves as normal during queue recording and is not captured to the graph.
+Recorded commands are not counted as submitted for the purposes of its operation. + +==== sycl_ext_oneapi_device_global + +The new handler methods, and queue shortcuts, defined by +link:../proposed/sycl_ext_oneapi_device_global.asciidoc[sycl_ext_oneapi_device_global] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + == Examples [NOTE] From e85b3380ee031a82887519b7572193b1cd50e44f Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 2 May 2023 11:02:44 +0100 Subject: [PATCH 65/82] Fix typo in name of copy-back API The name of the method is `buffer::set_write_back(bool)` rather than `buffer::set_copy_back(bool)` --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 67fb68d5d5d80..8906d411794b4 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1119,7 +1119,7 @@ in a synchronous error being thrown with error code `invalid`. The copy-back mechanism can be disabled explicitly for buffers with attached host storage using either `buffer::set_final_data(nullptr)` or -`buffer::set_copy_back(false)`. +`buffer::set_write_back(false)`. It is also an error to create a host accessor to a buffer which is used in commands which are currently being recorded to a command graph.
Attempting to From 69649c4301e0024db5290a813a923444442d5e8d Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 15 May 2023 11:35:54 -0500 Subject: [PATCH 66/82] [SYCL][DOC] Collecting open issues (#175) * Apply suggestions from code review --------- Co-authored-by: Ewan Crawford --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 8906d411794b4..9e9e463d12e9d 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1506,16 +1506,17 @@ submitted in its entirety for execution via Enable an instance of a graph in executable state to be submitted for execution when a previous submission of the same graph has yet to complete execution. -**Outcome:** Backend support for this is inconsistent, but the runtime could -schedule the submissions sequentially for backends which don't support it. +**UNRESOLVED:** Trending "yes". Backend support for this is inconsistent, but +the runtime could schedule the submissions sequentially for backends which don't +support it. === Multi Device Graph Allow an executable graph to contain nodes targeting different devices. -**Outcome:** This feature is something that we are considering introducing into -the extension in later revisions. It has been planned for to the extent that the -definition of a graph node is device specific. +**UNRESOLVED:** Trending "yes". This feature is something that we are considering +introducing into the extension in later revisions. It has been planned for to the +extent that the definition of a graph node is device specific. === Memory Allocation API @@ -1525,7 +1526,36 @@ the allocation. No mechanism is currently provided, but see the section on <> for some designs being considered. 
-**Outcome:** Designs under consideration +**UNRESOLVED:** Trending "yes". Design is under consideration. + +=== Device Agnostic Graph + +Explicit API could support device-agnostic graphs that can be submitted +through queues to a particular device. This issue is related to multi-device +graphs. + +**UNRESOLVED:** Trending "no". Because of current runtime limitations this +can't be implemented with a reasonable effort. + +=== Execution Property + +Current proposal contains extensive extensions to existing API in SYCL. +Can we achieve something similar with user control over the flush behavior +of a queue and providing a handler that can be replayed? + +**UNRESOLVED:** Trending "no". Needs reconsideration of the design and +possible restrictions. + +=== User Guided Scheduling + +For specific workloads it could be beneficial to provide hints to the +runtime how to schedule a command graph onto a device. This info could effect +the scheduling policy like breadth or depth-first, or a combination with a +block size. + +**UNRESOLVED:** Trending "yes". A new property could be added to +the finalize call either extending the basic command graph proposal +or layered as a separate extension proposal. == Revision History From 70cf19a9325d080113067efe282d69553305b9d8 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 16 May 2023 09:46:42 +0100 Subject: [PATCH 67/82] [SYCL][Graphs] Change queue state query The current provided mechanism for querying the state of a queue is to use `get_info`. However, I think this has a couple of drawbacks: 1) The `info::queue::state` value which this extension defines isn't in an experimental namespace. 2) The `get_info` queries tend to map to underlying backend properties, in the implementation, see https://github.com/intel/llvm/tree/sycl/sycl/include/sycl/info . We are not defining a query for something that's backend specific, but backend agnostic, so `get_info` might not be the right API. 
Instead, I've changed the query to add a new `queue::ext_oneapi_get_state()` query. This has an experimental prefix, so won't conflict with any future core functionality, and can be easily implemented without fighting against the current `get_info()` mechanism. --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 9e9e463d12e9d..ff131b324ac85 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -295,7 +295,7 @@ executable graph object is added to the graph as a node. namespace sycl { namespace ext::oneapi::experimental { -// State of a queue, returned by info::queue::state +// State of a queue, returned by queue::ext_oneapi_get_state() enum class queue_state { executing, recording @@ -307,7 +307,7 @@ namespace graph { class no_cycle_check { public: - no_cycle_check() = default; + no_cycle_check() = default; }; } // namespace graph @@ -331,7 +331,7 @@ enum class graph_state { executable }; -namespace property { +namespace property { namespace graph { class no_host_copy { @@ -382,6 +382,10 @@ public: using namespace ext::oneapi::experimental; class queue { public: + + ext::oneapi::experimental::queue_state + ext_oneapi_get_state() const; + /* -- graph convenience shortcuts -- */ event ext_oneapi_graph(command_graph& graph); @@ -874,8 +878,6 @@ submitted immediately for execution. ==== Queue State -:queue-info-table: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#table.queue.info - The `sycl::queue` object can be in either of two states. The default `queue_state::executing` state is where the queue has its normal semantics of submitted command-groups being immediately scheduled for asynchronous execution. 
@@ -884,7 +886,8 @@ The alternative `queue_state::recording` state is used for graph construction. Instead of being scheduled for execution, command-groups submitted to the queue are recorded to a graph object as new nodes for each submission. After recording has finished and the queue returns to the executing state, the recorded commands are -not then executed, they are transparent to any following queue operations. +not then executed, they are transparent to any following queue operations. The state +of a queue can be queried with `queue::ext_oneapi_get_state()`. .Queue State Diagram [source, mermaid] @@ -894,21 +897,6 @@ graph LR Recording -->|End Recording| Executing .... -The state of a queue can be queried with `queue::get_info` using template -parameter `info::queue::state`. The following entry is added to the -{queue-info-table}[queue info table] to define this query: - -Table {counter: tableNumber}. Queue info query -[cols="2a,a,a"] -|=== -| Queue Descriptors | Return Type | Description - -| `info::queue::state` -| `ext::oneapi::experimental::queue_state` -| Returns the state of the queue - -|=== - Events returned from queue submissions when a queue is in the recording state may only be used as parameters to `handler::depends_on()` or as dependent events for queue shortcuts like `queue::parallel_for()` for submissions which @@ -962,6 +950,18 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::queue` c |=== |Member function|Description +| +[source,c++] +---- +queue_state queue::ext_oneapi_get_state() const; +---- + +| Query the <> of the queue. + +Returns: If the queue is in the default state where commands are scheduled +immediately for execution, `queue_state::executing` is returned. Otherwise, +`queue_state::recording` is returned where commands are redirected to a `command_graph` +object. | [source,c++] ---- @@ -1052,9 +1052,9 @@ graph is undefined. 
If user code enforces a total order on the queue events, then the behavior is well-defined, and will match the observable total order. -The returned value from the `info::queue::state` should be considered -immediately stale in multi-threaded usage, as another thread could have -preemptively changed the state of the queue. +The returned value from the `queue::ext_oneapi_get_state()` should be +considered immediately stale in multi-threaded usage, as another thread could +have preemptively changed the state of the queue. === Exception Safety From 6a3b016d54f52eb8284b1aa83d4d1621165ef02e Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 22 May 2023 11:16:33 +0100 Subject: [PATCH 68/82] [SYCL][Doc] Device query for graphs support Although we have an emulation mode for when a SYCL backend doesn't support PI/UR command-buffers, we don't have good implementation coverage across its usage on all possible backends. Providing a support query would allow us to limit the backends we support while an experimental extension to say CUDA and Level Zero, and maybe OpenCL. Rather than having users with FPGA and esimd_emulator trying to use the extension and crashing because we've not tested these platforms even with emulation mode. We could remove this query once the implementation is more mature and we can trust emulation mode support on all backends, but I don't think that's the case at the moment.
--- .../proposed/sycl_ext_oneapi_graph.asciidoc | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index ff131b324ac85..1d46317e32f6b 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -323,6 +323,20 @@ class depends_on { } // namespace node } // namespace property +// Device query for level of support +namespace info { +namespace device { +struct graphs_support; + +enum class graph_support_level { + unsupported = 0, + native, + emulated +}; + +} // namespace device +} // namespace info + class node {}; // State of a graph @@ -404,6 +418,30 @@ public: } // namespace sycl ---- +=== Device Info Query + +Due to the experimental nature of the extension, support is not available across +all devices. The following device support query is added to report devices which +are currently supported, and how that support is implemented. + + +Table {counter: tableNumber}. Device Info Queries. +[%header] +|=== +| Device Descriptors | Return Type | Description + +|`info::device::graph_support` +|`info::device::graph_support_level` +|When passed to `device::get_info<...>()`, the function returns `native` +if there is an underlying SYCL backend command-buffer construct which is used +to propagate the graph to the backend. If no backend construct exists, or +building on top of it has not yet been implemented, then `emulated` is +returned. Otherwise `unsupported` is returned if the SYCL device doesn't +support using this graph extension. + +|=== + + === Node :crs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics @@ -548,6 +586,9 @@ Exceptions: * Throws synchronously with error code `invalid` if `syclDevice` is not associated with `syclContext`. 
+* Throws synchronously with error code `invalid` if `syclDevice` + <>. + |=== Table {counter: tableNumber}. Member functions of the `command_graph` class. From 3c3565ca12e9f626f2c28ab0a2dfa4831aeea4c5 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 30 May 2023 19:34:51 +0100 Subject: [PATCH 69/82] [SYCL][Doc] Address latest round of review comments (#197) * Remove `syclContext` exception from `finalize()` https://github.com/intel/llvm/pull/5626#discussion_r1206813954 * Move `no_host_copy` property next to other properties https://github.com/intel/llvm/pull/5626#discussion_r1205967853 * Remove numeric value from `graph_support_level` enum class https://github.com/intel/llvm/pull/5626#discussion_r1206952141 * Make HTML rendered tables easier to read by making first column narrower https://github.com/intel/llvm/pull/5626#discussion_r1206828902 * In `propList` parameter description for `add` mention the `depends_on` property https://github.com/intel/llvm/pull/5626#discussion_r1206972805 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 1d46317e32f6b..9a68a26325f4c 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -310,6 +310,10 @@ class no_cycle_check { no_cycle_check() = default; }; +class no_host_copy { +public: + no_host_copy() = default; +}; } // namespace graph namespace node { @@ -329,7 +333,7 @@ namespace device { struct graphs_support; enum class graph_support_level { - unsupported = 0, + unsupported, native, emulated }; @@ -345,17 +349,6 @@ enum class graph_state { executable }; -namespace property { -namespace graph { - -class no_host_copy { -public: - no_host_copy() = default; -}; - -} // namespace graph -} // namespace property - // New object 
representing graph template class command_graph {}; @@ -557,7 +550,8 @@ Table {counter: tableNumber}. Constructor of the `command_graph` class. | [source,c++] ---- -command_graph(const context& syclContext, const device& syclDevice, +command_graph(const context& syclContext, + const device& syclDevice, const property_list& propList = {}); ---- |Creates a SYCL `command_graph` object in the modifiable state for context @@ -613,7 +607,9 @@ Preconditions: Parameters: * `propList` - Zero or more properties can be provided to the constructed node - via an instance of `property_list`. + via an instance of `property_list`. The `property::node::depends_on` property + can be passed here with a list of nodes to create dependency edges on. + Returns: The empty node which has been added to the graph. @@ -646,7 +642,8 @@ Parameters: * `cgf` - Command group function object to be added as a node. * `propList` - Zero or more properties can be provided to the constructed node - via an instance of `property_list`. + via an instance of `property_list`. The `property::node::depends_on` property + can be passed here with a list of nodes to create dependency edges on. Returns: The command-group function object node which has been added to the graph. @@ -684,7 +681,7 @@ Exceptions: * Throws synchronously with error code `invalid` if `src` and `dest` are the same node. - + * Throws synchronously with error code `invalid` if the resulting dependency would lead to a cycle. This error is omitted when `property::graph::no_cycle_check` is set. @@ -714,11 +711,6 @@ Parameters: Returns: A new executable graph object which can be submitted to a queue. -Exceptions: - -* Throws synchronously with error code `invalid` if the graph contains a - node which targets a device not present in `syclContext`. - |=== Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. @@ -729,7 +721,9 @@ Table {counter: tableNumber}. 
Member functions of the `command_graph` class for | [source, c++] ---- -bool begin_recording(queue& recordingQueue, const property_list& propList = {}) +bool +begin_recording(queue& recordingQueue, + const property_list& propList = {}) ---- |Synchronously changes the state of `recordingQueue` to the @@ -758,8 +752,9 @@ Exceptions: | [source, c++] ---- -bool begin_recording(const std::vector& recordingQueues, - const property_list& propList = {}) +bool +begin_recording(const std::vector& recordingQueues, + const property_list& propList = {}) ---- |Synchronously changes the state of each queue in `recordingQueues` to the @@ -851,7 +846,8 @@ Table {counter: tableNumber}. Member functions of the `command_graph` class (exe | [source, c++] ---- -void command_graph update(const command_graph& graph); +void +update(const command_graph& graph); ---- |Updates the executable graph node inputs & outputs from a topologically @@ -900,7 +896,7 @@ on construction via the property list parameter. 1. `property::graph::no_cycle_check` - This property disables any checks if a newly added dependency will lead to a cycle in a specific `command_graph`. As a result, no errors are reported when a function tries to create a cyclic - dependency. Thus, it's the user's responsibility to create an acyclic graph + dependency. Thus, it's the user's responsibility to create an acyclic graph for execution when this property is set. Creating a `command_graph` in executable state through `finalize` from a graph with cyclic dependencies is not allowed and results in undefined behavior. @@ -994,7 +990,8 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::queue` c | [source,c++] ---- -queue_state queue::ext_oneapi_get_state() const; +queue_state +queue::ext_oneapi_get_state() const; ---- | Query the <> of the queue. @@ -1006,7 +1003,8 @@ object. 
| [source,c++] ---- -event queue::ext_oneapi_graph(command_graph& graph) +event +queue::ext_oneapi_graph(command_graph& graph) ---- |Queue shortcut function that is equivalent to submitting a command-group @@ -1019,7 +1017,8 @@ all the nodes have finished execution. | [source,c++] ---- -event queue::ext_oneapi_graph(command_graph& graph, +event +queue::ext_oneapi_graph(command_graph& graph, event depEvent); ---- @@ -1034,7 +1033,8 @@ all the nodes have finished execution. | [source,c++] ---- -event queue::ext_oneapi_graph(command_graph& graph, +event +queue::ext_oneapi_graph(command_graph& graph, const std::vector& depEvents); ---- @@ -1058,7 +1058,8 @@ Table {counter: tableNumber}. Additional member functions of the `sycl::handler` | [source,c++] ---- -void handler::ext_oneapi_graph(command_graph& graph) +void +handler::ext_oneapi_graph(command_graph& graph) ---- |Invokes the execution of a graph. Only one instance of `graph` may be executing, @@ -1547,7 +1548,7 @@ submitted in its entirety for execution via Enable an instance of a graph in executable state to be submitted for execution when a previous submission of the same graph has yet to complete execution. -**UNRESOLVED:** Trending "yes". Backend support for this is inconsistent, but +**UNRESOLVED:** Trending "yes". Backend support for this is inconsistent, but the runtime could schedule the submissions sequentially for backends which don't support it. @@ -1555,8 +1556,8 @@ support it. Allow an executable graph to contain nodes targeting different devices. -**UNRESOLVED:** Trending "yes". This feature is something that we are considering -introducing into the extension in later revisions. It has been planned for to the +**UNRESOLVED:** Trending "yes". This feature is something that we are considering +introducing into the extension in later revisions. It has been planned for to the extent that the definition of a graph node is device specific. 
=== Memory Allocation API @@ -1571,11 +1572,11 @@ some designs being considered. === Device Agnostic Graph -Explicit API could support device-agnostic graphs that can be submitted -through queues to a particular device. This issue is related to multi-device +Explicit API could support device-agnostic graphs that can be submitted +through queues to a particular device. This issue is related to multi-device graphs. -**UNRESOLVED:** Trending "no". Because of current runtime limitations this +**UNRESOLVED:** Trending "no". Because of current runtime limitations this can't be implemented with a reasonable effort. === Execution Property @@ -1594,9 +1595,9 @@ runtime how to schedule a command graph onto a device. This info could effect the scheduling policy like breadth or depth-first, or a combination with a block size. -**UNRESOLVED:** Trending "yes". A new property could be added to +**UNRESOLVED:** Trending "yes". A new property could be added to the finalize call either extending the basic command graph proposal -or layered as a separate extension proposal. +or layered as a separate extension proposal. 
== Revision History From a4f54098756adab691df737bb42c3171b8fd300f Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 31 May 2023 09:45:45 -0500 Subject: [PATCH 70/82] [SYCL] Apply suggestions from code review (#200) * [SYCL] Collect recording restrictions into single section --------- Co-authored-by: Ewan Crawford --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 230 +++++++++--------- 1 file changed, 119 insertions(+), 111 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 9a68a26325f4c..202a5ed4bf1c6 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -624,13 +624,15 @@ Exceptions: template node add(T cgf, const property_list& propList = {}); ---- -|This function adds a command group function object to a graph. The function -object statically contains a group of commands, of which a single command is -executed at runtime. A command group function object may submit any command -as defined by the core SYCL specification, and any SYCL extensions unless explicitly -stated otherwise in <>. -The requisites of `cgf` will be used to identify any dependent nodes in the graph -to form edges with. +|The `cgf` command group function behaves in much the same way as the command +group function passed to `queue::submit` unless explicitly stated otherwise in +<>. Code in the +function is executed synchronously, before the function returns back to +`command_graph::add`, with the exception of any SYCL commands (e.g. kernels, +host tasks, or explicit memory copy operations). These commands are captured +into the graph and executed asynchronously when the graph is submitted to a +queue. The requisites of `cgf` will be used to identify any dependent nodes in +the graph to form edges with. Preconditions: @@ -934,21 +936,6 @@ graph LR Recording -->|End Recording| Executing .... 
-Events returned from queue submissions when a queue is in the recording state -may only be used as parameters to `handler::depends_on()` or as dependent -events for queue shortcuts like `queue::parallel_for()` for submissions which -are being recorded to the same modifiable `command_graph`. - -Calling `event::get_info()` or -`event::get_profiling_info()` on an event returned from a queue submission -recorded to a graph will throw synchronously with error code `invalid`. - -Waiting on an event returned from a queue submission recorded to a graph -will throw synchronously with error code `invalid`. - -Waiting on a queue in the recording state is an error and will throw -synchronously with error code `invalid`. - ==== Queue Properties :queue-properties: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:queue-properties @@ -1110,21 +1097,6 @@ As a result, users don't need to manually wrap queue recording code in a back to the executing state. Instead, an uncaught exception destroying the modifiable graph will perform this action, useful in RAII pattern usage. -=== Error Handling - -When a queue is in recording mode asynchronous exceptions will not be -generated, as no device execution is occurring. Synchronous errors specified as -being thrown in the default queue executing state, will still be thrown when a -queue is in the recording state. Queue query methods operate as usual in -recording mode, as opposed to throwing. - -The `command_graph::begin_recording` and `command_graph::end_recording` -entry-points return a `bool` value informing the user whether a related queue -state change occurred. False is returned rather than throwing an exception when -no queue state is changed. This design is because the queues are already in -the state the user desires, so if the function threw an exception in this case, -the application would likely swallow it and then proceed. 
- === Storage Lifetimes [[storage-lifetimes]] The lifetime of any buffer recorded as part of a submission @@ -1149,25 +1121,6 @@ The copy will not occur if the buffer is accessed inside the submitted command group function with an accessor with `access_mode::write` or the `no_init` property. -=== Buffer Limitations for Record & Replay API - -Because of the delayed execution of a recorded graph, it is not possible to support -captured code which relies on the copy-back on destruction behavior of buffers. -Typically, applications would rely on this behavior to do work on the host which -cannot inherently be captured inside a command graph. Thus, when recording to a graph -it is an error to submit a command which has an accessor on a buffer which would -cause a write-back to happen. Using an incompatible buffer in this case will result -in a synchronous error being thrown with error code `invalid`. - -The copy-back mechanism can be disabled explicitly for buffers with attached host -storage using either `buffer::set_final_data(nullptr)` or -`buffer::set_write_back(false)`. - -It is also an error to create a host accessor to a buffer which is used in -commands which are currently being recorded to a command graph. Attempting to -construct a host accessor to an incompatible buffer will result in a -synchronous error being thrown with error code `invalid`. - === Host Tasks :host-task: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:interfaces.hosttasks @@ -1181,16 +1134,6 @@ device at once. Host tasks can be updated as part of <> by replacing the whole node with the new callable. -=== Command Group Function Evaluation - -Host code within a command group function object is evaluated when the command -group is added to a graph. 
This is either before the return of the call to -`command_graph::add()` when using the explicit API or before the return of the call to -`queue::submit()` when submitting a command group to a queue that is recording to a graph. - -This does not apply to code within a {host-task}[host task] which is -evaluated as normal during command graph execution. - [source,c++] ---- auto node = graph.add([&](sycl::handler& cgh){ @@ -1201,62 +1144,68 @@ auto node = graph.add([&](sycl::handler& cgh){ }); ---- -=== Memory Allocation Nodes +=== Queue Behavior In Recording Mode -There is no provided interface for users to define a USM allocation/free -operation belonging to the scope of the graph. It would be error prone and -non-performant to allocate or free memory as a node executed during graph -submission. Instead, such a memory allocation API needs to provide a way to -return a pointer which won't be valid until the allocation is made on graph -finalization, as allocating at finalization is the only way to benefit from -the known graph scope for optimal memory allocation, and even optimize to -eliminate some allocations entirely. +When a queue is placed in recording mode via a call to `command_graph::begin_recording`, +some features of the queue are no longer available because the commands are not +executed during this mode. The general philosophy is to throw an exception at +runtime when a feature is not available, so that there is an obvious indication +of failure. The following list describes the behavior that changes during +recording mode. Features not listed below behave the same in recording mode as +they do in non-recording mode. -Such a deferred allocation strategy presents challenges however, and as a result -we recommend instead that prior to graph construction users perform core SYCL -USM allocations to be used in the graph submission. 
Before to coming to this -recommendation we considered the following explicit graph building interfaces -for adding a memory allocation owned by the graph: +==== Event Limitations -1. Allocation function returning a reference to the raw pointer, i.e. `void*&`, - which will be instantiated on graph finalization with the location of the - allocated USM memory. +Events returned from queue submissions when a queue is in the recording state +may only be used as parameters to `handler::depends_on()` or as dependent +events for queue shortcuts like `queue::parallel_for()` for submissions which +are being recorded to the same modifiable `command_graph`. -2. Allocation function returning a handle to the allocation. Applications use - the handle in node command-group functions to access memory when allocated. +- Calling `event::get_info()` or +`event::get_profiling_info()` on an event returned from a queue submission +recorded to a graph will throw synchronously with error code `invalid`. -3. Allocation function returning a pointer to a virtual allocation, only backed - with an actual allocation when graph is finalized or submitted. +- Waiting on an event returned from a queue submission recorded to a graph +will throw synchronously with error code `invalid`. -Design 1) has the drawback of forcing users to keep the user pointer variable -alive so that the reference is valid, which is unintuitive and is likely to -result in bugs. +- Waiting on a queue in the recording state is an error and will throw +synchronously with error code `invalid`. -Design 2) introduces a handle object which has the advantages of being a less -error prone way to provide the pointer to the deferred allocation. However, it -requires kernel changes and introduces an overhead above the raw pointers that -are the advantage of USM. +==== Buffer Limitations -Design 3) needs specific backend support for deferred allocation. 
+Because of the delayed execution of a recorded graph, it is not possible to support +captured code which relies on the copy-back on destruction behavior of buffers. +Typically, applications would rely on this behavior to do work on the host which +cannot inherently be captured inside a command graph. -=== Device Specific Graph +- Thus, when recording to a graph it is an error to submit a command which has +an accessor on a buffer which would cause a write-back to happen. Using an +incompatible buffer in this case will result in a synchronous error being +thrown with error code `invalid`. -A modifiable state `command_graph` contains nodes targeting specific devices, -rather than being a device agnostic representation only tied to devices on -finalization. This allows the implementation to process nodes which require -device information when the command group function is evaluated. For example, -a SYCL reduction implementation may desire the work-group/sub-group size, which -is normally gathered by the runtime from the device associated with the queue. +- The copy-back mechanism can be disabled explicitly for buffers with attached host +storage using either `buffer::set_final_data(nullptr)` or +`buffer::set_write_back(false)`. -This design also enables the future capability for a user to compose a graph -with nodes targeting different devices, allowing the benefits of defining an -execution graph ahead of submission to be extended to multi-device platforms. -Without this capability a user currently has to submit individual single-device -graphs and use events for dependencies, which is a usage model this extension is -aiming to optimize. Automatic load balancing of commands across devices is not a -problem this extension currently aims to solve, it is the responsibility of the -user to decide the device each command will be processed for, not the SYCL -runtime. 
+- It is also an error to create a host accessor to a buffer which is used in +commands which are currently being recorded to a command graph. Attempting to +construct a host accessor to an incompatible buffer will result in a +synchronous error being thrown with error code `invalid`. + +==== Error Handling + +When a queue is in recording mode asynchronous exceptions will not be +generated, as no device execution is occurring. Synchronous errors specified as +being thrown in the default queue executing state, will still be thrown when a +queue is in the recording state. Queue query methods operate as usual in +recording mode, as opposed to throwing. + +The `command_graph::begin_recording` and `command_graph::end_recording` +entry-points return a `bool` value informing the user whether a related queue +state change occurred. False is returned rather than throwing an exception when +no queue state is changed. This design is because the queues are already in +the state the user desires, so if the function threw an exception in this case, +the application would likely swallow it and then proceed. === Interaction With Other Extensions [[extension-interaction]] @@ -1541,6 +1490,65 @@ submitted in its entirety for execution via ---- +== Future Direction + +=== Memory Allocation Nodes + +There is no provided interface for users to define a USM allocation/free +operation belonging to the scope of the graph. It would be error prone and +non-performant to allocate or free memory as a node executed during graph +submission. Instead, such a memory allocation API needs to provide a way to +return a pointer which won't be valid until the allocation is made on graph +finalization, as allocating at finalization is the only way to benefit from +the known graph scope for optimal memory allocation, and even optimize to +eliminate some allocations entirely. 
+ +Such a deferred allocation strategy presents challenges however, and as a result +we recommend instead that prior to graph construction users perform core SYCL +USM allocations to be used in the graph submission. Before coming to this +recommendation we considered the following explicit graph building interfaces +for adding a memory allocation owned by the graph: + +1. Allocation function returning a reference to the raw pointer, i.e. `void*&`, + which will be instantiated on graph finalization with the location of the + allocated USM memory. + +2. Allocation function returning a handle to the allocation. Applications use + the handle in node command-group functions to access memory when allocated. + +3. Allocation function returning a pointer to a virtual allocation, only backed + with an actual allocation when graph is finalized or submitted. + +Design 1) has the drawback of forcing users to keep the user pointer variable +alive so that the reference is valid, which is unintuitive and is likely to +result in bugs. + +Design 2) introduces a handle object which has the advantages of being a less +error prone way to provide the pointer to the deferred allocation. However, it +requires kernel changes and introduces an overhead above the raw pointers that +are the advantage of USM. + +Design 3) needs specific backend support for deferred allocation. + +=== Device Specific Graph + +A modifiable state `command_graph` contains nodes targeting specific devices, +rather than being a device agnostic representation only tied to devices on +finalization. This allows the implementation to process nodes which require +device information when the command group function is evaluated. For example, +a SYCL reduction implementation may desire the work-group/sub-group size, which +is normally gathered by the runtime from the device associated with the queue.
+ +This design also enables the future capability for a user to compose a graph +with nodes targeting different devices, allowing the benefits of defining an +execution graph ahead of submission to be extended to multi-device platforms. +Without this capability a user currently has to submit individual single-device +graphs and use events for dependencies, which is a usage model this extension is +aiming to optimize. Automatic load balancing of commands across devices is not a +problem this extension currently aims to solve, it is the responsibility of the +user to decide the device each command will be processed for, not the SYCL +runtime. + == Issues === Simultaneous Graph Submission From 544a75199c5a46dbe71ddd4bde018aee40a57a5b Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 31 May 2023 15:46:33 +0100 Subject: [PATCH 71/82] [SYCL][Doc] Expand on update() error wording for invalid device (#204) Actions specification feedback from https://github.com/intel/llvm/pull/5626#discussion_r1206832653 --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 202a5ed4bf1c6..6559b4028e3ad 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -886,8 +886,9 @@ Exceptions: which is not a kernel command or host-task, e.g. {handler-copy-functions}[memory operations]. -* Throws synchronously with error code `invalid` if the context associated with - `graph` does not match that of the `command_graph` being updated. +* Throws synchronously with error code `invalid` if the context or device + associated with `graph` does not match that of the `command_graph` being + updated.
|=== ==== Graph Properties From 342c51bf5f524fd9b8b01476acf284115cc792ff Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 7 Jun 2023 08:24:43 -0500 Subject: [PATCH 72/82] [SYCL][DOC] Remove duplicate section on graph properties (#209) Update sycl_ext_oneapi_graph.asciidoc --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 6559b4028e3ad..b6fa73df13d59 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -530,6 +530,17 @@ associated with a buffer that was created using a host data pointer will outlive any executable graphs created from a modifiable graph which uses that buffer. +==== No-Cycle Check + +The `property::graph::no_cycle_check` property disables any checks if a newly +added dependency will lead to a cycle in a specific `command_graph` and can be +passed to a `command_graph` on construction via the property list parameter. +As a result, no errors are reported when a function tries to create a cyclic +dependency. Thus, it's the user's responsibility to create an acyclic graph +for execution when this property is set. Creating a `command_graph` in +executable state through `finalize` from a graph with cyclic dependencies +is not allowed and results in undefined behavior. + ==== Executable Graph Update A graph in the executable state can have each nodes inputs & outputs updated @@ -891,19 +902,6 @@ Exceptions: updated. |=== -==== Graph Properties - -There is the one following property defined that can be passed to a `command_graph` -on construction via the property list parameter. - -1. `property::graph::no_cycle_check` - This property disables any checks if - a newly added dependency will lead to a cycle in a specific `command_graph`. 
- As a result, no errors are reported when a function tries to create a cyclic - dependency. Thus, it's the user's responsibility to create an acyclic graph - for execution when this property is set. Creating a `command_graph` in - executable state through `finalize` from a graph with cyclic dependencies - is not allowed and results in undefined behavior. - === Queue Class Modifications :queue-class: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class From 9e1f8f76f1d6abbebd360a1d6942e7be0412962c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 7 Jun 2023 16:43:35 +0100 Subject: [PATCH 73/82] [SYCL][Graph] code snippet to motivate buffer host copy (#210) Add a copy snippet to illustrate motivating use case behind the need for host data copy when buffers are used as part of a graph. Also updates a UK spelling of "behaviour" to US "behavior" I noticed. --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index b6fa73df13d59..f1422b70843d7 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1108,7 +1108,7 @@ required by the graph (such as after being replaced through executable graph upd If a buffer created with a host data pointer is recorded as part of a submission to a command graph, the lifetime of that host data will also be extended by taking a copy of that data inside the buffer. This copy will occur during the call to -`queue::submit()`. Users can opt-out of this behaviour by passing the property +`queue::submit()`. Users can opt-out of this behavior by passing the property `graph::no_host_copy` to `command_graph::begin_recording()` or the constructor of the `command_graph` the commands will be recorded to. 
Passing the property to `begin_recording()` will prevent host copies only for commands recorded before @@ -1116,6 +1116,23 @@ of the `command_graph` the commands will be recorded to. Passing the property to `command_graph` constructor will prevent host copies for all commands recorded to the graph. +This behavior was introduced to address the case where host data used to create +a buffer is destroyed before the copy to device has happened. For example: +[source,c++] +---- +void foo(queue q /* queue in recording mode */ ) { + float data[NUM]; + buffer buf{data, range{NUM}}; + q.submit([&](handler &cgh) { + accessor acc{buf, cgh, read_only}; + cgh.single_task([] { + // use "acc" + }); + }); + // "data" goes out of scope +} +---- + The copy will not occur if the buffer is accessed inside the submitted command group function with an accessor with `access_mode::write` or the `no_init` property. From 2043866dce0326bd2c9249c0474801d36b891daf Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 16 Jun 2023 15:56:57 +0100 Subject: [PATCH 74/82] [SYCL][Doc] Correct property section level (#232) Fix the section hierarchy of "No Cycle Check" so that is on the same level as "No Host Copy" and under "Graph Properties". Additionally, hyphenate the section header and give "Property" suffix, so that section naming is consistent. 
Resolves feedback: https://github.com/intel/llvm/pull/5626#discussion_r1230045468 --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index f1422b70843d7..f6dedbe60bb10 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -530,7 +530,7 @@ associated with a buffer that was created using a host data pointer will outlive any executable graphs created from a modifiable graph which uses that buffer. -==== No-Cycle Check +===== No-Cycle-Check Property The `property::graph::no_cycle_check` property disables any checks if a newly added dependency will lead to a cycle in a specific `command_graph` and can be From 6a625891126c26917252003a8bbf49a9c4dcb65c Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 19 Jun 2023 07:15:59 -0500 Subject: [PATCH 75/82] [SYCL][DOC] Incorporate feedback from PR (#231) * [SYCL][DOC] Incorporate feedback from PR Addresses https://github.com/intel/llvm/pull/5626#discussion_r1230095503 https://github.com/intel/llvm/pull/5626#discussion_r1230102941 Co-authored-by: Ben Tracy Co-authored-by: Ewan Crawford --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index f6dedbe60bb10..a26e6d9d2215c 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -1107,17 +1107,8 @@ required by the graph (such as after being replaced through executable graph upd If a buffer created with a host data pointer is recorded as part of a submission to a command graph, the lifetime of that host data 
will also be extended by taking a -copy of that data inside the buffer. This copy will occur during the call to -`queue::submit()`. Users can opt-out of this behavior by passing the property -`graph::no_host_copy` to `command_graph::begin_recording()` or the constructor -of the `command_graph` the commands will be recorded to. Passing the property to -`begin_recording()` will prevent host copies only for commands recorded before -`end_recording()` is called for a given queue. Passing the property to the -`command_graph` constructor will prevent host copies for all commands recorded to -the graph. - -This behavior was introduced to address the case where host data used to create -a buffer is destroyed before the copy to device has happened. For example: +copy of that data inside the buffer. To illustrate, consider the following example: + [source,c++] ---- void foo(queue q /* queue in recording mode */ ) { @@ -1133,9 +1124,26 @@ void foo(queue q /* queue in recording mode */ ) { } ---- -The copy will not occur if the buffer is accessed inside the submitted command -group function with an accessor with `access_mode::write` or the `no_init` -property. +In this example, the implementation extends the lifetime of the buffer because +it is used in the recorded graph. Because the buffer uses the host memory data, +the implementation also makes an internal copy of that host data. As illustrated +above, that host memory might go out of scope before the recorded graph goes out +of scope, or before the data has been copied to the device. + +The default behavior is to always copy the host data in a case like this, but +this is not necessary if the user knows that the lifetime of the host data +outlives the lifetime of the recorded graph. If the user knows this is the +case, they may use the `graph::no_host_copy` property to avoid the internal +copy. 
Passing the property to `begin_recording()` will prevent host copies only +for commands recorded before `end_recording()` is called for a given queue. +Passing the property to the `command_graph` constructor will prevent host copies +for all commands recorded to the graph. + +The implementation guarantees that the host memory will not be copied internally +if all the commands accessing this buffer use `access_mode::write` or the +`no_init` property because the host memory is not needed in these cases. +Note, however, that these cases require the application to disable copy-back +as described in <<Buffer Limitations>>. === Host Tasks From d3ed070e64b0b60e7d8d87595b3460d61530ed7c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 19 Jun 2023 13:17:05 +0100 Subject: [PATCH 76/82] [SYCL][Doc] Define allowed differences in graph update (#233) * [SYCL][Doc] Define allowed differences in graph update Define what characteristic of a graph can be different when performing executable graph update. Resolves feedback https://github.com/intel/llvm/pull/5626#discussion_r1230114283 * Throw an error on change of device code in update * Update sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc Co-authored-by: Ben Tracy Co-authored-by: Pablo Reble --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index a26e6d9d2215c..2a4f65fe975f3 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -871,6 +871,19 @@ same command, targeting the same device. There is the additional limitation that to update an executable graph, every node in the graph must be either a kernel command or a host-task.
+The only characteristics that can differ between two topologically identical +graphs during an update are: + +* The arguments to kernel nodes may be different between graphs. For example, + the graph may capture different values for the USM pointers or accessors used + in the graph. It is these kernels arguments in `graph` that constitute the + inputs & outputs to update to. + +* The code that is executed in host-task nodes may be different between graphs, + and the update will reflect the code from `graph` host-tasks nodes. However, + any changes to executable code between kernel nodes are not counted as + topologically identical and is an exception with error code `invalid`. + The effects of the update will be visible on the next submission of the executable graph without the need for additional user synchronization. From 9ebc7d518300dcf1145ad77290739098f5098c3e Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 5 Jul 2023 12:14:58 +0100 Subject: [PATCH 77/82] [SYCL] Update wording around updating kernel/host code (#248) - Make modifying kernel or host code in update undefined behaviour. - Use host task consistently in spec --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 2a4f65fe975f3..e81c7d71a5006 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -851,6 +851,8 @@ Exceptions: |=== +:sycl-kernel-function: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sycl-kernel-function + Table {counter: tableNumber}. Member functions of the `command_graph` class (executable graph update). 
[cols="2a,a"] |=== @@ -863,26 +865,26 @@ void update(const command_graph& graph); ---- + |Updates the executable graph node inputs & outputs from a topologically identical modifiable graph. A topologically identical graph is one with the same structure of nodes and edges, and the nodes added in the same order to both graphs. Equivalent nodes in topologically identical graphs each have the same command, targeting the same device. There is the additional limitation that to update an executable graph, every node in the graph must be either a kernel -command or a host-task. +command or a host task. The only characteristics that can differ between two topologically identical -graphs during an update are: +graphs during an update are the arguments to kernel nodes. For example, +the graph may capture different values for the USM pointers or accessors used +in the graph. It is these kernels arguments in `graph` that constitute the +inputs & outputs to update to. -* The arguments to kernel nodes may be different between graphs. For example, - the graph may capture different values for the USM pointers or accessors used - in the graph. It is these kernels arguments in `graph` that constitute the - inputs & outputs to update to. +Differences in the following characteristics between two graphs during an +update results in undefined behavior: -* The code that is executed in host-task nodes may be different between graphs, - and the update will reflect the code from `graph` host-tasks nodes. However, - any changes to executable code between kernel nodes are not counted as - topologically identical and is an exception with error code `invalid`. +* Modifying the native C++ callable of a `host task` node. +* Modifying the {sycl-kernel-function}[kernel function] of a kernel node. The effects of the update will be visible on the next submission of the executable graph without the need for additional user synchronization. 
@@ -907,7 +909,7 @@ Exceptions: :handler-copy-functions: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#table.members.handler.copy * Throws synchronously with error code `invalid` if `graph` contains any node - which is not a kernel command or host-task, e.g. + which is not a kernel command or host task, e.g. {handler-copy-functions}[memory operations]. * Throws synchronously with error code `invalid` if the context or device From 1a42a442c21c2338c83b3b17de097408692662eb Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 11 Jul 2023 12:25:25 +0100 Subject: [PATCH 78/82] [SYCL][Doc] Address superficial Graphs spec feedback (#253) Gordon and Greg left some spec feedback on our upstream PR, address the superficial comments. --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index e81c7d71a5006..4cd391377e0b1 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -19,7 +19,7 @@ == Notice -Copyright (c) 2022 Intel Corporation. All rights reserved. +Copyright (c) 2022-2023 Intel Corporation. All rights reserved. NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. @@ -67,10 +67,10 @@ not rely on APIs defined in this specification.* == Introduction -With command groups SYCL is already able to create a dependency +With command groups SYCL is already able to create an implicit dependency graph (in the form of a directed acyclic graph) of kernel execution at runtime, as a command group object defines a set of requisites (edges) which must be -satisfied for kernels (nodes) to be executed. However, because command-group +satisfied for commands (nodes) to be executed. 
However, because command-group submission is tied to execution on the queue, without having a prior construction step before starting execution, optimization opportunities are missed from the runtime not being made aware of a defined dependency graph ahead @@ -85,7 +85,7 @@ dependency graph to the SYCL runtime prior to execution: * Enable more work to be done ahead of time to improve runtime performance. This early work could be done in a setup phase of the program prior to repeated executions of the graph. Alternately, a future offline AOT compiler in a different - process could run be prior to the execution of the application. + process could be run prior to the execution of the application. * Unlock DMA hardware features through graph analysis by the runtime. @@ -115,7 +115,7 @@ requirements were considered: nodes. 4. Integrate sub-graphs (previously constructed graphs) when constructing a new graph. -5. Support the USM model of memory as well as buffer model. +5. Support the USM model of memory as well as buffer/accessor model. 6. Compatible with other SYCL extensions and features, e.g. kernel fusion & built-in kernels. 7. Ability to record a graph with commands submitted to different devices in the @@ -498,7 +498,7 @@ An instance of a `command_graph` object can be in one of two states: * **Executable** - Graph topology is fixed after finalization and graph is ready to be submitted for execution. -A `command_graph` object is constructed in the _recording_ state and is made +A `command_graph` object is constructed in the _modifiable_ state and is made _executable_ by the user invoking `command_graph::finalize()` to create a new executable instance of the graph. An executable graph cannot be converted to a modifiable graph. After finalizing a graph in the modifiable state, it is @@ -874,7 +874,7 @@ same command, targeting the same device. 
There is the additional limitation that to update an executable graph, every node in the graph must be either a kernel command or a host task. -The only characteristics that can differ between two topologically identical +The only characteristic that can differ between two topologically identical graphs during an update are the arguments to kernel nodes. For example, the graph may capture different values for the USM pointers or accessors used in the graph. It is these kernels arguments in `graph` that constitute the From 68c5b42999ccb861849d593ef15183885569f6b8 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 11 Jul 2023 18:59:58 +0100 Subject: [PATCH 79/82] [SYCL][Doc] Change `graph_support_level` namespace (#255) * [SYCL][Doc] Change `graph_support_level` namespace Address [Steffen's feedback](https://github.com/intel/llvm/pull/5626#discussion_r1259846186) and move the `info::device::graph_support_level` to enum up a namespace level to `info::graph_support_level`. Using `info::device::graph_support` to return a `info::graph_support_level` is analogous with the `info::device::device_type` query in the main spec which returns a `info::device_type`. Or `info::device::local_mem_type` which returns a `info::local_mem_type`. 
* Add Steffen as a contributor --- sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 4cd391377e0b1..3833576274699 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -40,6 +40,7 @@ John Pennycook, Intel + Guo Yejun, Intel + Dan Holmes, Intel + Greg Lueck, Intel + +Steffen Larsen, Intel + Ewan Crawford, Codeplay + Ben Tracy, Codeplay + Duncan McBain, Codeplay + @@ -330,15 +331,16 @@ class depends_on { // Device query for level of support namespace info { namespace device { + struct graphs_support; +} // namespace device + enum class graph_support_level { unsupported, native, emulated }; - -} // namespace device } // namespace info class node {}; From 498686c6ddce3eeeb8fea991a3e3a23d36739e95 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 11 Jul 2023 19:01:21 +0100 Subject: [PATCH 80/82] [SYCL][Doc] Simplify queue recorded node definition (#254) Simplify the definition of a node in the Record & Replay API. Move the wording around sub-graphs to the sub-graph section, and use "command" terminology rather than "kernel" to be more generic. 
Actions Gordon's feedback from: * Lack of clarity over inclusion of queue shortcut functions https://github.com/intel/llvm/pull/5626#discussion_r1255811915 * Say "command" rather than "kernel" https://github.com/intel/llvm/pull/5626#discussion_r125 --- .../proposed/sycl_ext_oneapi_graph.asciidoc | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 3833576274699..08c66e0175040 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -200,8 +200,6 @@ Table {counter: tableNumber}. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. === SYCL Graph Terminology -:explicit-memory-ops: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:explicitmemory - Table {counter: tableNumber}. Terminology. [%header,cols="1,3"] |=== @@ -231,8 +229,8 @@ Table {counter: tableNumber}. Explicit Graph Definition. | Node | In the explicit graph building API nodes are created by the user invoking -methods on a modifiable graph. Each node represents either a command-group -function or an empty operation. +methods on a modifiable graph passing a command-group function (CGF). Each node +represents either a command-group or an empty operation. | Edge | In the explicit graph building API edges are primarily defined by the user @@ -258,17 +256,14 @@ Table {counter: tableNumber}. Recorded Graph Definition. | Concept | Description | Node -| A node in a queue recorded graph represents a command group submission to the -device associated with the queue begin recorded. Each submission encompasses -either one or both of a.) some data movement, b.) a single asynchronous kernel -launch. Nodes cannot define forward edges, only backwards. That is, kernels can -only create dependencies on command-groups that have already been submitted. 
-This means that transparently a node can depend on a previously recorded graph -(sub-graph), which works by creating edges to the individual nodes in the old -graph. Explicit memory operations without kernels, such as a memory copy, are -still classed as nodes under this definition, as the -{explicit-memory-ops}[SYCL 2020 specification states] that these can be seen as -specialized kernels executing on the device. +| A node in a queue recorded graph represents a command-group submission to the +device associated with the queue being recorded. Nodes are constructed from +the command-group functions (CGF) passed to `queue::submit()`, or from the queue +shortcut equivalents for the defined handler command types. Each submission +encompasses either one or both of a.) some data movement, b.) a single +asynchronous command launch. Nodes cannot define forward edges, only backwards. +That is, nodes can only create dependencies on command-groups that have already +been submitted. | Edge | An edge in a queue recorded graph is expressed through command group @@ -287,7 +282,13 @@ buffers. A node in a graph can take the form of a nested sub-graph. This occurs when a command-group submission that invokes `handler::ext_oneapi_graph()` with an -executable graph object is added to the graph as a node. +executable graph object is added to the graph as a node. The child graph node is +scheduled in the parent graph as-if edges are created to connect the root nodes +of the child graph with the dependent nodes of the parent graph. + +Adding an executable graph as a sub-graph does not affect its existing node +dependencies, such that it could be submitted in future without any side +effects of prior uses as a sub-graph. 
=== API Modifications From 816777b8bb42cc2f039f81553ce2004bc661773d Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 12 Jul 2023 14:54:27 +0100 Subject: [PATCH 81/82] [SYCL][Doc] Rephrase "recording to" (#257) [Review feedback](https://github.com/intel/llvm/pull/5626#discussion_r1260071204) questioned the phrase "graph is currently being recorded to" as to whether "to" should be "too". I don't think it should be "too", but rephrasing to "graph is currently recording commands from any queue" avoids any confusion. Also updated the contributors to add Maxime and Jaime (who made the comment). --- .../extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index 08c66e0175040..ecf2d8b846568 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -41,6 +41,7 @@ Guo Yejun, Intel + Dan Holmes, Intel + Greg Lueck, Intel + Steffen Larsen, Intel + +Jaime Arteaga Molina, Intel + Ewan Crawford, Codeplay + Ben Tracy, Codeplay + Duncan McBain, Codeplay + @@ -50,6 +51,7 @@ Gordon Brown, Codeplay + Erik Tomusk, Codeplay + Bjoern Knafla, Codeplay + Lukas Sommer, Codeplay + +Maxime France-Pillois, Codeplay + Ronan Keryell, AMD + == Dependencies @@ -169,13 +171,13 @@ what data is internal to the graph for optimization, and dependencies don't need to be inferred. 
It is valid to combine these two mechanisms, however it is invalid to modify -a graph using the explicit API while that graph is currently being recorded to, -for example: +a graph using the explicit API while that graph is currently recording commands +from any queue, for example: [source, c++] ---- graph.begin_recording(queue); -graph.add(/*command group*/); // Invalid as graph is being recorded to +graph.add(/*command group*/); // Invalid as graph is recording a queue graph.end_recording(); ---- From b03b6ade9cb972b33f65ea5ccbf27d966db5f764 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 14 Jul 2023 19:04:02 +0100 Subject: [PATCH 82/82] [SYCL][Graph] Revise undefined behaviour with no_cycle_check (#263) - Change wording around undefined behaviour when creating a cycle with no checks - UB is created at point of adding the cycle, not at finalize. --- .../doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc index ecf2d8b846568..178dfa9a4315e 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph.asciidoc @@ -542,9 +542,10 @@ added dependency will lead to a cycle in a specific `command_graph` and can be passed to a `command_graph` on construction via the property list parameter. As a result, no errors are reported when a function tries to create a cyclic dependency. Thus, it's the user's responsibility to create an acyclic graph -for execution when this property is set. Creating a `command_graph` in -executable state through `finalize` from a graph with cyclic dependencies -is not allowed and results in undefined behavior. +for execution when this property is set. Creating a cycle in a `command_graph` +puts that `command_graph` into an undefined state. 
Any further operations +performed on a `command_graph` in this state will result in undefined +behavior. ==== Executable Graph Update