Skip to content

Commit

Permalink
[SYCL][Graph] Update design doc for copy optimization and add test
Browse files Browse the repository at this point in the history
- Update UR tag to include L0 command-buffer copy engine optimization
- Add test which mixes copy and kernel commands
- Update design doc to detail copy engine optimization
  • Loading branch information
mfrancepillois authored and Bensuo committed Jun 10, 2024
1 parent c98a37f commit eb67857
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 8 deletions.
18 changes: 18 additions & 0 deletions sycl/doc/design/CommandGraph.md
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,24 @@ Level Zero:
Future work will include exploring L0 API extensions to improve the mapping of
UR command-buffer to L0 command-list.

#### Copy Engine

For performance considerations, the Unified Runtime Level Zero adapter uses
different Level Zero command-queues to submit compute kernels and memory
operations when the device has a dedicated copy engine. To take advantage of the
copy engine when available, the graph workload can also be split between memory
operations and compute kernels. To achieve this, two graph workload
command-lists live simultaneously in a command-buffer.

When the command-buffer is finalized, memory operations (e.g. buffer copy,
buffer fill, ...) are enqueued in the *copy* command-list while the other
commands are enqueued in the compute command-list. On submission, if not empty,
the *copy* command-list is sent to the main copy command-queue while the compute
command-list is sent to the compute command-queue.

Both are executed concurrently. Synchronization between the command-lists is
handled by Level Zero events.

### CUDA

The SYCL Graph CUDA backend relies on the
Expand Down
10 changes: 2 additions & 8 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git")
# commit f06bc02a24418990ecb9d41471b729f32aebd804
# Merge: 42c0b025 9868e3b0
# Author: Kenneth Benzie (Benie) <[email protected]>
Expand All @@ -110,13 +110,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)

fetch_adapter_source(level_zero
${UNIFIED_RUNTIME_REPO}
# commit 0f118d75f6bfe77eb8d933808f7a125e77ca7358
# Merge: ed4211cd 721d63c6
# Author: Kenneth Benzie (Benie) <[email protected]>
# Date: Mon Jun 10 10:38:10 2024 +0100
# Merge pull request #1629 from Bensuo/ewan/L0_update_host_wait
# Use fence rather than event for sync in L0 command-buffer update
"0f118d75f6bfe77eb8d933808f7a125e77ca7358"
cmd-buf-copy-queue
)

fetch_adapter_source(opencl
Expand Down
102 changes: 102 additions & 0 deletions sycl/test-e2e/Graph/ValidUsage/linear_graph_l0_copy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//

// Tests that the optimization to use the L0 Copy Engine for memory commands
// does not interfere with the linear graph optimization

#include "../graph_common.hpp"

#include <sycl/properties/queue_properties.hpp>

int main() {
queue Queue{{sycl::property::queue::in_order{}}};

using T = int;

const T ModValue = 7;
std::vector<T> DataA(Size), DataB(Size), DataC(Size);

std::iota(DataA.begin(), DataA.end(), 1);
std::iota(DataB.begin(), DataB.end(), 10);
std::iota(DataC.begin(), DataC.end(), 1000);

// Create reference data for output
std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
for (size_t i = 0; i < Iterations; i++) {
for (size_t j = 0; j < Size; j++) {
ReferenceA[j] += ModValue;
ReferenceB[j] = ReferenceA[j];
ReferenceB[j] -= ModValue;
ReferenceC[j] = ReferenceB[j];
ReferenceC[j] += ModValue;
}
}

ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);
T *PtrC = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.copy(DataC.data(), PtrC, Size);
Queue.wait_and_throw();

Graph.begin_recording(Queue);
Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrA[LinID] += ModValue;
});
});

Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); });

Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrB[LinID] -= ModValue;
});
});

Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); });

Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrC[LinID] += ModValue;
});
});

Graph.end_recording();

auto GraphExec = Graph.finalize();

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event =
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.copy(PtrA, DataA.data(), Size, Event);
Queue.copy(PtrB, DataB.data(), Size, Event);
Queue.copy(PtrC, DataC.data(), Size, Event);
Queue.wait_and_throw();

free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);

for (size_t i = 0; i < Size; i++) {
assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
}
}

0 comments on commit eb67857

Please sign in to comment.