Skip to content

Commit

Permalink
[SYCL][Graph] Support for native-command
Browse files Browse the repository at this point in the history
WIP prototype of [sycl_ext_codeplay_enqueue_native_command](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc) support
for SYCL-Graph.

TODO:

* buffer support
* HIP/Level-Zero/OpenCL native tests
* spec wording
  • Loading branch information
EwanC committed Jan 31, 2025
1 parent adeaea8 commit 5a5aef2
Show file tree
Hide file tree
Showing 12 changed files with 296 additions and 10 deletions.
2 changes: 1 addition & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git")
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Expand Down
2 changes: 1 addition & 1 deletion sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# Merge pull request #2578 from Bensuo/ewan/remove_command_ref_counting
#
# Remove command-buffer command handle ref counting
set(UNIFIED_RUNTIME_TAG 14f4a3ba70b91b3adc411ec6bfc8ae86e948a990)
set(UNIFIED_RUNTIME_TAG "ewan/native_command")
11 changes: 11 additions & 0 deletions sycl/include/sycl/detail/backend_traits_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ typedef struct CUctx_st *CUcontext;
typedef struct CUstream_st *CUstream;
typedef struct CUevent_st *CUevent;
typedef struct CUmod_st *CUmodule;
typedef struct CUgraph_st *CUgraph;

// As defined in the CUDA 10.1 header file. This requires CUDA version > 3.2
#if defined(_WIN64) || defined(__LP64__)
Expand Down Expand Up @@ -102,6 +103,16 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, queue> {
using type = CUstream;
};

// Alias for a modifiable SYCL-Graph, used below to map the experimental
// command_graph type to its native CUDA counterpart.
using graph = ext::oneapi::experimental::command_graph<
    ext::oneapi::experimental::graph_state::modifiable>;

// Native input type for constructing a modifiable graph from a CUDA
// backend object: a CUgraph handle.
template <> struct BackendInput<backend::ext_oneapi_cuda, graph> {
  using type = CUgraph;
};

// Native type returned for a modifiable graph on the CUDA backend:
// a CUgraph handle.
template <> struct BackendReturn<backend::ext_oneapi_cuda, graph> {
  using type = CUgraph;
};

} // namespace detail
} // namespace _V1
} // namespace sycl
3 changes: 2 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ enum class node_type {
prefetch = 6,
memadvise = 7,
ext_oneapi_barrier = 8,
host_task = 9
host_task = 9,
native_command = 10
};

/// Class representing a node in the graph, returned by command_graph::add().
Expand Down
3 changes: 0 additions & 3 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1934,9 +1934,6 @@ class __SYCL_EXPORT handler {
void(interop_handle)>::value>
ext_codeplay_enqueue_native_command([[maybe_unused]] FuncT &&Func) {
#ifndef __SYCL_DEVICE_ONLY__
throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_codeplay_enqueue_native_command>();
ext_codeplay_enqueue_native_command_impl(Func);
#endif
}
Expand Down
30 changes: 28 additions & 2 deletions sycl/include/sycl/interop_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class interop_handle {
/// interop_handle.
__SYCL_EXPORT backend get_backend() const noexcept;

/// Returns true if command-group is being added to a graph as a node
__SYCL_EXPORT bool has_graph() const noexcept;

/// Receives a SYCL accessor that has been defined as a requirement for the
/// command group, and returns the underlying OpenCL memory object that is
/// used by the SYCL runtime. If the accessor passed as parameter is not part
Expand Down Expand Up @@ -134,6 +137,26 @@ class interop_handle {
#endif
}

  // Alias for a modifiable SYCL-Graph, used by get_native_graph() below.
  using graph = ext::oneapi::experimental::command_graph<
      ext::oneapi::experimental::graph_state::modifiable>;

  /// Returns the backend-native graph object for the graph that this
  /// command-group is being added to as a node.
  ///
  /// \tparam Backend The backend the caller expects; must match the value
  ///         reported by get_backend(), otherwise an exception is thrown.
  /// \return The native graph handle cast to the backend-specific type.
  template <backend Backend = backend::opencl>
  backend_return_t<Backend, graph> get_native_graph() const {
#ifndef __SYCL_DEVICE_ONLY__
    // TODO: replace the exception thrown below with the SYCL 2020 exception
    // with the error code 'errc::backend_mismatch' when those new exceptions
    // are ready to be used.
    if (Backend != get_backend())
      throw exception(make_error_code(errc::invalid),
                      "Incorrect backend argument was passed");

    // C-style cast required to allow various native types
    return (backend_return_t<Backend, graph>)getNativeGraph();
#else
    // we believe this won't be ever called on device side
    return 0;
#endif
  }

/// Returns the SYCL application interoperability native backend object
/// associated with the device associated with the SYCL queue that the host
/// task was submitted to. The native backend object returned must be in
Expand Down Expand Up @@ -186,8 +209,9 @@ class interop_handle {
interop_handle(std::vector<ReqToMem> MemObjs,
const std::shared_ptr<detail::queue_impl> &Queue,
const std::shared_ptr<detail::device_impl> &Device,
const std::shared_ptr<detail::context_impl> &Context)
: MQueue(Queue), MDevice(Device), MContext(Context),
const std::shared_ptr<detail::context_impl> &Context,
const ur_exp_command_buffer_handle_t &Graph)
: MQueue(Queue), MDevice(Device), MContext(Context), MGraph(Graph),
MMemObjs(std::move(MemObjs)) {}

template <backend Backend, typename DataT, int Dims>
Expand All @@ -211,10 +235,12 @@ class interop_handle {
getNativeQueue(int32_t &NativeHandleDesc) const;
__SYCL_EXPORT ur_native_handle_t getNativeDevice() const;
__SYCL_EXPORT ur_native_handle_t getNativeContext() const;
__SYCL_EXPORT ur_native_handle_t getNativeGraph() const;

std::shared_ptr<detail::queue_impl> MQueue;
std::shared_ptr<detail::device_impl> MDevice;
std::shared_ptr<detail::context_impl> MContext;
ur_exp_command_buffer_handle_t MGraph;

std::vector<ReqToMem> MMemObjs;
};
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode(
std::shared_ptr<node_impl> Node) {

// Queue which will be used for allocation operations for accessors.
// Will also be used in native commands to return to the user in
// `interop_handler::get_native_queue()` calls
auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{},
sycl::property_list{});
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) {
return node_type::host_task;
case sycl::detail::CGType::ExecCommandBuffer:
return node_type::subgraph;
case sycl::detail::CGType::EnqueueNativeCommand:
return node_type::native_command;
default:
assert(false && "Invalid Graph Node Type");
return node_type::empty;
Expand Down
66 changes: 64 additions & 2 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ class DispatchHostTask {
"Host task submissions should have an associated queue");
interop_handle IH{MReqToMem, HostTask.MQueue,
HostTask.MQueue->getDeviceImplPtr(),
HostTask.MQueue->getContextImplPtr()};
HostTask.MQueue->getContextImplPtr(), nullptr};
// TODO: should all the backends that support this entry point use this
// for host task?
auto &Queue = HostTask.MQueue;
Expand Down Expand Up @@ -2879,6 +2879,19 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue,
return Error;
}

namespace {

/// Payload forwarded to the UR native-command entry point when an
/// ext_codeplay_enqueue_native_command CG is added to a command-buffer.
struct CommandBufferNativeCommandData {
  // Handle passed to the user callback.
  sycl::interop_handle ih;
  // User-provided native-command function object.
  std::function<void(interop_handle)> func;
};

/// Trampoline with the C function signature expected by the UR
/// native-command append entry point: unpacks the payload and invokes the
/// user callback with its interop handle.
void CommandBufferInteropFreeFunc(void *InteropData) {
  // static_cast is the correct, checked-at-compile-time cast for void* -> T*.
  auto *Data = static_cast<CommandBufferNativeCommandData *>(InteropData);
  Data->func(Data->ih);
}
} // namespace

ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
assert(MQueue && "Command buffer enqueue should have an associated queue");
// Wait on host command dependencies
Expand Down Expand Up @@ -3045,6 +3058,55 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
MEvent->setSyncPoint(OutSyncPoint);
return UR_RESULT_SUCCESS;
}
case CGType::EnqueueNativeCommand: {
// Queue is created by graph_impl before creating command to submit to
// scheduler.
const AdapterPtr &Adapter = MQueue->getAdapter();
const auto Backend = MQueue->get_device().get_backend();
CGHostTask *HostTask = (CGHostTask *)MCommandGroup.get();

// TODO - Doc this
ur_exp_command_buffer_handle_t ChildCommandBuffer = nullptr;
if (Backend == sycl::backend::ext_oneapi_cuda ||
Backend == sycl::backend::ext_oneapi_hip) {

ur_exp_command_buffer_desc_t Desc{
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC /*stype*/,
nullptr /*pnext*/, false /* updatable */, false /* in-order */,
false /* profilable*/
};
auto ContextImpl = sycl::detail::getSyclObjImpl(MQueue->get_context());
auto DeviceImpl = sycl::detail::getSyclObjImpl(MQueue->get_device());
Adapter->call<sycl::detail::UrApiKind::urCommandBufferCreateExp>(
ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc,
&ChildCommandBuffer);
}

std::vector<interop_handle::ReqToMem> ReqToMem; // TODO work with buffers
interop_handle IH{ReqToMem, HostTask->MQueue,
HostTask->MQueue->getDeviceImplPtr(),
HostTask->MQueue->getContextImplPtr(),
ChildCommandBuffer ? ChildCommandBuffer : MCommandBuffer};
CommandBufferNativeCommandData CustomOpData{
IH, HostTask->MHostTask->MInteropTask};

Adapter->call<UrApiKind::urCommandBufferAppendNativeCommandExp>(
MCommandBuffer, CommandBufferInteropFreeFunc, &CustomOpData,
ChildCommandBuffer, MSyncPointDeps.size(),
MSyncPointDeps.empty() ? nullptr : MSyncPointDeps.data(),
&OutSyncPoint);

if (ChildCommandBuffer) {
ur_result_t Res = Adapter->call_nocheck<
sycl::detail::UrApiKind::urCommandBufferReleaseExp>(
ChildCommandBuffer);
(void)Res;
assert(Res == UR_RESULT_SUCCESS);
}

MEvent->setSyncPoint(OutSyncPoint);
return UR_RESULT_SUCCESS;
}

default:
throw exception(make_error_code(errc::runtime),
Expand Down Expand Up @@ -3416,7 +3478,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
EnqueueNativeCommandData CustomOpData{
interop_handle{ReqToMem, HostTask->MQueue,
HostTask->MQueue->getDeviceImplPtr(),
HostTask->MQueue->getContextImplPtr()},
HostTask->MQueue->getContextImplPtr(), nullptr},
HostTask->MHostTask->MInteropTask};

ur_bool_t NativeCommandSupport = false;
Expand Down
14 changes: 14 additions & 0 deletions sycl/source/interop_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ backend interop_handle::get_backend() const noexcept {
return detail::getImplBackend(MQueue);
}

bool interop_handle::has_graph() const noexcept { return MGraph != nullptr; }

ur_native_handle_t
interop_handle::getNativeMem(detail::Requirement *Req) const {
auto Iter = std::find_if(std::begin(MMemObjs), std::end(MMemObjs),
Expand Down Expand Up @@ -53,5 +55,17 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const {
return MQueue->getNative(NativeHandleDesc);
}

/// Returns the native handle of the UR command-buffer backing the graph this
/// command-group is being added to.
///
/// \throws exception with errc::invalid if the command-group is not being
///         recorded into a graph (i.e. has_graph() is false).
ur_native_handle_t interop_handle::getNativeGraph() const {
  if (!MGraph) {
    throw exception(make_error_code(errc::invalid),
                    "Command-Group is not being added as a graph node");
  }

  // Bind by reference to avoid copying the adapter handle.
  const auto &Adapter = MQueue->getAdapter();
  // Zero-initialize the out-parameter so a failing adapter call cannot
  // leave an indeterminate handle to be returned.
  ur_native_handle_t Handle = 0;
  Adapter->call<detail::UrApiKind::urCommandBufferGetNativeHandleExp>(MGraph,
                                                                      &Handle);
  return Handle;
}
} // namespace _V1
} // namespace sycl
81 changes: 81 additions & 0 deletions sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// RUN: %{build} -g -o %t.out -lcuda
// RUN: %{run} %t.out
// REQUIRES: cuda

#include "../graph_common.hpp"
#include <cuda.h>
#include <sycl/backend.hpp>
#include <sycl/interop_handle.hpp>

// E2E test: record a graph containing a native CUDA memcpy node added via
// the enqueue-native-command extension, execute it, and verify the result.
int main() {
  queue Queue;

  int *PtrX = malloc_device<int>(Size, Queue);
  int *PtrY = malloc_device<int>(Size, Queue);

  exp_ext::command_graph Graph{Queue};

  Graph.begin_recording(Queue);

  // Node A: initialize the source buffer with 0..Size-1 and zero the
  // destination buffer.
  auto EventA = Queue.submit([&](handler &CGH) {
    CGH.single_task([=]() {
      for (size_t i = 0; i < Size; i++) {
        PtrX[i] = i;
        PtrY[i] = 0;
      }
    });
  });

  // Node B: append a native CUDA device-to-device memcpy (PtrX -> PtrY)
  // directly to the underlying CUgraph.
  auto EventB = Queue.submit([&](handler &CGH) {
    CGH.depends_on(EventA);

    CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) {
      if (IH.has_graph()) {
        CUgraph NativeGraph = IH.get_native_graph<backend::ext_oneapi_cuda>();

        // Value-initialize so every unused CUDA_MEMCPY3D field is zero, as
        // the driver API requires. (The previous std::memset relied on
        // <cstring>, which was never included.)
        CUDA_MEMCPY3D Params = {};
        Params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        Params.srcDevice = (CUdeviceptr)PtrX;
        Params.srcHost = nullptr;
        Params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        Params.dstDevice = (CUdeviceptr)PtrY;
        Params.dstHost = nullptr;
        Params.WidthInBytes = Size * sizeof(int);
        Params.Height = 1;
        Params.Depth = 1;

        CUgraphNode Node;
        CUcontext Context = IH.get_native_context<backend::ext_oneapi_cuda>();
        // No dependencies passed: ordering is handled by the SYCL-Graph
        // sync-points of the enclosing native-command node.
        auto Res = cuGraphAddMemcpyNode(&Node, NativeGraph, nullptr, 0, &Params,
                                        Context);
        assert(Res == CUDA_SUCCESS);
      } else {
        assert(false && "Native Handle should have a graph");
      }
    });
  });

  // Node C: double the copied values so the final expected value is 2*i.
  Queue.submit([&](handler &CGH) {
    CGH.depends_on(EventB);
    CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; });
  });

  Graph.end_recording();

  auto ExecGraph = Graph.finalize();
  Queue.ext_oneapi_graph(ExecGraph).wait();

  std::vector<int> HostData(Size);

  // Copy back and verify each element against the expected 2*i.
  Queue.copy(PtrY, HostData.data(), Size).wait();
  for (size_t i = 0; i < Size; i++) {
    const int Ref = i * 2;
    assert(check_value(Ref, HostData[i],
                       std::string("HostData at index ") + std::to_string(i)));
  }

  free(PtrX, Queue);
  free(PtrY, Queue);

  return 0;
}
Loading

0 comments on commit 5a5aef2

Please sign in to comment.