Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Graph] Support for native-command #383

Draft
wants to merge 1 commit into
base: sycl
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git")
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Expand Down
2 changes: 1 addition & 1 deletion sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
# Date: Fri Jan 31 10:15:03 2025 +0000
# Merge pull request #2575 from DBDuncan/duncan/extend-copies
# [CUDA][Bindless] Add support for device to device pitched copies and host to host copies
set(UNIFIED_RUNTIME_TAG e2df8acd37c3c159364cfdf9dfdc1be35b71779e)
set(UNIFIED_RUNTIME_TAG "ewan/native_command")
11 changes: 11 additions & 0 deletions sycl/include/sycl/detail/backend_traits_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ typedef struct CUctx_st *CUcontext;
typedef struct CUstream_st *CUstream;
typedef struct CUevent_st *CUevent;
typedef struct CUmod_st *CUmodule;
typedef struct CUgraph_st *CUgraph;

// As defined in the CUDA 10.1 header file. This requires CUDA version > 3.2
#if defined(_WIN64) || defined(__LP64__)
Expand Down Expand Up @@ -102,6 +103,16 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, queue> {
using type = CUstream;
};

using graph = ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::modifiable>;
template <> struct BackendInput<backend::ext_oneapi_cuda, graph> {
using type = CUgraph;
};

template <> struct BackendReturn<backend::ext_oneapi_cuda, graph> {
using type = CUgraph;
};

} // namespace detail
} // namespace _V1
} // namespace sycl
11 changes: 11 additions & 0 deletions sycl/include/sycl/detail/backend_traits_hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ typedef struct ihipStream_t *HIPstream;
typedef struct ihipEvent_t *HIPevent;
typedef struct ihipModule_t *HIPmodule;
typedef void *HIPdeviceptr;
typedef struct ihipGraph *HIPGraph;

namespace sycl {
inline namespace _V1 {
Expand Down Expand Up @@ -96,6 +97,16 @@ template <> struct BackendReturn<backend::ext_oneapi_hip, queue> {
using type = HIPstream;
};

using graph = ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::modifiable>;
template <> struct BackendInput<backend::ext_oneapi_hip, graph> {
using type = HIPGraph;
};

template <> struct BackendReturn<backend::ext_oneapi_hip, graph> {
using type = HIPGraph;
};

template <> struct InteropFeatureSupportMap<backend::ext_oneapi_hip> {
static constexpr bool MakePlatform = false;
static constexpr bool MakeDevice = true;
Expand Down
10 changes: 10 additions & 0 deletions sycl/include/sycl/detail/backend_traits_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,16 @@ template <> struct BackendReturn<backend::ext_oneapi_level_zero, kernel> {
using type = ze_kernel_handle_t;
};

using graph = ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::modifiable>;
template <> struct BackendInput<backend::ext_oneapi_level_zero, graph> {
using type = ze_command_list_handle_t;
};

template <> struct BackendReturn<backend::ext_oneapi_level_zero, graph> {
using type = ze_command_list_handle_t;
};

template <> struct InteropFeatureSupportMap<backend::ext_oneapi_level_zero> {
static constexpr bool MakePlatform = true;
static constexpr bool MakeDevice = true;
Expand Down
10 changes: 10 additions & 0 deletions sycl/include/sycl/detail/backend_traits_opencl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,16 @@ template <> struct BackendReturn<backend::opencl, kernel> {
using type = cl_kernel;
};

using graph = ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::modifiable>;
template <> struct BackendInput<backend::opencl, graph> {
using type = cl_command_buffer_khr;
};

template <> struct BackendReturn<backend::opencl, graph> {
using type = cl_command_buffer_khr;
};

template <> struct InteropFeatureSupportMap<backend::opencl> {
static constexpr bool MakePlatform = true;
static constexpr bool MakeDevice = true;
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ enum class node_type {
prefetch = 6,
memadvise = 7,
ext_oneapi_barrier = 8,
host_task = 9
host_task = 9,
native_command = 10
};

/// Class representing a node in the graph, returned by command_graph::add().
Expand Down
3 changes: 0 additions & 3 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1934,9 +1934,6 @@ class __SYCL_EXPORT handler {
void(interop_handle)>::value>
ext_codeplay_enqueue_native_command([[maybe_unused]] FuncT &&Func) {
#ifndef __SYCL_DEVICE_ONLY__
throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_codeplay_enqueue_native_command>();
ext_codeplay_enqueue_native_command_impl(Func);
#endif
}
Expand Down
30 changes: 28 additions & 2 deletions sycl/include/sycl/interop_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class interop_handle {
/// interop_handle.
__SYCL_EXPORT backend get_backend() const noexcept;

/// Returns true if command-group is being added to a graph as a node
__SYCL_EXPORT bool has_graph() const noexcept;

/// Receives a SYCL accessor that has been defined as a requirement for the
/// command group, and returns the underlying OpenCL memory object that is
/// used by the SYCL runtime. If the accessor passed as parameter is not part
Expand Down Expand Up @@ -134,6 +137,26 @@ class interop_handle {
#endif
}

using graph = ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::modifiable>;
template <backend Backend = backend::opencl>
backend_return_t<Backend, graph> get_native_graph() const {
#ifndef __SYCL_DEVICE_ONLY__
// TODO: replace the exception thrown below with the SYCL 2020 exception
// with the error code 'errc::backend_mismatch' when those new exceptions
// are ready to be used.
if (Backend != get_backend())
throw exception(make_error_code(errc::invalid),
"Incorrect backend argument was passed");

// C-style cast required to allow various native types
return (backend_return_t<Backend, graph>)getNativeGraph();
#else
// we believe this won't be ever called on device side
return 0;
#endif
}

/// Returns the SYCL application interoperability native backend object
/// associated with the device associated with the SYCL queue that the host
/// task was submitted to. The native backend object returned must be in
Expand Down Expand Up @@ -186,8 +209,9 @@ class interop_handle {
interop_handle(std::vector<ReqToMem> MemObjs,
const std::shared_ptr<detail::queue_impl> &Queue,
const std::shared_ptr<detail::device_impl> &Device,
const std::shared_ptr<detail::context_impl> &Context)
: MQueue(Queue), MDevice(Device), MContext(Context),
const std::shared_ptr<detail::context_impl> &Context,
const ur_exp_command_buffer_handle_t &Graph)
: MQueue(Queue), MDevice(Device), MContext(Context), MGraph(Graph),
MMemObjs(std::move(MemObjs)) {}

template <backend Backend, typename DataT, int Dims>
Expand All @@ -211,10 +235,12 @@ class interop_handle {
getNativeQueue(int32_t &NativeHandleDesc) const;
__SYCL_EXPORT ur_native_handle_t getNativeDevice() const;
__SYCL_EXPORT ur_native_handle_t getNativeContext() const;
__SYCL_EXPORT ur_native_handle_t getNativeGraph() const;

std::shared_ptr<detail::queue_impl> MQueue;
std::shared_ptr<detail::device_impl> MDevice;
std::shared_ptr<detail::context_impl> MContext;
ur_exp_command_buffer_handle_t MGraph;

std::vector<ReqToMem> MMemObjs;
};
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode(
std::shared_ptr<node_impl> Node) {

// Queue which will be used for allocation operations for accessors.
// Will also be used in native commands to return to the user in
// `interop_handler::get_native_queue()` calls
auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{},
sycl::property_list{});
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) {
return node_type::host_task;
case sycl::detail::CGType::ExecCommandBuffer:
return node_type::subgraph;
case sycl::detail::CGType::EnqueueNativeCommand:
return node_type::native_command;
default:
assert(false && "Invalid Graph Node Type");
return node_type::empty;
Expand Down
66 changes: 64 additions & 2 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ class DispatchHostTask {
"Host task submissions should have an associated queue");
interop_handle IH{MReqToMem, HostTask.MQueue,
HostTask.MQueue->getDeviceImplPtr(),
HostTask.MQueue->getContextImplPtr()};
HostTask.MQueue->getContextImplPtr(), nullptr};
// TODO: should all the backends that support this entry point use this
// for host task?
auto &Queue = HostTask.MQueue;
Expand Down Expand Up @@ -2879,6 +2879,19 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue,
return Error;
}

namespace {

struct CommandBufferNativeCommandData {
sycl::interop_handle ih;
std::function<void(interop_handle)> func;
};

void CommandBufferInteropFreeFunc(void *InteropData) {
auto *Data = reinterpret_cast<CommandBufferNativeCommandData *>(InteropData);
return Data->func(Data->ih);
}
} // namespace

ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
assert(MQueue && "Command buffer enqueue should have an associated queue");
// Wait on host command dependencies
Expand Down Expand Up @@ -3045,6 +3058,55 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
MEvent->setSyncPoint(OutSyncPoint);
return UR_RESULT_SUCCESS;
}
case CGType::EnqueueNativeCommand: {
// Queue is created by graph_impl before creating command to submit to
// scheduler.
const AdapterPtr &Adapter = MQueue->getAdapter();
const auto Backend = MQueue->get_device().get_backend();
CGHostTask *HostTask = (CGHostTask *)MCommandGroup.get();

// TODO - Doc this
ur_exp_command_buffer_handle_t ChildCommandBuffer = nullptr;
if (Backend == sycl::backend::ext_oneapi_cuda ||
Backend == sycl::backend::ext_oneapi_hip) {

ur_exp_command_buffer_desc_t Desc{
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC /*stype*/,
nullptr /*pnext*/, false /* updatable */, false /* in-order */,
false /* profilable*/
};
auto ContextImpl = sycl::detail::getSyclObjImpl(MQueue->get_context());
auto DeviceImpl = sycl::detail::getSyclObjImpl(MQueue->get_device());
Adapter->call<sycl::detail::UrApiKind::urCommandBufferCreateExp>(
ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc,
&ChildCommandBuffer);
}

std::vector<interop_handle::ReqToMem> ReqToMem; // TODO work with buffers
interop_handle IH{ReqToMem, HostTask->MQueue,
HostTask->MQueue->getDeviceImplPtr(),
HostTask->MQueue->getContextImplPtr(),
ChildCommandBuffer ? ChildCommandBuffer : MCommandBuffer};
CommandBufferNativeCommandData CustomOpData{
IH, HostTask->MHostTask->MInteropTask};

Adapter->call<UrApiKind::urCommandBufferAppendNativeCommandExp>(
MCommandBuffer, CommandBufferInteropFreeFunc, &CustomOpData,
ChildCommandBuffer, MSyncPointDeps.size(),
MSyncPointDeps.empty() ? nullptr : MSyncPointDeps.data(),
&OutSyncPoint);

if (ChildCommandBuffer) {
ur_result_t Res = Adapter->call_nocheck<
sycl::detail::UrApiKind::urCommandBufferReleaseExp>(
ChildCommandBuffer);
(void)Res;
assert(Res == UR_RESULT_SUCCESS);
}

MEvent->setSyncPoint(OutSyncPoint);
return UR_RESULT_SUCCESS;
}

default:
throw exception(make_error_code(errc::runtime),
Expand Down Expand Up @@ -3416,7 +3478,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
EnqueueNativeCommandData CustomOpData{
interop_handle{ReqToMem, HostTask->MQueue,
HostTask->MQueue->getDeviceImplPtr(),
HostTask->MQueue->getContextImplPtr()},
HostTask->MQueue->getContextImplPtr(), nullptr},
HostTask->MHostTask->MInteropTask};

ur_bool_t NativeCommandSupport = false;
Expand Down
14 changes: 14 additions & 0 deletions sycl/source/interop_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ backend interop_handle::get_backend() const noexcept {
return detail::getImplBackend(MQueue);
}

bool interop_handle::has_graph() const noexcept { return MGraph != nullptr; }

ur_native_handle_t
interop_handle::getNativeMem(detail::Requirement *Req) const {
auto Iter = std::find_if(std::begin(MMemObjs), std::end(MMemObjs),
Expand Down Expand Up @@ -53,5 +55,17 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const {
return MQueue->getNative(NativeHandleDesc);
}

ur_native_handle_t interop_handle::getNativeGraph() const {
if (!MGraph) {
throw exception(make_error_code(errc::invalid),
"Command-Group is not being added as a graph node");
}

auto Adapter = MQueue->getAdapter();
ur_native_handle_t Handle;
Adapter->call<detail::UrApiKind::urCommandBufferGetNativeHandleExp>(MGraph,
&Handle);
return Handle;
}
} // namespace _V1
} // namespace sycl
1 change: 1 addition & 0 deletions sycl/test-e2e/Graph/NativeCommand/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
config.required_features += ['aspect-ext_oneapi_limited_graph']
Loading