From d92c4ff950db7167d1d51551bbc0c31bdb30d04c Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 24 Jul 2024 19:50:40 +0000 Subject: [PATCH] SWDEV-475341 - Fix stream resolution for graphs launches This issue was happening because of incorrect usage of getStream call, if we get the null stream first and then typecast it, and call on getStream again, we lose the advantage of simply passing "nullptr" to indicate NULL stream. Thus we enter the waitActiveStream call and add barriers to sync across streams. Change-Id: I94dc4e3ec927295b9e1ab6dee4b37d7d3e00b0cc --- hipamd/src/hip_graph_internal.cpp | 33 ++++++++++----------- hipamd/src/hip_graph_internal.hpp | 49 +++++++++++++++---------------- 2 files changed, 39 insertions(+), 43 deletions(-) diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index cec2d6821..ee0bd5b65 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -606,7 +606,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St } else { topoOrder[i]->SetStream(hip_stream, graphExec); status = topoOrder[i]->CreateCommand(topoOrder[i]->GetQueue()); - topoOrder[i]->EnqueueCommands(reinterpret_cast(hip_stream)); + topoOrder[i]->EnqueueCommands(hip_stream); } } @@ -617,17 +617,14 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St return status; } -hipError_t GraphExec::Run(hipStream_t stream) { +hipError_t GraphExec::Run(hipStream_t graph_launch_stream) { hipError_t status = hipSuccess; - if (hip::getStream(stream) == nullptr) { - return hipErrorInvalidResourceHandle; - } - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() - : reinterpret_cast(stream); + hip::Stream* launch_stream = hip::getStream(graph_launch_stream); + if (flags_ & hipGraphInstantiateFlagAutoFreeOnLaunch) { if (!topoOrder_.empty()) { - topoOrder_[0]->GetParentGraph()->FreeAllMemory(hip_stream); + topoOrder_[0]->GetParentGraph()->FreeAllMemory(launch_stream); } } @@ -644,31 +641,31 @@ hipError_t GraphExec::Run(hipStream_t stream) { } if (parallelLists_.size() == 1 && - instantiateDeviceId_ == hip_stream->DeviceId()) { + instantiateDeviceId_ == launch_stream->DeviceId()) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { // If the graph has kernels that does device side allocation, during packet capture, heap is // allocated because heap pointer has to be added to the AQL packet, and initialized during // graph launch. static bool initialized = false; if (!initialized && HasHiddenHeap()) { - hip_stream->vdev()->HiddenHeapInit(); + launch_stream->vdev()->HiddenHeapInit(); initialized = true; } } - status = EnqueueGraphWithSingleList(topoOrder_, hip_stream, this); + status = EnqueueGraphWithSingleList(topoOrder_, launch_stream, this); } else if (parallelLists_.size() == 1 && - instantiateDeviceId_ != hip_stream->DeviceId()) { + instantiateDeviceId_ != launch_stream->DeviceId()) { for (int i = 0; i < topoOrder_.size(); i++) { - topoOrder_[i]->SetStream(hip_stream, this); + topoOrder_[i]->SetStream(launch_stream, this); status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); - topoOrder_[i]->EnqueueCommands(stream); + topoOrder_[i]->EnqueueCommands(launch_stream); } } else { - UpdateStream(parallelLists_, hip_stream, this); + UpdateStream(parallelLists_, launch_stream, this); amd::Command* rootCommand = nullptr; amd::Command* endCommand = nullptr; status = FillCommands(parallelLists_, nodeWaitLists_, topoOrder_, clonedGraph_, rootCommand, - endCommand, hip_stream); + endCommand, launch_stream); if (status != hipSuccess) { return status; } @@ -677,7 +674,7 @@ hipError_t GraphExec::Run(hipStream_t stream) { rootCommand->release(); } for (int i = 0; i < topoOrder_.size(); i++) { - topoOrder_[i]->EnqueueCommands(reinterpret_cast(topoOrder_[i]->GetQueue())); + topoOrder_[i]->EnqueueCommands(topoOrder_[i]->GetQueue()); } if (endCommand != nullptr) { endCommand->enqueue(); @@ -685,7 +682,7 @@ hipError_t GraphExec::Run(hipStream_t stream) { } } amd::ScopedLock lock(GraphExecStatusLock_); - GraphExecStatus_[this] = std::make_pair(hip_stream, false); + GraphExecStatus_[this] = std::make_pair(launch_stream, false); ResetQueueIndex(); return status; } diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index b0ce0ba8b..2bbac67db 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -325,7 +325,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute { } virtual hipError_t GetNumParallelStreams(size_t &num) { return hipSuccess; } /// Enqueue commands part of the node - virtual void EnqueueCommands(hipStream_t stream) { + virtual void EnqueueCommands(hip::Stream* stream) { // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker. // Node can be enabled/disabled only for kernel, memcpy and memset nodes. if (!isEnabled_ && @@ -335,8 +335,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute { if (!commands_.empty()) { waitList = commands_[0]->eventWaitList(); } - hip::Stream* hip_stream = hip::getStream(stream); - amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); command->enqueue(); command->release(); return; @@ -757,10 +756,10 @@ struct ChildGraphNode : public GraphNode { bool TopologicalOrder(std::vector& TopoOrder) override { return childGraph_->TopologicalOrder(TopoOrder); } - void EnqueueCommands(hipStream_t stream) override { + void EnqueueCommands(hip::Stream* stream) override { if (graphCaptureStatus_) { hipError_t status = - EnqueueGraphWithSingleList(childGraphNodeOrder_, reinterpret_cast(stream)); + EnqueueGraphWithSingleList(childGraphNodeOrder_, stream); } else { // enqueue child graph start command if (startCommand_ != nullptr) { @@ -822,7 +821,7 @@ class GraphKernelNode : public GraphNode { size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; } size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; } bool HasHiddenHeap() const { return hasHiddenHeap_; } - void EnqueueCommands(hipStream_t stream) override { + void EnqueueCommands(hip::Stream* stream) override { // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker. // Node can be enabled/disabled only for kernel, memcpy and memset nodes. if (!isEnabled_) { @@ -830,8 +829,7 @@ class GraphKernelNode : public GraphNode { if (!commands_.empty()) { waitList = commands_[0]->eventWaitList(); } - hip::Stream* hip_stream = hip::getStream(stream); - amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); command->enqueue(); command->release(); return; @@ -845,6 +843,7 @@ class GraphKernelNode : public GraphNode { command->release(); } } + void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override { out << "["; out << "style"; @@ -1284,12 +1283,12 @@ class GraphMemcpyNode : public GraphNode { return status; } - virtual void EnqueueCommands(hipStream_t stream) override { + virtual void EnqueueCommands(hip::Stream* stream) override { if ( (copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) && isEnabled_ && IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) { ihipHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr, copyParams_.extent.width * copyParams_.extent.height * - copyParams_.extent.depth, *hip::getStream(stream)); + copyParams_.extent.depth, *stream); return; } GraphNode::EnqueueCommands(stream); @@ -1438,7 +1437,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { return status; } - virtual void EnqueueCommands(hipStream_t stream) override { + virtual void EnqueueCommands(hip::Stream* stream) override { bool isH2H = false; if ((kind_ == hipMemcpyHostToHost || kind_ == hipMemcpyDefault) && IsHtoHMemcpy(dst_, src_)) { isH2H = true; @@ -1451,14 +1450,13 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { if (isEnabled_) { //HtoH if (isH2H) { - ihipHtoHMemcpy(dst_, src_, count_, *hip::getStream(stream)); + ihipHtoHMemcpy(dst_, src_, count_, *stream); return; } amd::Command* command = commands_[0]; amd::HostQueue* cmdQueue = command->queue(); - hip::Stream* hip_stream = hip::getStream(stream); - if (cmdQueue == hip_stream) { + if (cmdQueue == stream) { command->enqueue(); command->release(); return; @@ -1466,7 +1464,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { amd::Command::EventWaitList waitList; amd::Command* depdentMarker = nullptr; - amd::Command* cmd = hip_stream->getLastQueuedCommand(true); + amd::Command* cmd = stream->getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); amd::Command* depdentMarker = new amd::Marker(*cmdQueue, true, waitList); @@ -1483,7 +1481,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { if (cmd != nullptr) { waitList.clear(); waitList.push_back(cmd); - amd::Command* depdentMarker = new amd::Marker(*hip_stream, true, waitList); + amd::Command* depdentMarker = new amd::Marker(*stream, true, waitList); if (depdentMarker != nullptr) { depdentMarker->enqueue(); // Make sure future commands of queue synced with command depdentMarker->release(); @@ -1492,8 +1490,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { } } else { amd::Command::EventWaitList waitList; - hip::Stream* hip_stream = hip::getStream(stream); - amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); command->enqueue(); command->release(); } @@ -1973,11 +1970,12 @@ class GraphEventRecordNode : public GraphNode { return status; } - void EnqueueCommands(hipStream_t stream) override { + void EnqueueCommands(hip::Stream* stream) override { if (!commands_.empty()) { hip::Event* e = reinterpret_cast(event_); // command release during enqueueRecordCommand - hipError_t status = e->enqueueRecordCommand(stream, commands_[0], true); + hipError_t status = e->enqueueRecordCommand( + reinterpret_cast(stream), commands_[0], true); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Enqueue event record command failed for node %p - status %d", this, @@ -2026,10 +2024,11 @@ class GraphEventWaitNode : public GraphNode { return status; } - void EnqueueCommands(hipStream_t stream) override { + void EnqueueCommands(hip::Stream* stream) override { if (!commands_.empty()) { hip::Event* e = reinterpret_cast(event_); - hipError_t status = e->enqueueStreamWaitCommand(stream, commands_[0]); + hipError_t status = + e->enqueueStreamWaitCommand(reinterpret_cast(stream), commands_[0]); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Enqueue stream wait command failed for node %p - status %d", this, @@ -2087,7 +2086,7 @@ class GraphHostNode : public GraphNode { NodeParams->fn(NodeParams->userData); } - void EnqueueCommands(hipStream_t stream) override { + void EnqueueCommands(hip::Stream* stream) override { if (!commands_.empty()) { if (!commands_[0]->setCallback(CL_COMPLETE, GraphHostNode::Callback, &NodeParams_)) { ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during setCallback"); @@ -2411,7 +2410,7 @@ class GraphDrvMemcpyNode : public GraphNode { return status; } - void EnqueueCommands(hipStream_t stream) override { + void EnqueueCommands(hip::Stream* stream) override { bool isHtoH = false; if(copyParams_.srcMemoryType == hipMemoryTypeHost && copyParams_.dstMemoryType == hipMemoryTypeHost && @@ -2421,7 +2420,7 @@ class GraphDrvMemcpyNode : public GraphNode { if (isEnabled_ && isHtoH) { ihipHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost, copyParams_.WidthInBytes * copyParams_.Height * - copyParams_.Depth, *hip::getStream(stream)); + copyParams_.Depth, *stream); return; } GraphNode::EnqueueCommands(stream);