diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp
index c48e1828e..ae98fc67e 100644
--- a/hipamd/src/hip_graph.cpp
+++ b/hipamd/src/hip_graph.cpp
@@ -95,12 +95,8 @@ hipError_t ihipGraphAddKernelNode(hip::GraphNode** pGraphNode, hip::Graph* graph
   if (!hip::Graph::isGraphValid(graph)) {
     return hipErrorInvalidValue;
   }
-  hipFunction_t func = hip::GraphKernelNode::getFunc(*pNodeParams, ihipGetDevice());
-  if (!func) {
-    return hipErrorInvalidDeviceFunction;
-  }
-  hipError_t status =
-      hip::GraphKernelNode::validateKernelParams(pNodeParams, func, ihipGetDevice());
+
+  hipError_t status = hip::GraphKernelNode::validateKernelParams(pNodeParams);
   if (hipSuccess != status) {
     return status;
   }
diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp
index ee0bd5b65..ebcf4c91e 100644
--- a/hipamd/src/hip_graph_internal.cpp
+++ b/hipamd/src/hip_graph_internal.cpp
@@ -360,7 +360,8 @@ void GetKernelArgSizeForGraph(std::vector>& parallelLists,
   // arg size required for all graph kernel nodes to allocate
   for (const auto& list : parallelLists) {
     for (auto& node : list) {
-      if (node->GetType() == hipGraphNodeTypeKernel) {
+      if (node->GetType() == hipGraphNodeTypeKernel &&
+          !reinterpret_cast(node)->HasHiddenHeap()) {
         kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize();
       } else if (node->GetType() == hipGraphNodeTypeGraph) {
         auto& childParallelLists = reinterpret_cast(node)->GetParallelLists();
@@ -376,13 +377,8 @@ hipError_t AllocKernelArgForGraph(std::vector& topoOrder, hip::Stream
                                   hip::GraphExec* graphExec) {
   hipError_t status = hipSuccess;
   for (auto& node : topoOrder) {
-    if (node->GetType() == hipGraphNodeTypeKernel) {
-      // Check if graph requires hidden heap and set as part of graphExec param.
-      static bool initialized = false;
-      if (!initialized && reinterpret_cast(node)->HasHiddenHeap()) {
-        graphExec->SetHiddenHeap();
-        initialized = true;
-      }
+    if (node->GetType() == hipGraphNodeTypeKernel &&
+        !reinterpret_cast(node)->HasHiddenHeap()) {
       auto kernelNode = reinterpret_cast(node);
       // From the kernel pool allocate the kern arg size required for the current kernel node.
       address kernArgOffset = nullptr;
@@ -597,7 +593,8 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St
     accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
   }
   for (int i = 0; i < topoOrder.size(); i++) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel &&
+        !reinterpret_cast(topoOrder[i])->HasHiddenHeap()) {
       if (topoOrder[i]->GetEnabled()) {
         hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
                                               topoOrder[i]->GetKernelName(),
@@ -642,16 +639,6 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
 
   if (parallelLists_.size() == 1 &&
       instantiateDeviceId_ == launch_stream->DeviceId()) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-      // If the graph has kernels that does device side allocation, during packet capture, heap is
-      // allocated because heap pointer has to be added to the AQL packet, and initialized during
-      // graph launch.
-      static bool initialized = false;
-      if (!initialized && HasHiddenHeap()) {
-        launch_stream->vdev()->HiddenHeapInit();
-        initialized = true;
-      }
-    }
     status = EnqueueGraphWithSingleList(topoOrder_, launch_stream, this);
   } else if (parallelLists_.size() == 1 &&
              instantiateDeviceId_ != launch_stream->DeviceId()) {
diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp
index 2bbac67db..f70e0bfd3 100644
--- a/hipamd/src/hip_graph_internal.hpp
+++ b/hipamd/src/hip_graph_internal.hpp
@@ -568,7 +568,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
   uint32_t kernarg_graph_cur_offset_ = 0;
   uint32_t kernarg_graph_size_ = 128 * Ki;
   int instantiateDeviceId_ = -1;
-  bool hasHiddenHeap_ = false;  //!< Kernel has hidden heap(device side allocation)
 
  public:
   GraphExec(std::vector& topoOrder, std::vector>& lists,
@@ -619,10 +618,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
     }
     return clonedNode;
   }
-  // returns if graph has nodes that require hidden heap/not
-  bool HasHiddenHeap() const { return hasHiddenHeap_; }
-  // Graph has nodes that require hidden heap.
-  void SetHiddenHeap() { hasHiddenHeap_ = true; }
 
   address allocKernArg(size_t size, size_t alignment) {
     assert(alignment != 0);
@@ -821,28 +816,6 @@ class GraphKernelNode : public GraphNode {
   size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
   size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
   bool HasHiddenHeap() const { return hasHiddenHeap_; }
-  void EnqueueCommands(hip::Stream* stream) override {
-    // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
-    // Node can be enabled/disabled only for kernel, memcpy and memset nodes.
-    if (!isEnabled_) {
-      amd::Command::EventWaitList waitList;
-      if (!commands_.empty()) {
-        waitList = commands_[0]->eventWaitList();
-      }
-      amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList);
-      command->enqueue();
-      command->release();
-      return;
-    }
-    for (auto& command : commands_) {
-      hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
-      hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
-      amd::Kernel* kernel = function->kernel();
-      amd::ScopedLock lock(function->dflock_);
-      command->enqueue();
-      command->release();
-    }
-  }
 
   void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
     out << "[";
@@ -1076,15 +1049,9 @@ class GraphKernelNode : public GraphNode {
   }
 
   hipError_t CreateCommand(hip::Stream* stream) override {
-    int devID = hip::getDeviceID(stream->context());
-    hipFunction_t func = getFunc(kernelParams_, devID);
-    if (!func) {
-      return hipErrorInvalidDeviceFunction;
-    }
-    hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
-    amd::Kernel* kernel = function->kernel();
-    amd::ScopedLock lock(function->dflock_);
-    hipError_t status = validateKernelParams(&kernelParams_, func, devID);
+    hipFunction_t func = nullptr;
+    hipError_t status = validateKernelParams(&kernelParams_, &func,
+                                             stream ? hip::getDeviceID(stream->context()) : -1);
     if (hipSuccess != status) {
       return status;
     }
@@ -1108,12 +1075,8 @@ class GraphKernelNode : public GraphNode {
   void GetParams(hipKernelNodeParams* params) { *params = kernelParams_; }
   hipError_t SetParams(const hipKernelNodeParams* params) {
-    hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
-    if (!func) {
-      return hipErrorInvalidDeviceFunction;
-    }
     // updates kernel params
-    hipError_t status = validateKernelParams(params, func, ihipGetDevice());
+    hipError_t status = validateKernelParams(params);
     if (hipSuccess != status) {
       ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to validateKernelParams");
       return status;
     }
@@ -1229,7 +1192,13 @@ class GraphKernelNode : public GraphNode {
   }
 
   static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
-                                         hipFunction_t func, int devId) {
+                                         hipFunction_t* ptrFunc = nullptr, int devId = -1) {
+    devId = devId == -1 ? ihipGetDevice() : devId;
+    hipFunction_t func = getFunc(*pNodeParams, devId);
+    if (!func) {
+      return hipErrorInvalidDeviceFunction;
+    }
+
     size_t globalWorkSizeX = static_cast(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
     size_t globalWorkSizeY = static_cast(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
     size_t globalWorkSizeZ = static_cast(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;
@@ -1242,6 +1211,8 @@ class GraphKernelNode : public GraphNode {
     if (status != hipSuccess) {
       return status;
     }
+
+    if (ptrFunc) *ptrFunc = func;
     return hipSuccess;
   }
 };
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index 223cb780e..5fbfb1dfd 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -1310,8 +1310,7 @@ class VirtualDevice : public amd::HeapObject {
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
 
-  //! Init hidden heap for device memory allocations
-  virtual void HiddenHeapInit() = 0;
+  //! Dispatch captured AQL packet
   virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
@@ -2103,9 +2102,7 @@ class Device : public RuntimeObject {
   static Memory* p2p_stage_;  //!< Staging resources
   std::vector enabled_p2p_devices_;  //!< List of user enabled P2P devices for this device
 
-  std::once_flag heap_initialized_;  //!< Heap buffer initialization flag
-  std::once_flag heap_allocated_;  //!< Heap buffer allocation flag
-
+  std::once_flag heap_initialized_;  //!< Heap buffer initialization flag
   device::Memory* heap_buffer_;  //!< Preallocated heap buffer for memory allocations on device
 
   amd::Memory* arena_mem_obj_;  //!< Arena memory object
diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp
index 32a319e8b..34824d047 100644
--- a/rocclr/device/pal/palvirtual.hpp
+++ b/rocclr/device/pal/palvirtual.hpp
@@ -356,8 +356,6 @@ class VirtualGPU : public device::VirtualDevice {
 
   bool isFenceDirty() const { return false; }
 
-  void HiddenHeapInit() {}
-
   inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
                                 amd::AccumulateCommand* vcmd = nullptr) {
     vcmd->addKernelName(kernelName);
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index ebcc9e6c3..791fd98de 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -2621,7 +2621,8 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem
   address parameters = captureArguments(kernels_[blitType]);
   result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
   releaseArguments(parameters);
-  gpu().releaseGpuMemoryFence();
+  synchronize();
+
   return result;
 }
 
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 2785d1f0a..5e3b09fa2 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -3507,7 +3507,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer
 
 // ================================================================================================
 void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
-  auto HeapAllocOnly = [this, &gpu]() -> bool {
+  auto HeapAllocZeroOut = [this, &gpu]() -> bool {
     // Allocate initial heap for device memory allocator
     static constexpr size_t HeapBufferSize = 128 * Ki;
     heap_buffer_ = createMemory(HeapBufferSize);
@@ -3519,22 +3519,12 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
       LogError("Heap buffer allocation failed!");
       return false;
     }
-    return true;
-  };
-  std::call_once(heap_allocated_, HeapAllocOnly);
-}
-
-// ================================================================================================
-void Device::HiddenHeapInit(const VirtualGPU& gpu) {
-  auto HeapZeroOut = [this, &gpu]() -> bool {
-    static constexpr size_t HeapBufferSize = 128 * Ki;
-    bool result = static_cast(gpu.blitMgr())
-                      .initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize,
-                                initial_heap_size_ / (2 * Mi));
+    bool result = static_cast(gpu.blitMgr()).initHeap(
+        heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));
     return result;
   };
-  std::call_once(heap_initialized_, HeapZeroOut);
+  std::call_once(heap_initialized_, HeapAllocZeroOut);
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 66f737601..2ff992b28 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -589,8 +589,7 @@ class Device : public NullDevice {
 
   //! Allocates hidden heap for device memory allocations
   void HiddenHeapAlloc(const VirtualGPU& gpu);
-  //! Init hidden heap for device memory allocations
-  void HiddenHeapInit(const VirtualGPU& gpu);
+
   uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const;
   void resetSDMAMask(const device::BlitManager* handle) const;
   void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 0605bdd0e..ee5cacd4b 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -2959,8 +2959,6 @@ static inline void nontemporalMemcpy(
 #endif
 }
 
-void VirtualGPU::HiddenHeapInit() { const_cast(dev()).HiddenHeapInit(*this); }
-
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                       const_address parameters, void* eventHandle,
@@ -3015,7 +3013,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
   amd::Memory* const* memories =
       reinterpret_cast(parameters + kernelParams.memoryObjOffset());
 
-  bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
+
   for (int j = 0; j < iteration; j++) {
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
@@ -3142,10 +3140,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
             const_cast(dev()).HiddenHeapAlloc(*this);
           }
           if (dev().HeapBuffer() != nullptr) {
-            // Initialize hidden heap buffer
-            if (!isGraphCapture) {
-              const_cast(dev()).HiddenHeapInit(*this);
-            }
             // Add heap pointer to the code
             size_t heap_ptr = static_cast(dev().HeapBuffer()->virtualAddress());
             WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
@@ -3228,6 +3222,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     }
 
     address argBuffer = hidden_arguments;
+    bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
     size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());
 
     // Find all parameters for the current kernel
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 053c9751b..abb67689b 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -419,8 +419,6 @@ class VirtualGPU : public device::VirtualDevice {
   void* allocKernArg(size_t size, size_t alignment);
 
   bool isFenceDirty() const { return fence_dirty_; }
-  void HiddenHeapInit();
-
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
   // } roc OpenCL integration