diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp
index c48e1828e..ae98fc67e 100644
--- a/hipamd/src/hip_graph.cpp
+++ b/hipamd/src/hip_graph.cpp
@@ -95,12 +95,8 @@ hipError_t ihipGraphAddKernelNode(hip::GraphNode** pGraphNode, hip::Graph* graph
   if (!hip::Graph::isGraphValid(graph)) {
     return hipErrorInvalidValue;
   }
-  hipFunction_t func = hip::GraphKernelNode::getFunc(*pNodeParams, ihipGetDevice());
-  if (!func) {
-    return hipErrorInvalidDeviceFunction;
-  }
-  hipError_t status =
-      hip::GraphKernelNode::validateKernelParams(pNodeParams, func, ihipGetDevice());
+
+  hipError_t status = hip::GraphKernelNode::validateKernelParams(pNodeParams);
   if (hipSuccess != status) {
     return status;
   }
diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp
index ee0bd5b65..ebcf4c91e 100644
--- a/hipamd/src/hip_graph_internal.cpp
+++ b/hipamd/src/hip_graph_internal.cpp
@@ -360,7 +360,8 @@ void GetKernelArgSizeForGraph(std::vector>& parallelLists,
   // arg size required for all graph kernel nodes to allocate
   for (const auto& list : parallelLists) {
     for (auto& node : list) {
-      if (node->GetType() == hipGraphNodeTypeKernel) {
+      if (node->GetType() == hipGraphNodeTypeKernel &&
+          !reinterpret_cast(node)->HasHiddenHeap()) {
         kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize();
       } else if (node->GetType() == hipGraphNodeTypeGraph) {
         auto& childParallelLists = reinterpret_cast(node)->GetParallelLists();
@@ -376,13 +377,8 @@ hipError_t AllocKernelArgForGraph(std::vector& topoOrder, hip::Stream
                                   hip::GraphExec* graphExec) {
   hipError_t status = hipSuccess;
   for (auto& node : topoOrder) {
-    if (node->GetType() == hipGraphNodeTypeKernel) {
-      // Check if graph requires hidden heap and set as part of graphExec param.
-      static bool initialized = false;
-      if (!initialized && reinterpret_cast(node)->HasHiddenHeap()) {
-        graphExec->SetHiddenHeap();
-        initialized = true;
-      }
+    if (node->GetType() == hipGraphNodeTypeKernel &&
+        !reinterpret_cast(node)->HasHiddenHeap()) {
       auto kernelNode = reinterpret_cast(node);
       // From the kernel pool allocate the kern arg size required for the current kernel node.
       address kernArgOffset = nullptr;
@@ -597,7 +593,8 @@ hipError_t EnqueueGraphWithSingleList(std::vector& topoOrder, hip::St
     accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
   }
   for (int i = 0; i < topoOrder.size(); i++) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel &&
+        !reinterpret_cast(topoOrder[i])->HasHiddenHeap()) {
       if (topoOrder[i]->GetEnabled()) {
         hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
                                               topoOrder[i]->GetKernelName(),
@@ -642,16 +639,6 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
 
   if (parallelLists_.size() == 1 &&
       instantiateDeviceId_ == launch_stream->DeviceId()) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
-      // If the graph has kernels that does device side allocation, during packet capture, heap is
-      // allocated because heap pointer has to be added to the AQL packet, and initialized during
-      // graph launch.
-      static bool initialized = false;
-      if (!initialized && HasHiddenHeap()) {
-        launch_stream->vdev()->HiddenHeapInit();
-        initialized = true;
-      }
-    }
     status = EnqueueGraphWithSingleList(topoOrder_, launch_stream, this);
   } else if (parallelLists_.size() == 1 &&
              instantiateDeviceId_ != launch_stream->DeviceId()) {
diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp
index 2bbac67db..f70e0bfd3 100644
--- a/hipamd/src/hip_graph_internal.hpp
+++ b/hipamd/src/hip_graph_internal.hpp
@@ -568,7 +568,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
   uint32_t kernarg_graph_cur_offset_ = 0;
   uint32_t kernarg_graph_size_ = 128 * Ki;
   int instantiateDeviceId_ = -1;
-  bool hasHiddenHeap_ = false;  //!< Kernel has hidden heap(device side allocation)
 
  public:
   GraphExec(std::vector& topoOrder, std::vector>& lists,
@@ -619,10 +618,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
     }
     return clonedNode;
   }
-  // returns if graph has nodes that require hidden heap/not
-  bool HasHiddenHeap() const { return hasHiddenHeap_; }
-  // Graph has nodes that require hidden heap.
-  void SetHiddenHeap() { hasHiddenHeap_ = true; }
 
   address allocKernArg(size_t size, size_t alignment) {
     assert(alignment != 0);
@@ -821,28 +816,6 @@ class GraphKernelNode : public GraphNode {
   size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
   size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
   bool HasHiddenHeap() const { return hasHiddenHeap_; }
-  void EnqueueCommands(hip::Stream* stream) override {
-    // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
-    // Node can be enabled/disabled only for kernel, memcpy and memset nodes.
-    if (!isEnabled_) {
-      amd::Command::EventWaitList waitList;
-      if (!commands_.empty()) {
-        waitList = commands_[0]->eventWaitList();
-      }
-      amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList);
-      command->enqueue();
-      command->release();
-      return;
-    }
-    for (auto& command : commands_) {
-      hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
-      hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
-      amd::Kernel* kernel = function->kernel();
-      amd::ScopedLock lock(function->dflock_);
-      command->enqueue();
-      command->release();
-    }
-  }
 
   void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
     out << "[";
@@ -1076,15 +1049,9 @@ class GraphKernelNode : public GraphNode {
   }
 
   hipError_t CreateCommand(hip::Stream* stream) override {
-    int devID = hip::getDeviceID(stream->context());
-    hipFunction_t func = getFunc(kernelParams_, devID);
-    if (!func) {
-      return hipErrorInvalidDeviceFunction;
-    }
-    hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
-    amd::Kernel* kernel = function->kernel();
-    amd::ScopedLock lock(function->dflock_);
-    hipError_t status = validateKernelParams(&kernelParams_, func, devID);
+    hipFunction_t func = nullptr;
+    hipError_t status = validateKernelParams(&kernelParams_, &func,
+                                             stream ? hip::getDeviceID(stream->context()) : -1);
     if (hipSuccess != status) {
       return status;
     }
@@ -1108,12 +1075,8 @@ class GraphKernelNode : public GraphNode {
   void GetParams(hipKernelNodeParams* params) { *params = kernelParams_; }
   hipError_t SetParams(const hipKernelNodeParams* params) {
-    hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
-    if (!func) {
-      return hipErrorInvalidDeviceFunction;
-    }
     // updates kernel params
-    hipError_t status = validateKernelParams(params, func, ihipGetDevice());
+    hipError_t status = validateKernelParams(params);
     if (hipSuccess != status) {
       ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to validateKernelParams");
       return status;
     }
@@ -1229,7 +1192,13 @@ class GraphKernelNode : public GraphNode {
   }
 
   static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
-                                         hipFunction_t func, int devId) {
+                                         hipFunction_t* ptrFunc = nullptr, int devId = -1) {
+    devId = devId == -1 ? ihipGetDevice() : devId;
+    hipFunction_t func = getFunc(*pNodeParams, devId);
+    if (!func) {
+      return hipErrorInvalidDeviceFunction;
+    }
+
     size_t globalWorkSizeX = static_cast(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
     size_t globalWorkSizeY = static_cast(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
     size_t globalWorkSizeZ = static_cast(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;
@@ -1242,6 +1211,8 @@ class GraphKernelNode : public GraphNode {
     if (status != hipSuccess) {
       return status;
     }
+
+    if (ptrFunc) *ptrFunc = func;
     return hipSuccess;
   }
 };
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index 223cb780e..5fbfb1dfd 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -1310,8 +1310,7 @@ class VirtualDevice : public amd::HeapObject {
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
 
-  //! Init hidden heap for device memory allocations
-  virtual void HiddenHeapInit() = 0;
+  //! Dispatch captured AQL packet
   virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
@@ -2103,9 +2102,7 @@ class Device : public RuntimeObject {
   static Memory* p2p_stage_;  //!< Staging resources
   std::vector enabled_p2p_devices_;  //!< List of user enabled P2P devices for this device
 
-  std::once_flag heap_initialized_;  //!< Heap buffer initialization flag
-  std::once_flag heap_allocated_;  //!< Heap buffer allocation flag
-
+  std::once_flag heap_initialized_;  //!< Heap buffer initialization flag
   device::Memory* heap_buffer_;  //!< Preallocated heap buffer for memory allocations on device
 
   amd::Memory* arena_mem_obj_;  //!< Arena memory object
diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp
index 32a319e8b..34824d047 100644
--- a/rocclr/device/pal/palvirtual.hpp
+++ b/rocclr/device/pal/palvirtual.hpp
@@ -356,8 +356,6 @@ class VirtualGPU : public device::VirtualDevice {
 
   bool isFenceDirty() const { return false; }
 
-  void HiddenHeapInit() {}
-
   inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
                                 amd::AccumulateCommand* vcmd = nullptr) {
     vcmd->addKernelName(kernelName);
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index ebcc9e6c3..791fd98de 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -2621,7 +2621,8 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem
   address parameters = captureArguments(kernels_[blitType]);
   result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
   releaseArguments(parameters);
-  gpu().releaseGpuMemoryFence();
+  synchronize();
+
   return result;
 }
 
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 2785d1f0a..5e3b09fa2 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -3507,7 +3507,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer
 
 // ================================================================================================
 void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
-  auto HeapAllocOnly = [this, &gpu]() -> bool {
+  auto HeapAllocZeroOut = [this, &gpu]() -> bool {
     // Allocate initial heap for device memory allocator
     static constexpr size_t HeapBufferSize = 128 * Ki;
     heap_buffer_ = createMemory(HeapBufferSize);
@@ -3519,22 +3519,12 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
       LogError("Heap buffer allocation failed!");
       return false;
     }
-    return true;
-  };
-  std::call_once(heap_allocated_, HeapAllocOnly);
-}
-
-// ================================================================================================
-void Device::HiddenHeapInit(const VirtualGPU& gpu) {
-  auto HeapZeroOut = [this, &gpu]() -> bool {
-    static constexpr size_t HeapBufferSize = 128 * Ki;
-    bool result = static_cast(gpu.blitMgr())
-                      .initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize,
-                                initial_heap_size_ / (2 * Mi));
+    bool result = static_cast(gpu.blitMgr()).initHeap(
+        heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));
     return result;
   };
-  std::call_once(heap_initialized_, HeapZeroOut);
+  std::call_once(heap_initialized_, HeapAllocZeroOut);
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 66f737601..2ff992b28 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -589,8 +589,7 @@ class Device : public NullDevice {
 
   //! Allocates hidden heap for device memory allocations
   void HiddenHeapAlloc(const VirtualGPU& gpu);
-  //! Init hidden heap for device memory allocations
-  void HiddenHeapInit(const VirtualGPU& gpu);
+
   uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const;
   void resetSDMAMask(const device::BlitManager* handle) const;
   void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 0605bdd0e..ee5cacd4b 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -2959,8 +2959,6 @@ static inline void nontemporalMemcpy(
 #endif
 }
 
-void VirtualGPU::HiddenHeapInit() { const_cast(dev()).HiddenHeapInit(*this); }
-
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                       const_address parameters, void* eventHandle,
@@ -3015,7 +3013,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
   amd::Memory* const* memories =
       reinterpret_cast(parameters + kernelParams.memoryObjOffset());
 
-  bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
+
   for (int j = 0; j < iteration; j++) {
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
@@ -3142,10 +3140,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
             const_cast(dev()).HiddenHeapAlloc(*this);
           }
           if (dev().HeapBuffer() != nullptr) {
-            // Initialize hidden heap buffer
-            if (!isGraphCapture) {
-              const_cast(dev()).HiddenHeapInit(*this);
-            }
             // Add heap pointer to the code
             size_t heap_ptr = static_cast(dev().HeapBuffer()->virtualAddress());
             WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
@@ -3228,6 +3222,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     }
 
     address argBuffer = hidden_arguments;
+    bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
     size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());
 
     // Find all parameters for the current kernel
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 053c9751b..abb67689b 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -419,8 +419,6 @@ class VirtualGPU : public device::VirtualDevice {
   void* allocKernArg(size_t size, size_t alignment);
 
   bool isFenceDirty() const { return fence_dirty_; }
-  void HiddenHeapInit();
-
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
   // } roc OpenCL integration