Skip to content

Commit

Permalink
SWDEV-475341 - Revert extra changes
Browse files Browse the repository at this point in the history
- Revert "SWDEV-454247 - Fix graph multi threading issue"
- Revert "SWDEV-467102 - Hidden heap init for graph capture"

Change-Id: I0cccdf913210f368089290e0ed97594264f55e0a
  • Loading branch information
saleelk committed Jul 31, 2024
1 parent d92c4ff commit dd7f957
Show file tree
Hide file tree
Showing 10 changed files with 32 additions and 100 deletions.
8 changes: 2 additions & 6 deletions hipamd/src/hip_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,8 @@ hipError_t ihipGraphAddKernelNode(hip::GraphNode** pGraphNode, hip::Graph* graph
if (!hip::Graph::isGraphValid(graph)) {
return hipErrorInvalidValue;
}
hipFunction_t func = hip::GraphKernelNode::getFunc(*pNodeParams, ihipGetDevice());
if (!func) {
return hipErrorInvalidDeviceFunction;
}
hipError_t status =
hip::GraphKernelNode::validateKernelParams(pNodeParams, func, ihipGetDevice());

hipError_t status = hip::GraphKernelNode::validateKernelParams(pNodeParams);
if (hipSuccess != status) {
return status;
}
Expand Down
25 changes: 6 additions & 19 deletions hipamd/src/hip_graph_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
// arg size required for all graph kernel nodes to allocate
for (const auto& list : parallelLists) {
for (auto& node : list) {
if (node->GetType() == hipGraphNodeTypeKernel) {
if (node->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto& childParallelLists = reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
Expand All @@ -376,13 +377,8 @@ hipError_t AllocKernelArgForGraph(std::vector<hip::Node>& topoOrder, hip::Stream
hip::GraphExec* graphExec) {
hipError_t status = hipSuccess;
for (auto& node : topoOrder) {
if (node->GetType() == hipGraphNodeTypeKernel) {
// Check if graph requires hidden heap and set as part of graphExec param.
static bool initialized = false;
if (!initialized && reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
graphExec->SetHiddenHeap();
initialized = true;
}
if (node->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
// From the kernel pool allocate the kern arg size required for the current kernel node.
address kernArgOffset = nullptr;
Expand Down Expand Up @@ -597,7 +593,8 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
}
for (int i = 0; i < topoOrder.size(); i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(topoOrder[i])->HasHiddenHeap()) {
if (topoOrder[i]->GetEnabled()) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
topoOrder[i]->GetKernelName(),
Expand Down Expand Up @@ -642,16 +639,6 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {

if (parallelLists_.size() == 1 &&
instantiateDeviceId_ == launch_stream->DeviceId()) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
// If the graph has kernels that does device side allocation, during packet capture, heap is
// allocated because heap pointer has to be added to the AQL packet, and initialized during
// graph launch.
static bool initialized = false;
if (!initialized && HasHiddenHeap()) {
launch_stream->vdev()->HiddenHeapInit();
initialized = true;
}
}
status = EnqueueGraphWithSingleList(topoOrder_, launch_stream, this);
} else if (parallelLists_.size() == 1 &&
instantiateDeviceId_ != launch_stream->DeviceId()) {
Expand Down
55 changes: 13 additions & 42 deletions hipamd/src/hip_graph_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
uint32_t kernarg_graph_cur_offset_ = 0;
uint32_t kernarg_graph_size_ = 128 * Ki;
int instantiateDeviceId_ = -1;
bool hasHiddenHeap_ = false; //!< Kernel has hidden heap(device side allocation)

public:
GraphExec(std::vector<Node>& topoOrder, std::vector<std::vector<Node>>& lists,
Expand Down Expand Up @@ -619,10 +618,6 @@ struct GraphExec : public amd::ReferenceCountedObject {
}
return clonedNode;
}
// returns if graph has nodes that require hidden heap/not
bool HasHiddenHeap() const { return hasHiddenHeap_; }
// Graph has nodes that require hidden heap.
void SetHiddenHeap() { hasHiddenHeap_ = true; }

address allocKernArg(size_t size, size_t alignment) {
assert(alignment != 0);
Expand Down Expand Up @@ -821,28 +816,6 @@ class GraphKernelNode : public GraphNode {
size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
bool HasHiddenHeap() const { return hasHiddenHeap_; }
void EnqueueCommands(hip::Stream* stream) override {
// If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
// Node can be enabled/disabled only for kernel, memcpy and memset nodes.
if (!isEnabled_) {
amd::Command::EventWaitList waitList;
if (!commands_.empty()) {
waitList = commands_[0]->eventWaitList();
}
amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList);
command->enqueue();
command->release();
return;
}
for (auto& command : commands_) {
hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
amd::Kernel* kernel = function->kernel();
amd::ScopedLock lock(function->dflock_);
command->enqueue();
command->release();
}
}

void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
out << "[";
Expand Down Expand Up @@ -1076,15 +1049,9 @@ class GraphKernelNode : public GraphNode {
}

hipError_t CreateCommand(hip::Stream* stream) override {
int devID = hip::getDeviceID(stream->context());
hipFunction_t func = getFunc(kernelParams_, devID);
if (!func) {
return hipErrorInvalidDeviceFunction;
}
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
amd::Kernel* kernel = function->kernel();
amd::ScopedLock lock(function->dflock_);
hipError_t status = validateKernelParams(&kernelParams_, func, devID);
hipFunction_t func = nullptr;
hipError_t status = validateKernelParams(&kernelParams_, &func,
stream ? hip::getDeviceID(stream->context()) : -1);
if (hipSuccess != status) {
return status;
}
Expand All @@ -1108,12 +1075,8 @@ class GraphKernelNode : public GraphNode {
void GetParams(hipKernelNodeParams* params) { *params = kernelParams_; }

hipError_t SetParams(const hipKernelNodeParams* params) {
hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
if (!func) {
return hipErrorInvalidDeviceFunction;
}
// updates kernel params
hipError_t status = validateKernelParams(params, func, ihipGetDevice());
hipError_t status = validateKernelParams(params);
if (hipSuccess != status) {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to validateKernelParams");
return status;
Expand Down Expand Up @@ -1229,7 +1192,13 @@ class GraphKernelNode : public GraphNode {
}

static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
hipFunction_t func, int devId) {
hipFunction_t* ptrFunc = nullptr, int devId = -1) {
devId = devId == -1 ? ihipGetDevice() : devId;
hipFunction_t func = getFunc(*pNodeParams, devId);
if (!func) {
return hipErrorInvalidDeviceFunction;
}

size_t globalWorkSizeX = static_cast<size_t>(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
size_t globalWorkSizeY = static_cast<size_t>(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
size_t globalWorkSizeZ = static_cast<size_t>(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;
Expand All @@ -1242,6 +1211,8 @@ class GraphKernelNode : public GraphNode {
if (status != hipSuccess) {
return status;
}

if (ptrFunc) *ptrFunc = func;
return hipSuccess;
}
};
Expand Down
7 changes: 2 additions & 5 deletions rocclr/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1310,8 +1310,7 @@ class VirtualDevice : public amd::HeapObject {

//! Returns fence state of the VirtualGPU
virtual bool isFenceDirty() const = 0;
//! Init hidden heap for device memory allocations
virtual void HiddenHeapInit() = 0;

//! Dispatch captured AQL packet
virtual bool dispatchAqlPacket(uint8_t* aqlpacket,
const std::string& kernelName,
Expand Down Expand Up @@ -2103,9 +2102,7 @@ class Device : public RuntimeObject {
static Memory* p2p_stage_; //!< Staging resources
std::vector<Device*> enabled_p2p_devices_; //!< List of user enabled P2P devices for this device

std::once_flag heap_initialized_; //!< Heap buffer initialization flag
std::once_flag heap_allocated_; //!< Heap buffer allocation flag

std::once_flag heap_initialized_; //!< Heap buffer initialization flag
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device

amd::Memory* arena_mem_obj_; //!< Arena memory object
Expand Down
2 changes: 0 additions & 2 deletions rocclr/device/pal/palvirtual.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,6 @@ class VirtualGPU : public device::VirtualDevice {

bool isFenceDirty() const { return false; }

void HiddenHeapInit() {}

inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
amd::AccumulateCommand* vcmd = nullptr) {
vcmd->addKernelName(kernelName);
Expand Down
3 changes: 2 additions & 1 deletion rocclr/device/rocm/rocblit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2621,7 +2621,8 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem
address parameters = captureArguments(kernels_[blitType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
gpu().releaseGpuMemoryFence();
synchronize();

return result;
}

Expand Down
18 changes: 4 additions & 14 deletions rocclr/device/rocm/rocdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3507,7 +3507,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer

// ================================================================================================
void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
auto HeapAllocOnly = [this, &gpu]() -> bool {
auto HeapAllocZeroOut = [this, &gpu]() -> bool {
// Allocate initial heap for device memory allocator
static constexpr size_t HeapBufferSize = 128 * Ki;
heap_buffer_ = createMemory(HeapBufferSize);
Expand All @@ -3519,22 +3519,12 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
LogError("Heap buffer allocation failed!");
return false;
}
return true;
};
std::call_once(heap_allocated_, HeapAllocOnly);
}

// ================================================================================================
void Device::HiddenHeapInit(const VirtualGPU& gpu) {
auto HeapZeroOut = [this, &gpu]() -> bool {
static constexpr size_t HeapBufferSize = 128 * Ki;
bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr())
.initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize,
initial_heap_size_ / (2 * Mi));
bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr()).initHeap(
heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));

return result;
};
std::call_once(heap_initialized_, HeapZeroOut);
std::call_once(heap_initialized_, HeapAllocZeroOut);
}

// ================================================================================================
Expand Down
3 changes: 1 addition & 2 deletions rocclr/device/rocm/rocdevice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,7 @@ class Device : public NullDevice {

//! Allocates hidden heap for device memory allocations
void HiddenHeapAlloc(const VirtualGPU& gpu);
//! Init hidden heap for device memory allocations
void HiddenHeapInit(const VirtualGPU& gpu);

uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const;
void resetSDMAMask(const device::BlitManager* handle) const;
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
Expand Down
9 changes: 2 additions & 7 deletions rocclr/device/rocm/rocvirtual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2959,8 +2959,6 @@ static inline void nontemporalMemcpy(
#endif
}

void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*this); }

// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
Expand Down Expand Up @@ -3015,7 +3013,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,

amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();

for (int j = 0; j < iteration; j++) {
// Reset global size for dimension dim if split is needed
if (dim != -1) {
Expand Down Expand Up @@ -3142,10 +3140,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const_cast<Device&>(dev()).HiddenHeapAlloc(*this);
}
if (dev().HeapBuffer() != nullptr) {
// Initialize hidden heap buffer
if (!isGraphCapture) {
const_cast<Device&>(dev()).HiddenHeapInit(*this);
}
// Add heap pointer to the code
size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
Expand Down Expand Up @@ -3228,6 +3222,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
}

address argBuffer = hidden_arguments;
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());

// Find all parameters for the current kernel
Expand Down
2 changes: 0 additions & 2 deletions rocclr/device/rocm/rocvirtual.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,8 +419,6 @@ class VirtualGPU : public device::VirtualDevice {

void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
void HiddenHeapInit();

void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
// } roc OpenCL integration
Expand Down

0 comments on commit dd7f957

Please sign in to comment.