From b3626e7984a803cbcff2fdce49c7809c14596691 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja
Date: Mon, 12 Jul 2021 14:53:41 +0000
Subject: [PATCH] Add setting to cache the GpuMemory mappings for pipeline upload

During pipeline upload the GpuMemory is allocated from a pool. This memory
is mapped, the pipeline is uploaded, and then the memory is unmapped. In
applications that create many pipelines during rendering, this causes a
large number of map/unmap calls and slows down pipeline upload.

Add a setting that allows the GpuMemory mapping to be cached so that it can
be reused by subsequent pipeline uploads.
---
 src/core/g_palSettings.cpp     |  16 ++++
 src/core/g_palSettings.h       |   2 +
 src/core/hw/gfxip/pipeline.cpp |   4 +-
 src/core/internalMemMgr.cpp    | 138 +++++++++++++++++++++++++++++++++
 src/core/internalMemMgr.h      |  20 +++++
 src/core/settings_core.json    |  19 ++++-
 6 files changed, 196 insertions(+), 3 deletions(-)

diff --git a/src/core/g_palSettings.cpp b/src/core/g_palSettings.cpp
index d7668139..82a8675e 100644
--- a/src/core/g_palSettings.cpp
+++ b/src/core/g_palSettings.cpp
@@ -162,6 +162,7 @@ void SettingsLoader::SetupDefaults()
     m_settings.overlayReportMes = true;
     m_settings.mipGenUseFastPath = false;
     m_settings.useFp16GenMips = false;
+    m_settings.maxMappedPoolsSize = 0;
     m_settings.tmzEnabled = true;
 #if PAL_DEVELOPER_BUILD
     m_settings.dbgHelperBits = 0x0;
@@ -610,6 +611,11 @@ void SettingsLoader::ReadSettings()
                                                       &m_settings.useFp16GenMips,
                                                       InternalSettingScope::PrivatePalKey);
 
+    static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pmaxMappedPoolsSizeStr,
+                                                      Util::ValueType::Uint64,
+                                                      &m_settings.maxMappedPoolsSize,
+                                                      InternalSettingScope::PrivatePalKey);
+
     static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pTmzEnabledStr,
                                                       Util::ValueType::Boolean,
                                                       &m_settings.tmzEnabled,
                                                       InternalSettingScope::PrivatePalKey);
@@ -655,6 +661,11 @@ void SettingsLoader::RereadSettings()
                                                       &m_settings.useFp16GenMips,
                                                       InternalSettingScope::PrivatePalKey);
 
+    static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pmaxMappedPoolsSizeStr,
+                                                      Util::ValueType::Uint64,
+                                                      &m_settings.maxMappedPoolsSize,
+                                                      InternalSettingScope::PrivatePalKey);
+
     static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pUseDccStr,
                                                       Util::ValueType::Uint,
                                                       &m_settings.useDcc,
                                                       InternalSettingScope::PrivatePalKey);
@@ -1100,6 +1111,11 @@ void SettingsLoader::InitSettingsInfo()
     info.valueSize = sizeof(m_settings.useFp16GenMips);
     m_settingsInfoMap.Insert(192229910, info);
 
+    info.type      = SettingType::Uint64;
+    info.pValuePtr = &m_settings.maxMappedPoolsSize;
+    info.valueSize = sizeof(m_settings.maxMappedPoolsSize);
+    m_settingsInfoMap.Insert(3814409436, info);
+
     info.type      = SettingType::Boolean;
     info.pValuePtr = &m_settings.tmzEnabled;
     info.valueSize = sizeof(m_settings.tmzEnabled);
diff --git a/src/core/g_palSettings.h b/src/core/g_palSettings.h
index 02107710..c2b20848 100644
--- a/src/core/g_palSettings.h
+++ b/src/core/g_palSettings.h
@@ -279,6 +279,7 @@ struct PalSettings : public Pal::DriverSettings
     bool overlayReportMes;
     bool mipGenUseFastPath;
     bool useFp16GenMips;
+    gpusize maxMappedPoolsSize;
     bool tmzEnabled;
 #if PAL_DEVELOPER_BUILD
     uint64 dbgHelperBits;
@@ -378,6 +379,7 @@ static const char* pDebugForceResourceAdditionalPaddingStr = "#3601080919";
 static const char* pOverlayReportMesStr = "#1685803860";
 static const char* pMipGenUseFastPathStr = "#3353227045";
 static const char* pUseFp16GenMipsStr = "#192229910";
+static const char* pmaxMappedPoolsSizeStr = "#3814409436";
 static const char* pTmzEnabledStr = "#2606194033";
 #if PAL_DEVELOPER_BUILD
 static const char* pDbgHelperBitsStr = "#3894710420";
diff --git a/src/core/hw/gfxip/pipeline.cpp b/src/core/hw/gfxip/pipeline.cpp
index cb6a4962..142d890f 100644
--- a/src/core/hw/gfxip/pipeline.cpp
+++ b/src/core/hw/gfxip/pipeline.cpp
@@ -881,7 +881,7 @@ Result PipelineUploader::UploadUsingCpu(
     const SectionAddressCalculator& addressCalc,
     void**                          ppMappedPtr)
 {
-    Result result = m_pGpuMemory->Map(&m_pMappedPtr);
+    Result result = m_pDevice->MemMgr()->Map(m_pGpuMemory, &m_pMappedPtr);
     if (result == Result::Success)
     {
         m_pMappedPtr = VoidPtrInc(m_pMappedPtr, static_cast<size_t>(m_baseOffset));
@@ -1104,7 +1104,7 @@ Result PipelineUploader::End(
     else
     {
         PAL_ASSERT(m_pMappedPtr != nullptr);
-        result = m_pGpuMemory->Unmap();
+        result = m_pDevice->MemMgr()->Unmap(m_pGpuMemory);
     }
 
     m_pMappedPtr = nullptr;
diff --git a/src/core/internalMemMgr.cpp b/src/core/internalMemMgr.cpp
index d39ebdee..e555aed0 100644
--- a/src/core/internalMemMgr.cpp
+++ b/src/core/internalMemMgr.cpp
@@ -129,6 +129,7 @@ InternalMemMgr::InternalMemMgr(
     :
     m_pDevice(pDevice),
     m_poolList(pDevice->GetPlatform()),
+    m_unusedMappedPoolList(pDevice->GetPlatform()),
    m_references(pDevice->GetPlatform()),
     m_referenceWatermark(0)
 {
@@ -138,6 +139,24 @@
 // Explicitly frees all GPU memory allocations.
 void InternalMemMgr::FreeAllocations()
 {
+    // Unmap any pools that still hold a cached CPU mapping before their GPU memory objects are destroyed.
+    for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
+    {
+        PAL_ASSERT((it.Get() != nullptr) && (it.Get()->pBuddyAllocator != nullptr));
+
+        if ((it.Get()->pData != nullptr) && (it.Get()->pGpuMemory != nullptr))
+        {
+            it.Get()->pGpuMemory->Unmap();
+            it.Get()->pData = nullptr;
+        }
+    }
+    // The unused mapped pool list only references pools owned by m_poolList, so simply drop its entries.
+    while (m_unusedMappedPoolList.NumElements() > 0)
+    {
+        auto it = m_unusedMappedPoolList.Begin();
+        m_unusedMappedPoolList.Erase(&it);
+    }
+
     // Delete the GPU memory objects using the references list
     while (m_references.NumElements() != 0)
     {
@@ -563,4 +582,123 @@ uint32 InternalMemMgr::GetReferencesCount()
 {
     return static_cast<uint32>(m_references.NumElements());
 }
+
+Result InternalMemMgr::Map(
+    GpuMemory* pGpuMemory,
+    void**     ppData)
+{
+    PAL_ASSERT(pGpuMemory != nullptr);
+    Result result = Result::ErrorInvalidValue;
+    if (pGpuMemory->WasBuddyAllocated())
+    {
+        Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock
+        // Try to find the allocation in the pool list
+        for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
+        {
+            GpuMemoryPool* pPool = it.Get();
+
+            PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr));
+
+            if (pPool->pGpuMemory == pGpuMemory)
+            {
+                if (pPool->pData == nullptr)
+                {
+                    result = pPool->pGpuMemory->Map(&pPool->pData);
+                    if (result != Result::Success)
+                    {
+                        pPool->pData = nullptr;
+                        break;
+                    }
+                    m_totalSizeMappedPools += pPool->pGpuMemory->Desc().size;
+                    CheckMappedPoolLimit();
+                }
+                else if (pPool->refCount == 0)
+                {
+                    // The pool is already mapped but currently unused; remove it from the unused list.
+                    for (auto it2 = m_unusedMappedPoolList.Begin(); it2.Get() != nullptr; it2.Next())
+                    {
+                        if (*(it2.Get()) == pPool)
+                        {
+                            m_unusedMappedPoolList.Erase(&it2);
+                            break;
+                        }
+                    }
+                }
+                pPool->refCount++;
+                *ppData = pPool->pData;
+                result = Result::Success;
+                break;
+            }
+        }
+
+        // If we didn't find the allocation in the pool list (or mapping failed) something went wrong
+        PAL_ASSERT(result == Result::Success);
+    }
+    else
+    {
+        result = pGpuMemory->Map(ppData);
+    }
+
+    return result;
+}
+
+Result InternalMemMgr::Unmap(
+    GpuMemory* pGpuMemory)
+{
+    PAL_ASSERT(pGpuMemory != nullptr);
+    if (pGpuMemory->WasBuddyAllocated())
+    {
+        Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock
+        // Try to find the allocation in the pool list
+        for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
+        {
+            GpuMemoryPool* pPool = it.Get();
+
+            PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr));
+            if (pPool->pGpuMemory == pGpuMemory)
+            {
+                if (pPool->pData != nullptr)
+                {
+                    pPool->refCount--;
+                    if (pPool->refCount == 0)
+                    {
+                        m_unusedMappedPoolList.PushBack(pPool);
+                        CheckMappedPoolLimit();
+                    }
+                }
+                break;
+            }
+        }
+    }
+    else
+    {
+        pGpuMemory->Unmap();
+    }
+
+    return Result::Success;
+}
+
+// Unmaps least-recently-used pools until the total size of cached mappings fits within the maxMappedPoolsSize
+// setting. A value of zero therefore disables caching: any pool on the unused list is unmapped immediately.
+void InternalMemMgr::CheckMappedPoolLimit()
+{
+    while ((m_totalSizeMappedPools > m_pDevice->Settings().maxMappedPoolsSize) &&
+           (m_unusedMappedPoolList.NumElements() > 0))
+    {
+        auto it = m_unusedMappedPoolList.Begin();
+        GpuMemoryPool* pPool = *it.Get();
+
+        PAL_ASSERT(pPool->pBuddyAllocator != nullptr);
+        if ((pPool->pData != nullptr) && (pPool->pGpuMemory != nullptr))
+        {
+            pPool->pGpuMemory->Unmap();
+            pPool->pData = nullptr;
+        }
+        m_unusedMappedPoolList.Erase(&it);
+
+        PAL_ASSERT(m_totalSizeMappedPools >= pPool->pGpuMemory->Desc().size);
+        m_totalSizeMappedPools -= pPool->pGpuMemory->Desc().size;
+    }
+}
+
 } // Pal
diff --git a/src/core/internalMemMgr.h b/src/core/internalMemMgr.h
index 2c9541d8..aa27be1e 100644
--- a/src/core/internalMemMgr.h
+++ b/src/core/internalMemMgr.h
@@ -56,6 +56,8 @@ struct GpuMemoryPool
     uint64                          pagingFenceVal;    // Paging fence value
 
     Util::BuddyAllocator<Platform>* pBuddyAllocator;   // Buddy allocator used for the suballocation
+    void*                           pData;             // CPU address of the cached mapping (nullptr if unmapped)
+    size_t                          refCount;          // Number of outstanding Map() calls using this mapping
 };
 
 // =====================================================================================================================
@@ -77,6 +79,7 @@ class InternalMemMgr
     typedef Util::ListIterator<GpuMemoryInfo, Platform> GpuMemoryListIterator;
 
     typedef Util::List<GpuMemoryPool, Platform>  GpuMemoryPoolList;
+    typedef Util::List<GpuMemoryPool*, Platform> GpuMemoryPoolRefList;
 
     explicit InternalMemMgr(Device* pDevice);
     ~InternalMemMgr() { FreeAllocations(); }
@@ -115,6 +118,17 @@ class InternalMemMgr
     // Number of all allocations in the reference list. Note that this function takes the reference list lock.
     uint32 GetReferencesCount();
 
+    Result Map(
+        GpuMemory* pGpuMemory,
+        void**     ppData);
+
+    Result Unmap(
+        GpuMemory* pGpuMemory);
+
+    // If the total size of the mapped pools exceeds maxMappedPoolsSize, unmap the least recently used pools.
+    void CheckMappedPoolLimit();
+
+
 private:
     Result AllocateBaseGpuMem(
         const GpuMemoryCreateInfo& createInfo,
@@ -133,6 +147,9 @@ class InternalMemMgr
     // Maintain a list of GPU memory objects that are sub-allocated
     GpuMemoryPoolList m_poolList;
 
+    // Sub-allocated pools whose CPU mapping is cached but which currently have no outstanding Map() calls
+    GpuMemoryPoolRefList m_unusedMappedPoolList;
+
     // Maintain a list of internal GPU memory references
     GpuMemoryList m_references;
 
@@ -142,6 +159,9 @@ class InternalMemMgr
     // Ever-incrementing watermark to signal changes to the internal memory reference list
     uint32 m_referenceWatermark;
 
+    // Total size, in bytes, of all pools whose CPU mapping is currently cached
+    gpusize m_totalSizeMappedPools = 0;
+
     PAL_DISALLOW_COPY_AND_ASSIGN(InternalMemMgr);
     PAL_DISALLOW_DEFAULT_CTOR(InternalMemMgr);
 };
diff --git a/src/core/settings_core.json b/src/core/settings_core.json
index 14242a5b..2abc9607 100644
--- a/src/core/settings_core.json
+++ b/src/core/settings_core.json
@@ -1881,6 +1881,23 @@
         "VariableName": "useFp16GenMips",
         "Description": "If mipGenUseFastPath == true and this is true - use the fp16 single-pass GenMips compute pass."
     },
+    {
+        "Name": "maxMappedPoolsSize",
+        "Tags": [
+            "Resource Settings",
+            "Performance"
+        ],
+        "Defaults": {
+            "Default": 0
+        },
+        "Flags": {
+            "RereadSetting": true
+        },
+        "Scope": "PrivatePalKey",
+        "Type": "gpusize",
+        "VariableName": "maxMappedPoolsSize",
+        "Description": "If maxMappedPoolsSize > 0, GPU memory mapped for pipeline upload stays mapped so the mapping can be reused by later uploads. When the total size of mapped pools exceeds maxMappedPoolsSize, the least recently used pools are unmapped. A value of 0 disables the caching."
+    },
     {
         "Name": "TmzEnabled",
         "Tags": [
@@ -2019,4 +2036,4 @@
         "Description": "Maximum string length for a miscellaneous string setting"
     }
 ]
-}
\ No newline at end of file
+}
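--
Reviewer note: below is a minimal, self-contained C++ sketch of the caching scheme this patch implements in
InternalMemMgr::Map/Unmap/CheckMappedPoolLimit, for readers who want the idea without the PAL plumbing. All names
here (Pool, MappedPoolCache, FakeGpuMap, FakeGpuUnmap, maxMappedSize) are hypothetical stand-ins, not PAL types;
maxMappedSize models the maxMappedPoolsSize setting.

#include <cassert>
#include <cstdint>
#include <list>

struct Pool
{
    uint64_t size     = 0;        // size of the pool's GPU allocation
    void*    pData    = nullptr;  // cached CPU mapping (nullptr when unmapped)
    uint32_t refCount = 0;        // outstanding Map() calls against this pool
};

class MappedPoolCache
{
public:
    explicit MappedPoolCache(uint64_t maxMappedSize) : m_maxMappedSize(maxMappedSize) { }

    // Map a pool, reusing the cached CPU mapping when one exists.
    void* Map(Pool* pPool)
    {
        if (pPool->pData == nullptr)
        {
            pPool->pData   = FakeGpuMap(pPool);  // stand-in for GpuMemory::Map()
            m_totalMapped += pPool->size;
            TrimToLimit();
        }
        else if (pPool->refCount == 0)
        {
            m_unused.remove(pPool);              // cached and idle; it is in use again
        }
        pPool->refCount++;
        return pPool->pData;
    }

    // Unmap only parks the pool on the unused (LRU) list; the CPU mapping stays cached.
    void Unmap(Pool* pPool)
    {
        assert(pPool->refCount > 0);
        if (--pPool->refCount == 0)
        {
            m_unused.push_back(pPool);
            TrimToLimit();
        }
    }

private:
    // Evict least recently used mappings until the cached total fits the budget.
    void TrimToLimit()
    {
        while ((m_totalMapped > m_maxMappedSize) && (m_unused.empty() == false))
        {
            Pool* pPool = m_unused.front();
            m_unused.pop_front();
            FakeGpuUnmap(pPool);                 // stand-in for GpuMemory::Unmap()
            pPool->pData   = nullptr;
            m_totalMapped -= pPool->size;
        }
    }

    static void* FakeGpuMap(Pool* pPool) { return &pPool->size; } // dummy non-null pointer
    static void  FakeGpuUnmap(Pool*)     { }

    uint64_t         m_maxMappedSize;   // models the maxMappedPoolsSize setting
    uint64_t         m_totalMapped = 0; // models m_totalSizeMappedPools
    std::list<Pool*> m_unused;          // models m_unusedMappedPoolList (front = least recently used)
};

With the budget left at zero (the setting's default), TrimToLimit unmaps a pool as soon as its last reference is
released, matching the pre-patch behavior; a non-zero budget keeps recently used mappings warm for later uploads.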