From d107728c42ed88dea277ba6176b93cc0e724b211 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Mon, 12 Jul 2021 14:53:41 +0000 Subject: [PATCH] Add setting to cache the GPUMemory mappings for pipeline upload During pipeline upload the GpuMemory is allocated from the pool. This memory is mapped, pipeline is uploaded and then the memory is unmapped. In applications with lots of pipelines being created during rendering, this causes lots of map/unmap calls and slows down the pipeline upload. Add a setting that allows the cache of the GpuMemory mapping, so that it can be reused by another pipeline upload. --- src/core/g_palSettings.cpp | 16 ++++ src/core/g_palSettings.h | 2 + src/core/hw/gfxip/pipeline.cpp | 4 +- src/core/internalMemMgr.cpp | 138 +++++++++++++++++++++++++++++++++ src/core/internalMemMgr.h | 20 +++++ src/core/settings_core.json | 19 ++++- 6 files changed, 196 insertions(+), 3 deletions(-) diff --git a/src/core/g_palSettings.cpp b/src/core/g_palSettings.cpp index d7668139..88498a4e 100644 --- a/src/core/g_palSettings.cpp +++ b/src/core/g_palSettings.cpp @@ -162,6 +162,7 @@ void SettingsLoader::SetupDefaults() m_settings.overlayReportMes = true; m_settings.mipGenUseFastPath = false; m_settings.useFp16GenMips = false; + m_settings.maxNumMappedPool = 0; m_settings.tmzEnabled = true; #if PAL_DEVELOPER_BUILD m_settings.dbgHelperBits = 0x0; @@ -610,6 +611,11 @@ void SettingsLoader::ReadSettings() &m_settings.useFp16GenMips, InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pmaxNumMappedPoolStr, + Util::ValueType::Uint, + &m_settings.maxNumMappedPool, + InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pTmzEnabledStr, Util::ValueType::Boolean, &m_settings.tmzEnabled, @@ -655,6 +661,11 @@ void SettingsLoader::RereadSettings() &m_settings.useFp16GenMips, InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pmaxNumMappedPoolStr, + Util::ValueType::Uint, + &m_settings.maxNumMappedPool, 
+ InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pUseDccStr, Util::ValueType::Uint, &m_settings.useDcc, @@ -1100,6 +1111,11 @@ void SettingsLoader::InitSettingsInfo() info.valueSize = sizeof(m_settings.useFp16GenMips); m_settingsInfoMap.Insert(192229910, info); + info.type = SettingType::Uint; + info.pValuePtr = &m_settings.maxNumMappedPool; + info.valueSize = sizeof(m_settings.maxNumMappedPool); + m_settingsInfoMap.Insert(4064599218, info); + info.type = SettingType::Boolean; info.pValuePtr = &m_settings.tmzEnabled; info.valueSize = sizeof(m_settings.tmzEnabled); diff --git a/src/core/g_palSettings.h b/src/core/g_palSettings.h index 02107710..aa096d1e 100644 --- a/src/core/g_palSettings.h +++ b/src/core/g_palSettings.h @@ -279,6 +279,7 @@ struct PalSettings : public Pal::DriverSettings bool overlayReportMes; bool mipGenUseFastPath; bool useFp16GenMips; + uint32 maxNumMappedPool; bool tmzEnabled; #if PAL_DEVELOPER_BUILD uint64 dbgHelperBits; @@ -378,6 +379,7 @@ static const char* pDebugForceResourceAdditionalPaddingStr = "#3601080919"; static const char* pOverlayReportMesStr = "#1685803860"; static const char* pMipGenUseFastPathStr = "#3353227045"; static const char* pUseFp16GenMipsStr = "#192229910"; +static const char* pmaxNumMappedPoolStr = "#4064599218"; static const char* pTmzEnabledStr = "#2606194033"; #if PAL_DEVELOPER_BUILD static const char* pDbgHelperBitsStr = "#3894710420"; diff --git a/src/core/hw/gfxip/pipeline.cpp b/src/core/hw/gfxip/pipeline.cpp index cb6a4962..142d890f 100644 --- a/src/core/hw/gfxip/pipeline.cpp +++ b/src/core/hw/gfxip/pipeline.cpp @@ -881,7 +881,7 @@ Result PipelineUploader::UploadUsingCpu( const SectionAddressCalculator& addressCalc, void** ppMappedPtr) { - Result result = m_pGpuMemory->Map(&m_pMappedPtr); + Result result = m_pDevice->MemMgr()->Map(m_pGpuMemory, &m_pMappedPtr); if (result == Result::Success) { m_pMappedPtr = VoidPtrInc(m_pMappedPtr, static_cast(m_baseOffset)); @@ -1104,7 +1104,7 @@ 
Result PipelineUploader::End( else { PAL_ASSERT(m_pMappedPtr != nullptr); - result = m_pGpuMemory->Unmap(); + m_pDevice->MemMgr()->Unmap(m_pGpuMemory); } m_pMappedPtr = nullptr; diff --git a/src/core/internalMemMgr.cpp b/src/core/internalMemMgr.cpp index d39ebdee..f8844387 100644 --- a/src/core/internalMemMgr.cpp +++ b/src/core/internalMemMgr.cpp @@ -129,6 +129,7 @@ InternalMemMgr::InternalMemMgr( : m_pDevice(pDevice), m_poolList(pDevice->GetPlatform()), + m_unusedMappedPoolList(pDevice->GetPlatform()), m_references(pDevice->GetPlatform()), m_referenceWatermark(0) { @@ -138,6 +139,24 @@ InternalMemMgr::InternalMemMgr( // Explicitly frees all GPU memory allocations. void InternalMemMgr::FreeAllocations() { + + for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next()) + { + PAL_ASSERT((it.Get() != nullptr) && (it.Get()->pBuddyAllocator != nullptr)); + + if (it.Get()->pData != nullptr && it.Get()->pGpuMemory != nullptr) + { + it.Get()->pGpuMemory->Unmap(); + it.Get()->pData = nullptr; + } + } + + while (m_unusedMappedPoolList.NumElements() > 0) + { + auto it = m_unusedMappedPoolList.Begin(); + m_unusedMappedPoolList.Erase(&it); + } + // Delete the GPU memory objects using the references list while (m_references.NumElements() != 0) { @@ -563,4 +582,123 @@ uint32 InternalMemMgr::GetReferencesCount() return static_cast(m_references.NumElements()); } +Result InternalMemMgr::Map( + GpuMemory* pGpuMemory, + void** ppData) +{ + PAL_ASSERT(pGpuMemory != nullptr); + Result result = Result::ErrorInvalidValue; + if (pGpuMemory->WasBuddyAllocated()) + { + Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock + // Try to find the allocation in the pool list + for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next()) + { + GpuMemoryPool* pPool = it.Get(); + + PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr)); + + if (pPool->pGpuMemory == pGpuMemory) + { + if (pPool->pData == nullptr) + { + result = 
pPool->pGpuMemory->Map(&pPool->pData); + if (result != Result::Success) + { + // The map attempt failed before m_numMappedPools was incremented, so leave the counter unchanged. + pPool->pData = nullptr; + break; + } + ++m_numMappedPools; + CheckMappedPoolLimit(); + } + else if (pPool->refCount == 0) + { + // The pool is mapped but currently unreferenced, so it must be on the unused list; remove it from there. + for (auto it2 = m_unusedMappedPoolList.Begin(); it2.Get() != nullptr; it2.Next()) + { + if (*(it2.Get()) == pPool) + { + m_unusedMappedPoolList.Erase(&it2); + break; + } + } + } + pPool->refCount++; + *ppData = pPool->pData; + result = Result::Success; + break; + } + } + + // ErrorInvalidValue here means the allocation was never found in the pool list - something went wrong with the allocation scheme. + PAL_ASSERT(result != Result::ErrorInvalidValue); + } + else + { + result = pGpuMemory->Map(ppData); + } + + return result; +} + +Result InternalMemMgr::Unmap( + GpuMemory* pGpuMemory) +{ + PAL_ASSERT(pGpuMemory != nullptr); + if (pGpuMemory->WasBuddyAllocated()) + { + Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock + // Try to find the allocation in the pool list + for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next()) + { + GpuMemoryPool* pPool = it.Get(); + + PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr)); + if (pPool->pGpuMemory == pGpuMemory) + { + if (pPool->pData != nullptr) + { + pPool->refCount--; + if (pPool->refCount == 0) + { + m_unusedMappedPoolList.PushBack(pPool); + CheckMappedPoolLimit(); + } + } + break; + } + } + } + else + { + pGpuMemory->Unmap(); + } + + return Result::Success; +} + +void InternalMemMgr::CheckMappedPoolLimit() +{ + // maxNumMappedPool is a uint32, so the former '(maxNumMappedPool >= 0)' guard was always true; the while condition below is sufficient on its own (a setting of 0 means unused pools are unmapped immediately). + { + while (m_numMappedPools > m_pDevice->Settings().maxNumMappedPool + && m_unusedMappedPoolList.NumElements() > 0) + { + auto it = m_unusedMappedPoolList.Begin(); + GpuMemoryPool *pPool = *it.Get(); + + PAL_ASSERT(pPool->pBuddyAllocator != nullptr); + if (pPool->pData != nullptr && pPool->pGpuMemory != nullptr) + { + 
pPool->pGpuMemory->Unmap(); + pPool->pData = nullptr; + } + m_unusedMappedPoolList.Erase(&it); + --m_numMappedPools; + } + } +} + + } // Pal diff --git a/src/core/internalMemMgr.h b/src/core/internalMemMgr.h index 2c9541d8..748d6819 100644 --- a/src/core/internalMemMgr.h +++ b/src/core/internalMemMgr.h @@ -56,6 +56,8 @@ struct GpuMemoryPool uint64 pagingFenceVal; // Paging fence value Util::BuddyAllocator* pBuddyAllocator; // Buddy allocator used for the suballocation + void* pData = nullptr; // CPU address of the pool's cached mapping; nullptr while the pool is unmapped + size_t refCount = 0; // Number of outstanding Map() calls currently sharing the cached mapping }; // ===================================================================================================================== @@ -77,6 +79,7 @@ class InternalMemMgr typedef Util::ListIterator GpuMemoryListIterator; typedef Util::List GpuMemoryPoolList; + typedef Util::List GpuMemoryPoolRefList; explicit InternalMemMgr(Device* pDevice); ~InternalMemMgr() { FreeAllocations(); } @@ -115,6 +118,17 @@ class InternalMemMgr // Number of all allocations in the reference list. Note that this function takes the reference list lock. uint32 GetReferencesCount(); + Result Map( + GpuMemory* pGpuMemory, + void** ppData); + + Result Unmap( + GpuMemory* pGpuMemory); + + // If the number of mapped pools is more than the maximum limit then unmap the least recently used pool. 
+ void CheckMappedPoolLimit(); + + private: Result AllocateBaseGpuMem( const GpuMemoryCreateInfo& createInfo, @@ -133,6 +147,9 @@ class InternalMemMgr // Maintain a list of GPU memory objects that are sub-allocated GpuMemoryPoolList m_poolList; + // Maintain a list of GPU memory objects that are sub-allocated and mapped but unused + GpuMemoryPoolRefList m_unusedMappedPoolList; + // Maintain a list of internal GPU memory references GpuMemoryList m_references; @@ -142,6 +159,9 @@ class InternalMemMgr // Ever-incrementing watermark to signal changes to the internal memory reference list uint32 m_referenceWatermark; + // Number of currently mapped memory pools; initialized here because the constructor's init list does not set it + uint32 m_numMappedPools = 0; + PAL_DISALLOW_COPY_AND_ASSIGN(InternalMemMgr); PAL_DISALLOW_DEFAULT_CTOR(InternalMemMgr); }; diff --git a/src/core/settings_core.json b/src/core/settings_core.json index 14242a5b..9bf56537 100644 --- a/src/core/settings_core.json +++ b/src/core/settings_core.json @@ -1881,6 +1881,23 @@ "VariableName": "useFp16GenMips", "Description": "If mipGenUseFastPath == true and this is true - use the fp16 single-pass GenMips compute pass." }, + { + "Name": "maxNumMappedPool", + "Tags": [ + "Resource Settings", + "Performance" + ], + "Defaults": { + "Default": 0 + }, + "Flags": { + "RereadSetting": true + }, + "Scope": "PrivatePalKey", + "Type": "uint32", + "VariableName": "maxNumMappedPool", + "Description": "If maxNumMappedPool > 0 the mapped gpu memory for pipeline creation will not be unmapped. If the number of unused mapped pools grows greater than maxNumMappedPool, then the least recently used pool will be unmapped." + }, { "Name": "TmzEnabled", "Tags": [ @@ -2019,4 +2036,4 @@ "Description": "Maximum string length for a miscellaneous string setting" } ] -} \ No newline at end of file +}