From e129c531cc0b6b08b170d67b68075fd3c629030c Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Mon, 12 Jul 2021 14:53:41 +0000 Subject: [PATCH] Add setting to cache the GPUMemory mappings for pipeline upload During pipeline upload the GpuMemory is allocated from the pool. This memory is mapped, pipeline is uploaded and then the memory is unmapped. In applications with lots of pipelines being created during rendering, this causes lots of map/unmap calls and slows down the pipeline upload. Add a setting that allows the cache of the GpuMemory mapping, so that it can be reused by another pipeline upload. --- src/core/g_palSettings.cpp | 16 ++++ src/core/g_palSettings.h | 2 + src/core/hw/gfxip/pipeline.cpp | 4 +- src/core/internalMemMgr.cpp | 144 +++++++++++++++++++++++++++++++++ src/core/internalMemMgr.h | 20 +++++ src/core/settings_core.json | 19 ++++- 6 files changed, 202 insertions(+), 3 deletions(-) diff --git a/src/core/g_palSettings.cpp b/src/core/g_palSettings.cpp index 43b108cf6..2755bcdff 100644 --- a/src/core/g_palSettings.cpp +++ b/src/core/g_palSettings.cpp @@ -164,6 +164,7 @@ void SettingsLoader::SetupDefaults() m_settings.overlayReportMes = true; m_settings.mipGenUseFastPath = false; m_settings.useFp16GenMips = false; + m_settings.maxMappedPoolsSize = 0; m_settings.tmzEnabled = true; #if PAL_DEVELOPER_BUILD m_settings.dbgHelperBits = 0x0; @@ -633,6 +634,11 @@ void SettingsLoader::ReadSettings() &m_settings.useFp16GenMips, InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pmaxMappedPoolsSizeStr, + Util::ValueType::Uint64, + &m_settings.maxMappedPoolsSize, + InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pTmzEnabledStr, Util::ValueType::Boolean, &m_settings.tmzEnabled, @@ -683,6 +689,11 @@ void SettingsLoader::RereadSettings() &m_settings.useFp16GenMips, InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pmaxMappedPoolsSizeStr, + Util::ValueType::Uint64, + 
&m_settings.maxMappedPoolsSize, + InternalSettingScope::PrivatePalKey); + static_cast(m_pDevice)->ReadSetting(pUseDccStr, Util::ValueType::Uint, &m_settings.useDcc, @@ -1148,6 +1159,11 @@ void SettingsLoader::InitSettingsInfo() info.valueSize = sizeof(m_settings.useFp16GenMips); m_settingsInfoMap.Insert(192229910, info); + info.type = SettingType::Uint64; + info.pValuePtr = &m_settings.maxMappedPoolsSize; + info.valueSize = sizeof(m_settings.maxMappedPoolsSize); + m_settingsInfoMap.Insert(3814409436, info); + info.type = SettingType::Boolean; info.pValuePtr = &m_settings.tmzEnabled; info.valueSize = sizeof(m_settings.tmzEnabled); diff --git a/src/core/g_palSettings.h b/src/core/g_palSettings.h index 057aa2c0a..6eaf195bf 100644 --- a/src/core/g_palSettings.h +++ b/src/core/g_palSettings.h @@ -290,6 +290,7 @@ struct PalSettings : public Pal::DriverSettings bool overlayReportMes; bool mipGenUseFastPath; bool useFp16GenMips; + gpusize maxMappedPoolsSize; bool tmzEnabled; #if PAL_DEVELOPER_BUILD uint64 dbgHelperBits; @@ -392,6 +393,7 @@ static const char* pDebugForceResourceAdditionalPaddingStr = "#3601080919"; static const char* pOverlayReportMesStr = "#1685803860"; static const char* pMipGenUseFastPathStr = "#3353227045"; static const char* pUseFp16GenMipsStr = "#192229910"; +static const char* pmaxMappedPoolsSizeStr = "#3814409436"; static const char* pTmzEnabledStr = "#2606194033"; #if PAL_DEVELOPER_BUILD static const char* pDbgHelperBitsStr = "#3894710420"; diff --git a/src/core/hw/gfxip/pipeline.cpp b/src/core/hw/gfxip/pipeline.cpp index d29705fd3..398221822 100644 --- a/src/core/hw/gfxip/pipeline.cpp +++ b/src/core/hw/gfxip/pipeline.cpp @@ -912,7 +912,7 @@ Result PipelineUploader::UploadUsingCpu( const SectionAddressCalculator& addressCalc, void** ppMappedPtr) { - Result result = m_pGpuMemory->Map(&m_pMappedPtr); + Result result = m_pDevice->MemMgr()->Map(m_pGpuMemory, &m_pMappedPtr); if (result == Result::Success) { m_pMappedPtr = VoidPtrInc(m_pMappedPtr, 
static_cast(m_baseOffset)); @@ -1141,7 +1141,7 @@ Result PipelineUploader::End( else { PAL_ASSERT(m_pMappedPtr != nullptr); - result = m_pGpuMemory->Unmap(); + m_pDevice->MemMgr()->Unmap(m_pGpuMemory); } m_pMappedPtr = nullptr; diff --git a/src/core/internalMemMgr.cpp b/src/core/internalMemMgr.cpp index 8fa36d8b7..34b7489fc 100644 --- a/src/core/internalMemMgr.cpp +++ b/src/core/internalMemMgr.cpp @@ -128,6 +128,7 @@ InternalMemMgr::InternalMemMgr( : m_pDevice(pDevice), m_poolList(pDevice->GetPlatform()), + m_unusedMappedPoolList(pDevice->GetPlatform()), m_references(pDevice->GetPlatform()), m_referenceWatermark(0) { @@ -137,6 +138,24 @@ InternalMemMgr::InternalMemMgr( // Explicitly frees all GPU memory allocations. void InternalMemMgr::FreeAllocations() { + + for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next()) + { + PAL_ASSERT((it.Get() != nullptr) && (it.Get()->pBuddyAllocator != nullptr)); + + if ((it.Get()->pData != nullptr) && (it.Get()->pGpuMemory != nullptr)) + { + it.Get()->pGpuMemory->Unmap(); + it.Get()->pData = nullptr; + } + } + + while (m_unusedMappedPoolList.NumElements() > 0) + { + auto it = m_unusedMappedPoolList.Begin(); + m_unusedMappedPoolList.Erase(&it); + } + // Delete the GPU memory objects using the references list while (m_references.NumElements() != 0) { @@ -562,4 +581,129 @@ uint32 InternalMemMgr::GetReferencesCount() return static_cast(m_references.NumElements()); } +// ===================================================================================================================== +// Map the GPU memory allocation for cpu access +Result InternalMemMgr::Map( + GpuMemory* pGpuMemory, + void** ppData) +{ + PAL_ASSERT(pGpuMemory != nullptr); + Result result = Result::ErrorInvalidValue; + if (pGpuMemory->WasBuddyAllocated()) + { + Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock + // Try to find the allocation in the pool list + for (auto it = m_poolList.Begin(); it.Get() != nullptr; 
it.Next()) + { + GpuMemoryPool* pPool = it.Get(); + + PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr)); + + if (pPool->pGpuMemory == pGpuMemory) + { + if (pPool->pData == nullptr) + { + result = pPool->pGpuMemory->Map(&pPool->pData); + if (result != Result::Success) + { + pPool->pData = nullptr; + break; + } + m_totalSizeMappedPools += pPool->pGpuMemory->Desc().size; + CheckMappedPoolLimit(); + } + else if (pPool->refCount == 0) + { + // should be in unused list, remove it from there. + for (auto it2 = m_unusedMappedPoolList.Begin(); it2.Get() != nullptr; it2.Next()) + { + if (*(it2.Get()) == pPool) + { + m_unusedMappedPoolList.Erase(&it2); + break; + } + } + } + pPool->refCount++; + *ppData = pPool->pData; + result = Result::Success; + break; + } + } + + // If we didn't find the allocation in the pool list then something went wrong with the allocation scheme + PAL_ASSERT(result == Result::Success); + } + else + { + result = pGpuMemory->Map(ppData); + } + + return result; +} + +// ===================================================================================================================== +// Unmap the GPU memory allocation from cpu address space +Result InternalMemMgr::Unmap( + GpuMemory* pGpuMemory) +{ + PAL_ASSERT(pGpuMemory != nullptr); + if (pGpuMemory->WasBuddyAllocated()) + { + Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock + // Try to find the allocation in the pool list + for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next()) + { + GpuMemoryPool* pPool = it.Get(); + + PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr)); + if (pPool->pGpuMemory == pGpuMemory) + { + if (pPool->pData != nullptr) + { + pPool->refCount--; + if (pPool->refCount == 0) + { + m_unusedMappedPoolList.PushBack(pPool); + CheckMappedPoolLimit(); + } + } + break; + } + } + } + else + { + pGpuMemory->Unmap(); + } + + return Result::Success; +} + +// 
===================================================================================================================== +// Check the total size of mapped pools; if it exceeds the maximum limit, unmap the least recently used mappings. NOTE(review): maxMappedPoolsSize is an unsigned gpusize, so the '>= 0' guard below is always true and the loop always runs - that is what makes a limit of 0 unmap unused pools immediately (feature disabled). Remove the redundant 'if' rather than changing it to '> 0'. +void InternalMemMgr::CheckMappedPoolLimit() +{ + if (m_pDevice->Settings().maxMappedPoolsSize >= 0) + { + while ((m_totalSizeMappedPools > m_pDevice->Settings().maxMappedPoolsSize) + && (m_unusedMappedPoolList.NumElements() > 0)) + { + auto it = m_unusedMappedPoolList.Begin(); + GpuMemoryPool *pPool = *it.Get(); + + PAL_ASSERT(pPool->pBuddyAllocator != nullptr); + if ((pPool->pData != nullptr) && (pPool->pGpuMemory != nullptr)) + { + pPool->pGpuMemory->Unmap(); + pPool->pData = nullptr; + } + m_unusedMappedPoolList.Erase(&it); + PAL_ASSERT(m_totalSizeMappedPools >= pPool->pGpuMemory->Desc().size); + m_totalSizeMappedPools -= pPool->pGpuMemory->Desc().size; + } + } +} + + } // Pal diff --git a/src/core/internalMemMgr.h b/src/core/internalMemMgr.h index 2c9541d8f..aa27be1e2 100644 --- a/src/core/internalMemMgr.h +++ b/src/core/internalMemMgr.h @@ -56,6 +56,8 @@ struct GpuMemoryPool uint64 pagingFenceVal; // Paging fence value Util::BuddyAllocator* pBuddyAllocator; // Buddy allocator used for the suballocation + void* pData; // CPU address of the existing mapping (nullptr when the pool is not mapped) + size_t refCount; // number of suballocations currently using this mapping }; // ===================================================================================================================== @@ -77,6 +79,7 @@ class InternalMemMgr typedef Util::ListIterator<GpuMemory*, Platform> GpuMemoryListIterator; typedef Util::List<GpuMemoryPool, Platform> GpuMemoryPoolList; + typedef Util::List<GpuMemoryPool*, Platform> GpuMemoryPoolRefList; explicit InternalMemMgr(Device* pDevice); ~InternalMemMgr() { FreeAllocations(); } @@ -115,6 +118,17 @@ class InternalMemMgr // Number of all allocations in the reference list. Note that this function takes the reference list lock. 
uint32 GetReferencesCount(); + Result Map( + GpuMemory* pGpuMemory, + void** ppData); + + Result Unmap( + GpuMemory* pGpuMemory); + + // If the total size of mapped pools exceeds the maximum limit, unmap the least recently used pool. + void CheckMappedPoolLimit(); + + private: Result AllocateBaseGpuMem( const GpuMemoryCreateInfo& createInfo, @@ -133,6 +147,9 @@ // Maintain a list of GPU memory objects that are sub-allocated GpuMemoryPoolList m_poolList; + // Maintain a list of GPU memory objects that are sub-allocated and mapped but unused + GpuMemoryPoolRefList m_unusedMappedPoolList; + // Maintain a list of internal GPU memory references GpuMemoryList m_references; @@ -142,6 +159,9 @@ // Ever-incrementing watermark to signal changes to the internal memory reference list uint32 m_referenceWatermark; + // Total size, in bytes, of currently mapped pools. NOTE(review): this patch never initializes it - the constructor init-list omits it while Map() does '+=' on it; zero it in the constructor or with an in-class initializer. + gpusize m_totalSizeMappedPools; + PAL_DISALLOW_COPY_AND_ASSIGN(InternalMemMgr); PAL_DISALLOW_DEFAULT_CTOR(InternalMemMgr); }; diff --git a/src/core/settings_core.json b/src/core/settings_core.json index 55a49db42..d89c4cdf2 100644 --- a/src/core/settings_core.json +++ b/src/core/settings_core.json @@ -1945,6 +1945,23 @@ "VariableName": "useFp16GenMips", "Description": "If mipGenUseFastPath == true and this is true - use the fp16 single-pass GenMips compute pass." }, + { + "Name": "maxMappedPoolsSize", + "Tags": [ + "Resource Settings", + "Performance" + ], + "Defaults": { + "Default": 0 + }, + "Flags": { + "RereadSetting": true + }, + "Scope": "PrivatePalKey", + "Type": "gpusize", + "VariableName": "maxMappedPoolsSize", + "Description": "If maxMappedPoolsSize > 0 the mapped gpu memory for pipeline creation will not be unmapped. If the total size of mapped pools grows greater than maxMappedPoolsSize, then the least recently used pools will be unmapped." 
+ }, { "Name": "TmzEnabled", "Tags": [ @@ -2129,4 +2146,4 @@ "Description": "Maximum string length for a miscellaneous string setting" } ] -} \ No newline at end of file +}