Skip to content

Commit

Permalink
Add setting to cache the GPUMemory mappings for pipeline upload
Browse files Browse the repository at this point in the history
During pipeline upload, GPU memory is allocated from the pool. This
memory is mapped, the pipeline is uploaded, and then the memory is unmapped.
In applications that create many pipelines during rendering,
this causes many map/unmap calls and slows down pipeline upload.

Add a setting that allows the GpuMemory mapping to be cached, so that it
can be reused by subsequent pipeline uploads.
  • Loading branch information
samikhawaja committed Jul 16, 2021
1 parent 1311c5c commit fa620f7
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 3 deletions.
16 changes: 16 additions & 0 deletions src/core/g_palSettings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ void SettingsLoader::SetupDefaults()
m_settings.overlayReportMes = true;
m_settings.mipGenUseFastPath = false;
m_settings.useFp16GenMips = false;
m_settings.maxNumMappedPool = 4294967295;
m_settings.tmzEnabled = true;
#if PAL_DEVELOPER_BUILD
m_settings.dbgHelperBits = 0x0;
Expand Down Expand Up @@ -610,6 +611,11 @@ void SettingsLoader::ReadSettings()
&m_settings.useFp16GenMips,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pmaxNumMappedPoolStr,
Util::ValueType::Uint,
&m_settings.maxNumMappedPool,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pTmzEnabledStr,
Util::ValueType::Boolean,
&m_settings.tmzEnabled,
Expand Down Expand Up @@ -655,6 +661,11 @@ void SettingsLoader::RereadSettings()
&m_settings.useFp16GenMips,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pmaxNumMappedPoolStr,
Util::ValueType::Uint,
&m_settings.maxNumMappedPool,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pUseDccStr,
Util::ValueType::Uint,
&m_settings.useDcc,
Expand Down Expand Up @@ -1100,6 +1111,11 @@ void SettingsLoader::InitSettingsInfo()
info.valueSize = sizeof(m_settings.useFp16GenMips);
m_settingsInfoMap.Insert(192229910, info);

info.type = SettingType::Uint;
info.pValuePtr = &m_settings.maxNumMappedPool;
info.valueSize = sizeof(m_settings.maxNumMappedPool);
m_settingsInfoMap.Insert(4064599218, info);

info.type = SettingType::Boolean;
info.pValuePtr = &m_settings.tmzEnabled;
info.valueSize = sizeof(m_settings.tmzEnabled);
Expand Down
2 changes: 2 additions & 0 deletions src/core/g_palSettings.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ struct PalSettings : public Pal::DriverSettings
bool overlayReportMes;
bool mipGenUseFastPath;
bool useFp16GenMips;
uint32 maxNumMappedPool;
bool tmzEnabled;
#if PAL_DEVELOPER_BUILD
uint64 dbgHelperBits;
Expand Down Expand Up @@ -378,6 +379,7 @@ static const char* pDebugForceResourceAdditionalPaddingStr = "#3601080919";
static const char* pOverlayReportMesStr = "#1685803860";
static const char* pMipGenUseFastPathStr = "#3353227045";
static const char* pUseFp16GenMipsStr = "#192229910";
static const char* pmaxNumMappedPoolStr = "#4064599218";
static const char* pTmzEnabledStr = "#2606194033";
#if PAL_DEVELOPER_BUILD
static const char* pDbgHelperBitsStr = "#3894710420";
Expand Down
4 changes: 2 additions & 2 deletions src/core/hw/gfxip/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ Result PipelineUploader::UploadUsingCpu(
const SectionAddressCalculator& addressCalc,
void** ppMappedPtr)
{
Result result = m_pGpuMemory->Map(&m_pMappedPtr);
Result result = m_pDevice->MemMgr()->Map(m_pGpuMemory, &m_pMappedPtr);
if (result == Result::Success)
{
m_pMappedPtr = VoidPtrInc(m_pMappedPtr, static_cast<size_t>(m_baseOffset));
Expand Down Expand Up @@ -1104,7 +1104,7 @@ Result PipelineUploader::End(
else
{
PAL_ASSERT(m_pMappedPtr != nullptr);
result = m_pGpuMemory->Unmap();
m_pDevice->MemMgr()->Unmap(m_pGpuMemory);
}

m_pMappedPtr = nullptr;
Expand Down
125 changes: 125 additions & 0 deletions src/core/internalMemMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ InternalMemMgr::InternalMemMgr(
:
m_pDevice(pDevice),
m_poolList(pDevice->GetPlatform()),
m_unusedMappedPoolList(pDevice->GetPlatform()),
m_references(pDevice->GetPlatform()),
m_referenceWatermark(0)
{
Expand All @@ -138,6 +139,18 @@ InternalMemMgr::InternalMemMgr(
// Explicitly frees all GPU memory allocations.
void InternalMemMgr::FreeAllocations()
{

for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
{
PAL_ASSERT((it.Get() != nullptr) && (it.Get()->pBuddyAllocator != nullptr));

if (it.Get()->mapped && it.Get()->pGpuMemory != nullptr)
{
it.Get()->pGpuMemory->Unmap();
it.Get()->mapped = false;
}
}

// Delete the GPU memory objects using the references list
while (m_references.NumElements() != 0)
{
Expand Down Expand Up @@ -563,4 +576,116 @@ uint32 InternalMemMgr::GetReferencesCount()
return static_cast<uint32>(m_references.NumElements());
}

// =====================================================================================================================
// Maps the given GPU memory for CPU access and returns the CPU address of the mapping in *ppData.
//
// For buddy-allocated memory the mapping is cached on the owning pool: the pool's base allocation is mapped at most
// once, every caller receives the same pool-level pointer, and a per-pool reference count tracks how many callers
// currently use it. Memory that was not buddy-allocated is mapped directly on the GPU memory object.
//
// Returns Result::Success on success, the error reported by the underlying Map() call if mapping failed, or
// Result::ErrorInvalidValue if buddy-allocated memory could not be found in the pool list.
Result InternalMemMgr::Map(
    GpuMemory* pGpuMemory,
    void**     ppData)
{
    Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock
    PAL_ASSERT(pGpuMemory != nullptr);
    PAL_ASSERT(ppData != nullptr);

    Result result = Result::ErrorInvalidValue;
    if (pGpuMemory->WasBuddyAllocated())
    {
        // Try to find the allocation in the pool list
        for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
        {
            GpuMemoryPool* pPool = it.Get();

            PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr));

            if (pPool->pGpuMemory != pGpuMemory)
            {
                continue;
            }

            if (pPool->mapped == false)
            {
                result = pPool->pGpuMemory->Map(&pPool->pData);
                if (result != Result::Success)
                {
                    // Nothing was mapped, so the mapped-pool counter must be left untouched. Decrementing it here
                    // would desynchronize (or underflow) the unsigned counter.
                    break;
                }

                pPool->mapped = true;
                ++m_numMappedPools;

                // The freshly mapped pool is not in the unused list, so it cannot be evicted by this call.
                CheckMappedPoolLimit();
            }
            else if (pPool->refCount == 0)
            {
                // The pool is mapped but idle, so it should be in the unused list; take it out because it is about
                // to be referenced again.
                for (auto unusedIt = m_unusedMappedPoolList.Begin(); unusedIt.Get() != nullptr; unusedIt.Next())
                {
                    if (*(unusedIt.Get()) == pPool)
                    {
                        m_unusedMappedPoolList.Erase(&unusedIt);
                        break;
                    }
                }
            }

            pPool->refCount++;
            *ppData = pPool->pData;
            result  = Result::Success;
            break;
        }

        // If we didn't find the allocation in the pool list then something went wrong with the allocation scheme
        PAL_ASSERT(result == Result::Success);
    }
    else
    {
        result = pGpuMemory->Map(ppData);
    }

    return result;
}

// =====================================================================================================================
// Releases a mapping previously obtained through InternalMemMgr::Map().
//
// For buddy-allocated memory this only drops one reference on the owning pool's cached mapping. When the last
// reference goes away the pool is appended to the unused list, and it is actually unmapped only if the mapped-pool
// limit is exceeded. Memory that was not buddy-allocated is unmapped directly, mirroring what Map() does for it.
//
// Returns the result of the direct Unmap() call for non-buddy-allocated memory, Result::Success otherwise.
Result InternalMemMgr::Unmap(
    GpuMemory* pGpuMemory)
{
    Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock
    PAL_ASSERT(pGpuMemory != nullptr);

    Result result = Result::Success;

    if (pGpuMemory->WasBuddyAllocated())
    {
        // Try to find the allocation in the pool list
        for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
        {
            GpuMemoryPool* pPool = it.Get();

            PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr));

            if (pPool->pGpuMemory != pGpuMemory)
            {
                continue;
            }

            // Ignore unbalanced calls: decrementing a zero refCount would wrap the unsigned counter and the pool
            // would never be considered unused again.
            if (pPool->mapped && (pPool->refCount > 0))
            {
                pPool->refCount--;
                if (pPool->refCount == 0)
                {
                    // Keep the mapping cached; it is only torn down if the mapped-pool limit is exceeded.
                    m_unusedMappedPoolList.PushBack(pPool);
                    CheckMappedPoolLimit();
                }
            }
            break;
        }
    }
    else
    {
        // Map() maps non-pooled memory directly, so it has to be unmapped directly as well or the mapping leaks.
        result = pGpuMemory->Unmap();
    }

    return result;
}

void InternalMemMgr::CheckMappedPoolLimit()
{
if ((m_pDevice->Settings().maxNumMappedPool >= 0))
{
while (m_numMappedPools > m_pDevice->Settings().maxNumMappedPool
&& m_unusedMappedPoolList.NumElements() > 0)
{
auto it = m_unusedMappedPoolList.Begin();
GpuMemoryPool *pPool = *it.Get();

PAL_ASSERT(pPool->pBuddyAllocator != nullptr);
if (pPool->mapped && pPool->pGpuMemory != nullptr)
{
pPool->pGpuMemory->Unmap();
pPool->mapped = false;
}
m_unusedMappedPoolList.Erase(&it);
--m_numMappedPools;
}
}
}


} // Pal
21 changes: 21 additions & 0 deletions src/core/internalMemMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ struct GpuMemoryPool
uint64 pagingFenceVal; // Paging fence value

Util::BuddyAllocator<Platform>* pBuddyAllocator; // Buddy allocator used for the suballocation
bool mapped; // flag to check if this pool is already mapped
void* pData; // address of the already existing mapping
size_t refCount; // refCount the number of memory allocations use this mapping
};

// =====================================================================================================================
Expand All @@ -77,6 +80,7 @@ class InternalMemMgr
typedef Util::ListIterator<GpuMemoryInfo, Platform> GpuMemoryListIterator;

typedef Util::List<GpuMemoryPool, Platform> GpuMemoryPoolList;
typedef Util::List<GpuMemoryPool*, Platform> GpuMemoryPoolRefList;

explicit InternalMemMgr(Device* pDevice);
~InternalMemMgr() { FreeAllocations(); }
Expand Down Expand Up @@ -115,6 +119,17 @@ class InternalMemMgr
// Number of all allocations in the reference list. Note that this function takes the reference list lock.
uint32 GetReferencesCount();

Result Map(
GpuMemory* pGpuMemory,
void** ppData);

Result Unmap(
GpuMemory* pGpuMemory);

// If the number of mapped pools is more than the maximum limit, unmap unused pools (least recently used first).
void CheckMappedPoolLimit();


private:
Result AllocateBaseGpuMem(
const GpuMemoryCreateInfo& createInfo,
Expand All @@ -133,6 +148,9 @@ class InternalMemMgr
// Maintain a list of GPU memory objects that are sub-allocated
GpuMemoryPoolList m_poolList;

// Maintain a list of GPU memory objects that are sub-allocated and mapped but unused
GpuMemoryPoolRefList m_unusedMappedPoolList;

// Maintain a list of internal GPU memory references
GpuMemoryList m_references;

Expand All @@ -142,6 +160,9 @@ class InternalMemMgr
// Ever-incrementing watermark to signal changes to the internal memory reference list
uint32 m_referenceWatermark;

// Number of mapped memory pools
uint32 m_numMappedPools;

PAL_DISALLOW_COPY_AND_ASSIGN(InternalMemMgr);
PAL_DISALLOW_DEFAULT_CTOR(InternalMemMgr);
};
Expand Down
19 changes: 18 additions & 1 deletion src/core/settings_core.json
Original file line number Diff line number Diff line change
Expand Up @@ -1881,6 +1881,23 @@
"VariableName": "useFp16GenMips",
"Description": "If mipGenUseFastPath == true and this is true - use the fp16 single-pass GenMips compute pass."
},
{
"Name": "maxNumMappedPool",
"Tags": [
"Resource Settings",
"Performance"
],
"Defaults": {
"Default": 4294967295
},
"Flags": {
"RereadSetting": true
},
"Scope": "PrivatePalKey",
"Type": "uint32",
"VariableName": "maxNumMappedPool",
"Description": "If maxNumMappedPool > 0 the mapped gpu memory for pipeline creation will not be unmapped. If the number of unused mapped pools grows greater than maxNumMappedPool, then the least recently used pool will be unmapped."
},
{
"Name": "TmzEnabled",
"Tags": [
Expand Down Expand Up @@ -2019,4 +2036,4 @@
"Description": "Maximum string length for a miscellaneous string setting"
}
]
}
}

0 comments on commit fa620f7

Please sign in to comment.