Skip to content

Commit

Permalink
Add setting to cache the GPUMemory mappings for pipeline upload
Browse files Browse the repository at this point in the history
During pipeline upload the GpuMemory is allocated from the pool. This
memory is mapped, pipeline is uploaded and then the memory is unmapped.
In applications with lots of pipelines being created during rendering,
this causes lots of map/unmap calls and slows down the pipeline upload.

Add a setting that allows the GpuMemory mapping to be cached, so that it
can be reused by another pipeline upload.
  • Loading branch information
samikhawaja committed Jan 5, 2022
1 parent bcec463 commit e129c53
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 3 deletions.
16 changes: 16 additions & 0 deletions src/core/g_palSettings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ void SettingsLoader::SetupDefaults()
m_settings.overlayReportMes = true;
m_settings.mipGenUseFastPath = false;
m_settings.useFp16GenMips = false;
m_settings.maxMappedPoolsSize = 0;
m_settings.tmzEnabled = true;
#if PAL_DEVELOPER_BUILD
m_settings.dbgHelperBits = 0x0;
Expand Down Expand Up @@ -633,6 +634,11 @@ void SettingsLoader::ReadSettings()
&m_settings.useFp16GenMips,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pmaxMappedPoolsSizeStr,
Util::ValueType::Uint64,
&m_settings.maxMappedPoolsSize,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pTmzEnabledStr,
Util::ValueType::Boolean,
&m_settings.tmzEnabled,
Expand Down Expand Up @@ -683,6 +689,11 @@ void SettingsLoader::RereadSettings()
&m_settings.useFp16GenMips,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pmaxMappedPoolsSizeStr,
Util::ValueType::Uint64,
&m_settings.maxMappedPoolsSize,
InternalSettingScope::PrivatePalKey);

static_cast<Pal::Device*>(m_pDevice)->ReadSetting(pUseDccStr,
Util::ValueType::Uint,
&m_settings.useDcc,
Expand Down Expand Up @@ -1148,6 +1159,11 @@ void SettingsLoader::InitSettingsInfo()
info.valueSize = sizeof(m_settings.useFp16GenMips);
m_settingsInfoMap.Insert(192229910, info);

info.type = SettingType::Uint64;
info.pValuePtr = &m_settings.maxMappedPoolsSize;
info.valueSize = sizeof(m_settings.maxMappedPoolsSize);
m_settingsInfoMap.Insert(3814409436, info);

info.type = SettingType::Boolean;
info.pValuePtr = &m_settings.tmzEnabled;
info.valueSize = sizeof(m_settings.tmzEnabled);
Expand Down
2 changes: 2 additions & 0 deletions src/core/g_palSettings.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ struct PalSettings : public Pal::DriverSettings
bool overlayReportMes;
bool mipGenUseFastPath;
bool useFp16GenMips;
gpusize maxMappedPoolsSize;
bool tmzEnabled;
#if PAL_DEVELOPER_BUILD
uint64 dbgHelperBits;
Expand Down Expand Up @@ -392,6 +393,7 @@ static const char* pDebugForceResourceAdditionalPaddingStr = "#3601080919";
static const char* pOverlayReportMesStr = "#1685803860";
static const char* pMipGenUseFastPathStr = "#3353227045";
static const char* pUseFp16GenMipsStr = "#192229910";
static const char* pmaxMappedPoolsSizeStr = "#3814409436";
static const char* pTmzEnabledStr = "#2606194033";
#if PAL_DEVELOPER_BUILD
static const char* pDbgHelperBitsStr = "#3894710420";
Expand Down
4 changes: 2 additions & 2 deletions src/core/hw/gfxip/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,7 @@ Result PipelineUploader::UploadUsingCpu(
const SectionAddressCalculator& addressCalc,
void** ppMappedPtr)
{
Result result = m_pGpuMemory->Map(&m_pMappedPtr);
Result result = m_pDevice->MemMgr()->Map(m_pGpuMemory, &m_pMappedPtr);
if (result == Result::Success)
{
m_pMappedPtr = VoidPtrInc(m_pMappedPtr, static_cast<size_t>(m_baseOffset));
Expand Down Expand Up @@ -1141,7 +1141,7 @@ Result PipelineUploader::End(
else
{
PAL_ASSERT(m_pMappedPtr != nullptr);
result = m_pGpuMemory->Unmap();
m_pDevice->MemMgr()->Unmap(m_pGpuMemory);
}

m_pMappedPtr = nullptr;
Expand Down
144 changes: 144 additions & 0 deletions src/core/internalMemMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ InternalMemMgr::InternalMemMgr(
:
m_pDevice(pDevice),
m_poolList(pDevice->GetPlatform()),
m_unusedMappedPoolList(pDevice->GetPlatform()),
m_references(pDevice->GetPlatform()),
m_referenceWatermark(0)
{
Expand All @@ -137,6 +138,24 @@ InternalMemMgr::InternalMemMgr(
// Explicitly frees all GPU memory allocations.
void InternalMemMgr::FreeAllocations()
{

for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
{
PAL_ASSERT((it.Get() != nullptr) && (it.Get()->pBuddyAllocator != nullptr));

if ((it.Get()->pData != nullptr) && (it.Get()->pGpuMemory != nullptr))
{
it.Get()->pGpuMemory->Unmap();
it.Get()->pData = nullptr;
}
}

while (m_unusedMappedPoolList.NumElements() > 0)
{
auto it = m_unusedMappedPoolList.Begin();
m_unusedMappedPoolList.Erase(&it);
}

// Delete the GPU memory objects using the references list
while (m_references.NumElements() != 0)
{
Expand Down Expand Up @@ -562,4 +581,129 @@ uint32 InternalMemMgr::GetReferencesCount()
return static_cast<uint32>(m_references.NumElements());
}

// =====================================================================================================================
// Map the GPU memory allocation for cpu access
// =====================================================================================================================
// Maps the given GPU memory allocation for CPU access.
//
// For buddy-allocated (pooled) memory the CPU mapping is cached on the owning GpuMemoryPool and reference-counted, so
// repeated Map/Unmap cycles (e.g. during pipeline upload) can reuse a single underlying mapping instead of paying for
// an OS-level map/unmap each time. Non-pooled allocations are simply mapped directly.
//
// NOTE(review): m_totalSizeMappedPools is accumulated here but does not appear in the constructor's initializer list
// in this change — confirm it is zero-initialized before first use.
Result InternalMemMgr::Map(
    GpuMemory* pGpuMemory,  // [in]  GPU memory object to map; must not be null.
    void**     ppData)      // [out] Receives the CPU-visible address (pool base for buddy-allocated memory).
{
    PAL_ASSERT(pGpuMemory != nullptr);
    Result result = Result::ErrorInvalidValue;
    if (pGpuMemory->WasBuddyAllocated())
    {
        Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock
        // Try to find the allocation in the pool list
        for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
        {
            GpuMemoryPool* pPool = it.Get();

            PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr));

            if (pPool->pGpuMemory == pGpuMemory)
            {
                if (pPool->pData == nullptr)
                {
                    // No cached mapping yet: map the whole pool and charge it against the cache budget.
                    result = pPool->pGpuMemory->Map(&pPool->pData);
                    if (result != Result::Success)
                    {
                        // Leave the pool unmapped; result keeps the failure code and the assert below will fire.
                        pPool->pData = nullptr;
                        break;
                    }
                    m_totalSizeMappedPools += pPool->pGpuMemory->Desc().size;
                    // Evict unused cached mappings if the total mapped size now exceeds the setting's limit.
                    // This pool cannot be evicted here: its refCount is still 0 but it is not on the unused list.
                    CheckMappedPoolLimit();
                }
                else if (pPool->refCount == 0)
                {
                    // The mapping was cached but unused; it is becoming referenced again, so it
                    // should be in unused list, remove it from there.
                    for (auto it2 = m_unusedMappedPoolList.Begin(); it2.Get() != nullptr; it2.Next())
                    {
                        if (*(it2.Get()) == pPool)
                        {
                            m_unusedMappedPoolList.Erase(&it2);
                            break;
                        }
                    }
                }
                pPool->refCount++;
                // This returns the pool's base address; the caller applies its own suballocation offset
                // (see PipelineUploader::UploadUsingCpu, which advances by m_baseOffset).
                *ppData = pPool->pData;
                result = Result::Success;
                break;
            }
        }

        // If we didn't find the allocation in the pool list then something went wrong with the allocation scheme
        PAL_ASSERT(result == Result::Success);
    }
    else
    {
        // Standalone (non-pooled) allocation: no caching, map it directly.
        result = pGpuMemory->Map(ppData);
    }

    return result;
}

// =====================================================================================================================
// Unmap the GPU memory allocation from cpu address space
// =====================================================================================================================
// Unmaps the given GPU memory allocation from the CPU address space.
//
// For buddy-allocated (pooled) memory this only drops one reference on the pool's cached CPU mapping; the actual
// OS-level unmap is deferred to CheckMappedPoolLimit() so the mapping can be reused by a later Map() call.
// Non-pooled allocations are unmapped directly.
Result InternalMemMgr::Unmap(
    GpuMemory* pGpuMemory)  // [in] GPU memory object to unmap; must not be null.
{
    PAL_ASSERT(pGpuMemory != nullptr);
    if (pGpuMemory->WasBuddyAllocated())
    {
        Util::MutexAuto allocatorLock(&m_allocatorLock); // Ensure thread-safety using the lock
        // Try to find the allocation in the pool list
        for (auto it = m_poolList.Begin(); it.Get() != nullptr; it.Next())
        {
            GpuMemoryPool* pPool = it.Get();

            PAL_ASSERT((pPool->pGpuMemory != nullptr) && (pPool->pBuddyAllocator != nullptr));
            if (pPool->pGpuMemory == pGpuMemory)
            {
                // Unbalanced Map/Unmap would underflow the unsigned refCount (wrapping to a huge value that
                // keeps the pool mapped forever) and could push the pool onto the unused list twice.
                PAL_ASSERT((pPool->pData == nullptr) || (pPool->refCount > 0));
                if ((pPool->pData != nullptr) && (pPool->refCount > 0))
                {
                    pPool->refCount--;
                    if (pPool->refCount == 0)
                    {
                        // The mapping is now unused: make it eligible for eviction and trim the cache if the
                        // total mapped size exceeds the setting's limit.
                        m_unusedMappedPoolList.PushBack(pPool);
                        CheckMappedPoolLimit();
                    }
                }
                break;
            }
        }
    }
    else
    {
        // Standalone (non-pooled) allocation: no caching, unmap it directly.
        pGpuMemory->Unmap();
    }

    return Result::Success;
}

// =====================================================================================================================
// Check the total size of mapped pools, if it is greater than maximum limit then unmap the least recently used memory
void InternalMemMgr::CheckMappedPoolLimit()
{
if (m_pDevice->Settings().maxMappedPoolsSize >= 0)
{
while ((m_totalSizeMappedPools > m_pDevice->Settings().maxMappedPoolsSize)
&& (m_unusedMappedPoolList.NumElements() > 0))
{
auto it = m_unusedMappedPoolList.Begin();
GpuMemoryPool *pPool = *it.Get();

PAL_ASSERT(pPool->pBuddyAllocator != nullptr);
if ((pPool->pData != nullptr) && (pPool->pGpuMemory != nullptr))
{
pPool->pGpuMemory->Unmap();
pPool->pData = nullptr;
}
m_unusedMappedPoolList.Erase(&it);
PAL_ASSERT(m_totalSizeMappedPools >= pPool->pGpuMemory->Desc().size);
m_totalSizeMappedPools -= pPool->pGpuMemory->Desc().size;
}
}
}


} // Pal
20 changes: 20 additions & 0 deletions src/core/internalMemMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ struct GpuMemoryPool
uint64 pagingFenceVal; // Paging fence value

Util::BuddyAllocator<Platform>* pBuddyAllocator; // Buddy allocator used for the suballocation
void* pData; // address of the already existing mapping
size_t refCount; // refCount the number of memory allocations use this mapping
};

// =====================================================================================================================
Expand All @@ -77,6 +79,7 @@ class InternalMemMgr
typedef Util::ListIterator<GpuMemoryInfo, Platform> GpuMemoryListIterator;

typedef Util::List<GpuMemoryPool, Platform> GpuMemoryPoolList;
typedef Util::List<GpuMemoryPool*, Platform> GpuMemoryPoolRefList;

explicit InternalMemMgr(Device* pDevice);
~InternalMemMgr() { FreeAllocations(); }
Expand Down Expand Up @@ -115,6 +118,17 @@ class InternalMemMgr
// Number of all allocations in the reference list. Note that this function takes the reference list lock.
uint32 GetReferencesCount();

Result Map(
GpuMemory* pGpuMemory,
void** ppData);

Result Unmap(
GpuMemory* pGpuMemory);

// If the total size of the mapped pools exceeds the maximum limit then unmap the least recently used pool.
void CheckMappedPoolLimit();


private:
Result AllocateBaseGpuMem(
const GpuMemoryCreateInfo& createInfo,
Expand All @@ -133,6 +147,9 @@ class InternalMemMgr
// Maintain a list of GPU memory objects that are sub-allocated
GpuMemoryPoolList m_poolList;

// Maintain a list of GPU memory objects that are sub-allocated and mapped but unused
GpuMemoryPoolRefList m_unusedMappedPoolList;

// Maintain a list of internal GPU memory references
GpuMemoryList m_references;

Expand All @@ -142,6 +159,9 @@ class InternalMemMgr
// Ever-incrementing watermark to signal changes to the internal memory reference list
uint32 m_referenceWatermark;

// Total size of mapped pools
gpusize m_totalSizeMappedPools;

PAL_DISALLOW_COPY_AND_ASSIGN(InternalMemMgr);
PAL_DISALLOW_DEFAULT_CTOR(InternalMemMgr);
};
Expand Down
19 changes: 18 additions & 1 deletion src/core/settings_core.json
Original file line number Diff line number Diff line change
Expand Up @@ -1945,6 +1945,23 @@
"VariableName": "useFp16GenMips",
"Description": "If mipGenUseFastPath == true and this is true - use the fp16 single-pass GenMips compute pass."
},
{
"Name": "maxMappedPoolsSize",
"Tags": [
"Resource Settings",
"Performance"
],
"Defaults": {
"Default": 0
},
"Flags": {
"RereadSetting": true
},
"Scope": "PrivatePalKey",
"Type": "gpusize",
"VariableName": "maxMappedPoolsSize",
"Description": "If maxMappedPoolsSize > 0 the mapped gpu memory for pipeline creation will not be unmapped. If the total size of mapped pools grows greater than maxMappedPoolsSize, then the least recently used pools will be unmapped."
},
{
"Name": "TmzEnabled",
"Tags": [
Expand Down Expand Up @@ -2129,4 +2146,4 @@
"Description": "Maximum string length for a miscellaneous string setting"
}
]
}
}

0 comments on commit e129c53

Please sign in to comment.