Skip to content

Commit

Permalink
materialsystem: threaded optimizations, fix mat_queue_mode on some an…
Browse files Browse the repository at this point in the history
…droid devices
  • Loading branch information
nillerusr committed Jan 14, 2023
1 parent 3458c36 commit 3614a86
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 140 deletions.
45 changes: 11 additions & 34 deletions materialsystem/cmaterialsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ bool CMaterialSystem::AllowThreading( bool bAllow, int nServiceThread )

bool bOldAllow = m_bAllowQueuedRendering;

if ( GetCPUInformation()->m_nPhysicalProcessors >= 2 )
if ( GetCPUInformation()->m_nLogicalProcessors >= 2 )
{
m_bAllowQueuedRendering = bAllow;
bool bQueued = m_IdealThreadMode != MATERIAL_SINGLE_THREADED;
Expand Down Expand Up @@ -1806,11 +1806,7 @@ static ConVar mat_normalmaps( "mat_normalmaps", "0", FCVAR_CHEAT );
static ConVar mat_measurefillrate( "mat_measurefillrate", "0", FCVAR_CHEAT );
static ConVar mat_fillrate( "mat_fillrate", "0", FCVAR_CHEAT );
static ConVar mat_reversedepth( "mat_reversedepth", "0", FCVAR_CHEAT );
#ifdef DX_TO_GL_ABSTRACTION
static ConVar mat_bufferprimitives( "mat_bufferprimitives", "0" ); // I'm not seeing any benefit speed wise for buffered primitives on GLM/POSIX (checked via TF2 timedemo) - default to zero
#else
static ConVar mat_bufferprimitives( "mat_bufferprimitives", "1" );
#endif
static ConVar mat_drawflat( "mat_drawflat","0", FCVAR_CHEAT );
static ConVar mat_softwarelighting( "mat_softwarelighting", "0", FCVAR_ALLOWED_IN_COMPETITIVE );
static ConVar mat_proxy( "mat_proxy", "0", FCVAR_CHEAT, "", MatProxyCallback );
Expand Down Expand Up @@ -2780,8 +2776,8 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
{
// We need lower-case symbols for this to work
int nLen = Q_strlen( pMaterialName ) + 1;
char *pFixedNameTemp = (char*)malloc( nLen );
char *pTemp = (char*)malloc( nLen );
char *pFixedNameTemp = (char*)stackalloc( nLen );
char *pTemp = (char*)stackalloc( nLen );
Q_strncpy( pFixedNameTemp, pMaterialName, nLen );
Q_strlower( pFixedNameTemp );
#ifdef POSIX
Expand Down Expand Up @@ -2883,9 +2879,6 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
}
}

free(pTemp);
free(pFixedNameTemp);

return g_pErrorMaterial->GetRealTimeVersion();
}

Expand Down Expand Up @@ -3103,20 +3096,12 @@ void CMaterialSystem::ResetTempHWMemory( bool bExitingLevel )
//-----------------------------------------------------------------------------
void CMaterialSystem::CacheUsedMaterials( )
{
printf("Cache materials\n");

g_pShaderAPI->EvictManagedResources();
size_t count = 0;

for (MaterialHandle_t i = FirstMaterial(); i != InvalidMaterial(); i = NextMaterial(i) )
{
// Some (mac) drivers (amd) seem to keep extra resources around on uploads until the next frame swap. This
// injects pointless synthetic swaps (between already-static load frames)
if ( mat_texture_reload_frame_swap_workaround.GetBool() )
{
if ( count++ % 20 == 0 )
{
Flush(true);
SwapBuffers(); // Not the right thing to call
}
}
IMaterialInternal* pMat = GetMaterialInternal(i);
Assert( pMat->GetReferenceCount() >= 0 );
if( pMat->GetReferenceCount() > 0 )
Expand Down Expand Up @@ -3703,9 +3688,13 @@ void CMaterialSystem::EndFrame( void )
ThreadAcquire( true );
}

IThreadPool* pThreadPool = CreateMatQueueThreadPool();

if ( m_pActiveAsyncJob && !m_pActiveAsyncJob->IsFinished() )
{
m_pActiveAsyncJob->WaitForFinish();
m_pActiveAsyncJob->WaitForFinish(TT_INFINITE, pThreadPool);

// Sync with GPU if we had a job for it, even if it finished early on CPU!
if ( !IsPC() && g_config.ForceHWSync() )
{
g_pShaderAPI->ForceHardwareSync();
Expand All @@ -3730,7 +3719,6 @@ void CMaterialSystem::EndFrame( void )
}
}

IThreadPool *pThreadPool = CreateMatQueueThreadPool();
pThreadPool->AddJob( m_pActiveAsyncJob );
break;
}
Expand Down Expand Up @@ -4664,20 +4652,9 @@ void CMaterialSystem::BeginRenderTargetAllocation( void )

void CMaterialSystem::EndRenderTargetAllocation( void )
{
	// Sample the opt-in switch before touching any state. Any GPU newer than
	// 2005 doesn't need the workaround below, and it eats up ~40% of our level
	// load time!
	const bool bWantsRenderTargetsFirst = mat_requires_rt_alloc_first.GetBool();

	// Close out the allocation window.
	g_pShaderAPI->FlushBufferedPrimitives();
	m_bAllocatingRenderTargets = false;

	if ( IsPC() && bWantsRenderTargetsFirst )
	{
		if ( g_pShaderAPI->CanDownloadTextures() )
		{
			// Simulate an Alt-Tab...will cause RTs to be allocated first
			g_pShaderDevice->ReleaseResources();
			g_pShaderDevice->ReacquireResources();
		}
	}

	TextureManager()->CacheExternalStandardRenderTargets();
}

Expand Down
9 changes: 3 additions & 6 deletions materialsystem/cmatqueuedrendercontext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,14 +455,11 @@ class CMatQueuedMesh : public IMesh
}
else
{
ALIGN16 uint16 tempIndices[16];
static ALIGN16 uint16 tempIndices[256];

// original method
int i = 0;
if ( (size_t)desc.m_pIndices % 4 == 2 )
{
desc.m_pIndices[i] = pIndexData[i] + desc.m_nFirstVertex;
i++;
}

while ( i < nIndices )
{
int nToCopy = min( (int)ARRAYSIZE(tempIndices), nIndices - i );
Expand Down
15 changes: 1 addition & 14 deletions materialsystem/ctexture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2458,15 +2458,8 @@ bool CTexture::AsyncReadTextureFromFile( IVTFTexture* pVTFTexture, unsigned int
return false;
}

if ( V_strstr( GetName(), "c_sniperrifle_scope" ) )
{
int i = 0;
i = 3;
}


tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s - %s", __FUNCTION__, tmDynamicString( TELEMETRY_LEVEL0, pCacheFileName ) );

// OSX hackery
int nPreserveFlags = nAdditionalCreationFlags;
if ( m_nFlags & TEXTUREFLAGS_SRGB )
Expand Down Expand Up @@ -4189,12 +4182,6 @@ bool SLoadTextureBitsFromFile( IVTFTexture **ppOutVtfTexture, FileHandle_t hFile
// NOTE! NOTE! NOTE! or by the streaming texture code!
Assert( ppOutVtfTexture != NULL && *ppOutVtfTexture != NULL );

if ( V_strstr( pName, "c_rocketlauncher/c_rocketlauncher" ) )
{
int i = 0;
i = 3;
}

CUtlBuffer buf;

{
Expand Down
40 changes: 35 additions & 5 deletions public/tier0/threadtools.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
#pragma once
#pragma warning(push)
#pragma warning(disable:4251)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

#ifdef COMPILER_MSVC64
Expand Down Expand Up @@ -194,8 +196,6 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t );

//-----------------------------------------------------------------------------

PLATFORM_INTERFACE void ThreadSleep(unsigned duration = 0);
PLATFORM_INTERFACE void ThreadNanoSleep(unsigned ns);
PLATFORM_INTERFACE ThreadId_t ThreadGetCurrentId();
PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle();
PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL );
Expand Down Expand Up @@ -229,10 +229,10 @@ inline void ThreadPause()
{
#if defined( COMPILER_PS3 )
__db16cyc();
#elif defined(__arm__) || defined(__aarch64__)
sched_yield();
#elif defined( COMPILER_GCC )
#elif defined( COMPILER_GCC ) && (defined( __i386__ ) || defined( __x86_64__ ))
__asm __volatile( "pause" );
#elif defined( POSIX )
sched_yield();
#elif defined ( COMPILER_MSVC64 )
_mm_pause();
#elif defined( COMPILER_MSVC32 )
Expand All @@ -247,6 +247,36 @@ inline void ThreadPause()
#endif
}

// Suspends the calling thread for nMilliseconds milliseconds using the
// platform's native sleep call.
//
// A duration of 0 never enters the OS sleep call: it issues a single
// ThreadPause() and returns immediately.
inline void ThreadSleep(unsigned nMilliseconds = 0)
{
	if( nMilliseconds == 0 )
	{
		ThreadPause();
		return;
	}

#ifdef _WIN32

#ifdef _WIN32_PC
	static bool bInitialized = false;
	if ( !bInitialized )
	{
		bInitialized = true;
		// Set the timer resolution to 1 ms (default is 10.0, 15.6, 2.5, 1.0 or
		// some other value depending on hardware and software) so that we can
		// use Sleep( 1 ) to avoid wasting CPU time without missing our frame
		// rate.
		timeBeginPeriod( 1 );
	}
#endif
	Sleep( nMilliseconds );
#elif PS3
	// NOTE(review): the ms -> us product below is still computed in 32-bit
	// unsigned arithmetic and wraps for very long waits - confirm whether that
	// matters on this platform.
	sys_timer_usleep( nMilliseconds * 1000 );
#elif defined(POSIX)
	// nMilliseconds * 1000 is evaluated as a 32-bit unsigned and wraps for
	// waits longer than ~71 minutes (TT_INFINITE included), and POSIX only
	// requires usleep() to accept intervals below one second. Feed it
	// sub-second slices so long waits sleep for the requested time.
	const unsigned nMaxSliceMs = 999;
	while ( nMilliseconds > nMaxSliceMs )
	{
		usleep( nMaxSliceMs * 1000 );
		nMilliseconds -= nMaxSliceMs;
	}
	usleep( nMilliseconds * 1000 );
#endif
}

PLATFORM_INTERFACE bool ThreadJoin( ThreadHandle_t, unsigned timeout = TT_INFINITE );

PLATFORM_INTERFACE void ThreadSetDebugName( ThreadHandle_t hThread, const char *pszName );
Expand Down
16 changes: 5 additions & 11 deletions public/tier1/memhelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,15 @@ namespace memutils
// Copies n elements from src to dest, front to back, using T's assignment
// operator. The count is tested before every element is touched, so n == 0 is
// a no-op (a decrement-then-test loop would wrap size_t and run off both
// buffers in that case).
template<typename T>
inline void copy( T *dest, const T *src, size_t n )
{
	for(; n; n--)
		*(dest++) = *(src++);
}

// Assigns value to each of the n elements starting at dest, front to back.
// value is taken by const reference so large T is not copied on the call.
// The count is tested before every write, so n == 0 leaves dest untouched.
template<typename T>
inline void set( T *dest, const T& value, size_t n )
{
	for(; n; n--)
		*(dest++) = value;
}
}

Expand Down
4 changes: 2 additions & 2 deletions public/vstdlib/jobthread.h
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,8 @@ class CJob : public CRefCounted1<IRefCounted, CRefCountServiceMT>
//-----------------------------------------------------
// Thread event support (safe for NULL this to simplify code )
//-----------------------------------------------------
bool WaitForFinish( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; return ( !IsFinished() ) ? g_pThreadPool->YieldWait( this, dwTimeout ) : true; }
bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }
inline bool WaitForFinish( uint32 dwTimeout = TT_INFINITE, IThreadPool *pool = g_pThreadPool ) { if (!this) return true; return ( !IsFinished() ) ? pool->YieldWait( this, dwTimeout ) : true; }
inline bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }
CThreadEvent *AccessEvent() { return &m_CompleteEvent; }

//-----------------------------------------------------
Expand Down
53 changes: 0 additions & 53 deletions tier0/threadtools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,59 +485,6 @@ bool ReleaseThreadHandle( ThreadHandle_t hThread )
//
//-----------------------------------------------------------------------------

// Puts the calling thread to sleep for nMilliseconds milliseconds through the
// platform's native call. On builds where none of the platform macros below is
// set, the function does nothing.
void ThreadSleep(unsigned nMilliseconds)
{
#if defined( _WIN32 )

#if defined( _WIN32_PC )
	// First call only: set the timer resolution to 1 ms (default is 10.0, 15.6,
	// 2.5, 1.0 or some other value depending on hardware and software) so that
	// we can use Sleep( 1 ) to avoid wasting CPU time without missing our frame
	// rate.
	static bool s_bTimerPeriodRequested = false;
	if ( s_bTimerPeriodRequested == false )
	{
		s_bTimerPeriodRequested = true;
		timeBeginPeriod( 1 );
	}
#endif

	Sleep( nMilliseconds );

#elif PS3
	if ( nMilliseconds != 0 )
	{
		sys_timer_usleep( nMilliseconds * 1000 );
	}
	else
	{
		// A zero-length sleep yields the PPU thread. An earlier note here said
		// sys_ppu_thread_yield doesn't seem to function properly and suggested
		// sleeping instead (sys_timer_usleep( 60 )), but the yield is what is
		// actually compiled in.
		sys_ppu_thread_yield();
	}
#elif defined(POSIX)
	usleep( nMilliseconds * 1000 );
#endif
}

//-----------------------------------------------------------------------------
// Sleeps the calling thread for ns nanoseconds, rounded up to the granularity
// of the platform call (milliseconds for Sleep, microseconds for
// sys_timer_usleep, nanoseconds for nanosleep).
void ThreadNanoSleep(unsigned ns)
{
#ifdef _WIN32
	// Sleep() takes milliseconds: convert ns -> ms, rounding up (ceil) without
	// risking overflow in the addition.
	Sleep( ns / 1000000 + ( ( ns % 1000000 ) ? 1 : 0 ) );
#elif PS3
	// sys_timer_usleep() takes microseconds: convert ns -> us, rounding up.
	sys_timer_usleep( ns / 1000 + ( ( ns % 1000 ) ? 1 : 0 ) );
#elif defined(POSIX)
	// nanosleep() rejects tv_nsec values of one second or more, so carry whole
	// seconds into tv_sec.
	struct timespec tm;
	tm.tv_sec = ns / 1000000000;
	tm.tv_nsec = ns % 1000000000;
	nanosleep( &tm, NULL );
#endif
}


//-----------------------------------------------------------------------------

#ifndef ThreadGetCurrentId
ThreadId_t ThreadGetCurrentId()
{
Expand Down
20 changes: 5 additions & 15 deletions vstdlib/jobthread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,11 @@ class CThreadPool : public CRefCounted1<IThreadPool, CRefCountServiceMT>
//-----------------------------------------------------
virtual int YieldWait( CThreadEvent **pEvents, int nEvents, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
virtual int YieldWait( CJob **, int nJobs, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
// Sleeps the calling thread by handing `timeout` straight to ThreadSleep().
// Asserts that the caller is the main thread.
inline void Yield( unsigned timeout )
{
	Assert( ThreadInMainThread() );
	ThreadSleep( timeout );
}

//-----------------------------------------------------
// Add a native job to the queue (master thread)
Expand Down Expand Up @@ -656,20 +660,6 @@ int CThreadPool::YieldWait( CJob **ppJobs, int nJobs, bool bWaitAll, unsigned ti
return YieldWait( handles.Base(), handles.Count(), bWaitAll, timeout);
}

//---------------------------------------------------------

// Sleeps the calling thread by handing `timeout` straight to ThreadSleep().
// The call is expected to come from the main thread (asserted), but the sleep
// happens regardless of which thread calls.
void CThreadPool::Yield( unsigned timeout )
{
	// @MULTICORE (toml 10/24/2006): not implemented
	Assert( ThreadInMainThread() );
	ThreadSleep( timeout );
}

//---------------------------------------------------------
// Add a job to the queue
//---------------------------------------------------------
Expand Down

0 comments on commit 3614a86

Please sign in to comment.