diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 058f1245a3a82..e0ea942ea552d 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -137,7 +137,7 @@ variables in production code.
 | Environment variable | Values | Description |
 | -------------------- | ------ | ----------- |
 | `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
-| `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | MaxPoolableSize,Capacity,MaxPoolSize | Values specified as positive integers. Defaults are 1, 4, 256. MaxPoolableSize is the maximum allocation size in MB that may be pooled. Capacity is the number of allocations in each size range that are freed by the program but retained in the pool for reallocation. Size ranges follow this pattern: 32, 48, 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. MaxPoolSize is the maximum size of the pool in MB. |
+| `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | EnableBuffers, MaxPoolSize [, MemType, MaxPoolableSize, Capacity, SlabMinSize]... | EnableBuffers enables pooling for SYCL buffers, default false. MaxPoolSize is the maximum size of the pool, default 0. MemType is host, device or shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for host and shared, 32KB for device. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 0. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared. |
 | `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
 | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 1. |
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 33b07c3ad5702..3229cfb4423b5 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -3032,6 +3032,74 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
   return PI_SUCCESS;
 }
 
+// If indirect access tracking is enabled then performs reference counting,
+// otherwise just calls zeMemAllocDevice.
+static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context, + pi_device Device, size_t Size) { + pi_platform Plt = Device->Platform; + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + // Lock the mutex which is guarding contexts container in the platform. + // This prevents new kernels from being submitted in any context while + // we are in the process of allocating a memory, this is needed to + // properly capture allocations by kernels with indirect access. + ContextsLock.lock(); + // We are going to defer memory release if there are kernels with + // indirect access, that is why explicitly retain context to be sure + // that it is released after all memory allocations in this context are + // released. + PI_CALL(piContextRetain(Context)); + } + + ze_device_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + ZE_CALL(zeMemAllocDevice, + (Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr)); + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*ResultPtr), + std::forward_as_tuple(Context)); + } + return PI_SUCCESS; +} + +// If indirect access tracking is enabled then performs reference counting, +// otherwise just calls zeMemAllocHost. +static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context, + size_t Size) { + pi_platform Plt = Context->Devices[0]->Platform; + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + // Lock the mutex which is guarding contexts container in the platform. + // This prevents new kernels from being submitted in any context while + // we are in the process of allocating a memory, this is needed to + // properly capture allocations by kernels with indirect access. + ContextsLock.lock(); + // We are going to defer memory release if there are kernels with + // indirect access, that is why explicitly retain context to be sure + // that it is released after all memory allocations in this context are + // released. 
+ PI_CALL(piContextRetain(Context)); + } + + ze_host_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = 0; + ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr)); + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*ResultPtr), + std::forward_as_tuple(Context)); + } + return PI_SUCCESS; +} + pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *properties) { @@ -3091,12 +3159,20 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, pi_result Result; if (DeviceIsIntegrated) { - Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment); + if (enableBufferPooling()) + Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment); + else { + ZeHostMemAllocHelper(&Ptr, Context, Size); + } } else if (Context->SingleRootDevice) { // If we have a single discrete device or all devices in the context are // sub-devices of the same device then we can allocate on device - Result = piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice, - nullptr, Size, Alignment); + if (enableBufferPooling()) + Result = piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice, + nullptr, Size, Alignment); + else { + ZeDeviceMemAllocHelper(&Ptr, Context, Context->SingleRootDevice, Size); + } } else { // Context with several gpu cards. Temporarily use host allocation because // it is accessible by all devices. But it is not good in terms of @@ -3104,10 +3180,14 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, // TODO: We need to either allow remote access to device memory using IPC, // or do explicit memory transfers from one device to another using host // resources as backing buffers to allow those transfers. - Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment); + if (enableBufferPooling()) + Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment); + else { + ZeHostMemAllocHelper(&Ptr, Context, Size); + } } - if (Result != PI_SUCCESS) + if (enableBufferPooling() && Result != PI_SUCCESS) return Result; if (HostPtr) { @@ -3170,6 +3250,37 @@ pi_result piMemRetain(pi_mem Mem) { return PI_SUCCESS; } +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr) { + pi_platform Plt = Context->Devices[0]->Platform; + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + ContextsLock.lock(); + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (--(It->second.RefCount) != 0) { + // Memory can't be deallocated yet. + return PI_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. 
+ Context->MemAllocs.erase(It); + } + + ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); + + if (IndirectAccessTrackingEnabled) + PI_CALL(ContextReleaseHelper(Context)); + + return PI_SUCCESS; +} + pi_result piMemRelease(pi_mem Mem) { PI_ASSERT(Mem, PI_INVALID_MEM_OBJECT); @@ -3179,7 +3290,11 @@ pi_result piMemRelease(pi_mem Mem) { } else { auto Buf = static_cast<_pi_buffer *>(Mem); if (!Buf->isSubBuffer()) { - PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle())); + if (enableBufferPooling()) { + PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle())); + } else { + ZeMemFreeHelper(Mem->Context, Mem->getZeHandle()); + } } } delete Mem; @@ -4998,13 +5113,7 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) { if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP && Event->CommandData) { // Free the memory allocated in the piEnqueueMemBufferMap. - // TODO: always use piextUSMFree - if (IndirectAccessTrackingEnabled) { - // Use the version with reference counting - PI_CALL(piextUSMFree(Event->Context, Event->CommandData)); - } else { - ZE_CALL(zeMemFree, (Event->Context->ZeContext, Event->CommandData)); - } + ZeMemFreeHelper(Event->Context, Event->CommandData); Event->CommandData = nullptr; } if (Event->OwnZeEvent) { @@ -5795,17 +5904,7 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, if (Buffer->MapHostPtr) { *RetMap = Buffer->MapHostPtr + Offset; } else { - // TODO: always use piextUSMHostAlloc - if (IndirectAccessTrackingEnabled) { - // Use the version with reference counting - PI_CALL(piextUSMHostAlloc(RetMap, Queue->Context, nullptr, Size, 1)); - } else { - ZeStruct ZeDesc; - ZeDesc.flags = 0; - - ZE_CALL(zeMemAllocHost, - (Queue->Context->ZeContext, &ZeDesc, Size, 1, RetMap)); - } + ZeHostMemAllocHelper(RetMap, Queue->Context, Size); } const auto &ZeCommandList = CommandList->first; const auto &WaitList = (*Event)->WaitList; @@ -6495,6 +6594,18 @@ pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); } +SystemMemory::MemType USMSharedMemoryAlloc::getMemTypeImpl() { + return SystemMemory::Shared; +} + +SystemMemory::MemType USMDeviceMemoryAlloc::getMemTypeImpl() { + return SystemMemory::Device; +} + +SystemMemory::MemType USMHostMemoryAlloc::getMemTypeImpl() { + return SystemMemory::Host; +} + void *USMMemoryAllocBase::allocate(size_t Size) { void *Ptr = nullptr; @@ -6523,6 +6634,10 @@ void USMMemoryAllocBase::deallocate(void *Ptr) { } } +SystemMemory::MemType USMMemoryAllocBase::getMemType() { + return getMemTypeImpl(); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index f0af289788e82..367d0486f943d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -265,6 +265,7 @@ class USMMemoryAllocBase : public SystemMemory { // type virtual pi_result allocateImpl(void **ResultPtr, size_t Size, pi_uint32 Alignment) = 0; + virtual MemType getMemTypeImpl() = 0; public: USMMemoryAllocBase(pi_context Ctx, pi_device Dev) @@ -272,6 +273,7 @@ class USMMemoryAllocBase : public SystemMemory { void *allocate(size_t Size) override final; void *allocate(size_t Size, size_t Alignment) override final; void deallocate(void *Ptr) override final; + MemType getMemType() override final; }; // Allocation routines for shared memory type @@ -279,6 
+281,7 @@ class USMSharedMemoryAlloc : public USMMemoryAllocBase { protected: pi_result allocateImpl(void **ResultPtr, size_t Size, pi_uint32 Alignment) override; + MemType getMemTypeImpl() override; public: USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev) @@ -290,6 +293,7 @@ class USMDeviceMemoryAlloc : public USMMemoryAllocBase { protected: pi_result allocateImpl(void **ResultPtr, size_t Size, pi_uint32 Alignment) override; + MemType getMemTypeImpl() override; public: USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev) @@ -301,6 +305,7 @@ class USMHostMemoryAlloc : public USMMemoryAllocBase { protected: pi_result allocateImpl(void **ResultPtr, size_t Size, pi_uint32 Alignment) override; + MemType getMemTypeImpl() override; public: USMHostMemoryAlloc(pi_context Ctx) : USMMemoryAllocBase(Ctx, nullptr) {} diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp index 2cfdcb87a4462..df95a9ba8023e 100644 --- a/sycl/plugins/level_zero/usm_allocator.cpp +++ b/sycl/plugins/level_zero/usm_allocator.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -24,15 +26,16 @@ #include #include -// USM allocations are a mimimum of 64KB in size even when a smaller size is +// USM allocations are a minimum of 4KB/64KB/2MB even when a smaller size is // requested. The implementation distinguishes between allocations of size -// ChunkCutOff (32KB) and those that are larger. +// ChunkCutOff = (minimum-alloc-size / 2) and those that are larger. // Allocation requests smaller than ChunkCutoff use chunks taken from a single -// 64KB USM allocation. Thus, for example, for 8-byte allocations, only 1 in -// ~8000 requests results in a new USM allocation. Freeing results only in a -// chunk of a larger 64KB allocation to be marked as available and no real -// return to the system. An allocation is returned to the system only when all -// chunks in a 64KB allocation are freed by the program. +// USM allocation. Thus, for example, for a 64KB minimum allocation size, +// and 8-byte allocations, only 1 in ~8000 requests results in a new +// USM allocation. Freeing results only in a chunk of a larger allocation +// to be marked as available and no real return to the system. +// An allocation is returned to the system only when all +// chunks in the larger allocation are freed by the program. // Allocations larger than ChunkCutOff use a separate USM allocation for each // request. These are subject to "pooling". That is, when such an allocation is // freed by the program it is retained in a pool. The pool is available for @@ -40,13 +43,6 @@ // allocations/deallocations. namespace settings { -// Minimum allocation size that will be requested from the system. -static constexpr size_t SlabMinSize = 64 * 1024; // 64KB - -// Allocations <= ChunkCutOff will use chunks from individual slabs. -// Allocations > ChunkCutOff will be rounded up to a multiple of -// SlabMinSize and allocated to occupy the whole slab. -static constexpr size_t ChunkCutOff = SlabMinSize / 2; // The largest size which is allocated via the allocator. // Allocations with size > CutOff bypass the USM allocator and // go directly to the runtime. @@ -59,15 +55,14 @@ using BucketsArrayType = std::array; // Generates a list of bucket sizes used by the allocator. static constexpr BucketsArrayType generateBucketSizes() { -// In order to make bucket sizes constexpr simply write -// them all. There are some restrictions that doesn't -// allow to write this in a nicer way. 
+ // In order to make bucket sizes constexpr simply write + // them all. There are some restrictions that doesn't + // allow to write this in a nicer way. -// Simple helper to compute power of 2 + // Simple helper to compute power of 2 #define P(n) (1ULL << n) - BucketsArrayType Sizes = {32, 48, - 64, 96, + BucketsArrayType Sizes = {64, 96, 128, 192, P(8), P(8) + P(7), P(9), P(9) + P(8), @@ -100,57 +95,223 @@ static constexpr BucketsArrayType generateBucketSizes() { static constexpr BucketsArrayType BucketSizes = generateBucketSizes(); -// The implementation expects that SlabMinSize is 2^n -static_assert((SlabMinSize & (SlabMinSize - 1)) == 0, - "SlabMinSize must be a power of 2"); - // Protects the capacity checking of the pool. static sycl::detail::SpinLock PoolLock; static class SetLimits { public: - size_t MaxPoolableSize = 1; - size_t Capacity = 4; - size_t MaxPoolSize = 256; + // Minimum allocation size that will be requested from the system. + // By default this is the minimum allocation size of each memory type. + // Memory types are host, device, shared. + size_t SlabMinSize[3] = {64 * 1024, 64 * 1024, 2 * 1024 * 1024}; + + // Allocations up to this limit will be subject to chunking/pooling + size_t MaxPoolableSize[3] = {0, 32 * 1024, 0}; + + // When pooling, each bucket will hold a max of 4 unfreed slabs + size_t Capacity[3] = {0, 0, 0}; + + // Maximum memory left unfreed in pool + size_t MaxPoolSize = 0; + size_t CurPoolSize = 0; + size_t CurPoolSizes[3] = {0, 0, 0}; + + bool EnableBuffers = false; + + // Whether to print pool usage statistics + int PoolTrace = 0; SetLimits() { - // Parse optional parameters of this form (applicable to each context): - // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=[][,[][,[]]] - // MaxPoolableSize: Maximum poolable allocation size, specified in MB. - // Default 1MB. - // Capacity: Number of pooled allocations in each bucket. - // Default 4. - // MaxPoolSize: Maximum size of pool, specified in MB. - // Default 256MB. + // Parse optional parameters of this form: + // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=[EnableBuffers][;MaxPoolSize][;memtypelimits]... + // memtypelimits: [:] + // memtype: host|device|shared + // limits: [MaxPoolableSize][,[Capacity][,SlabMinSize]] + // + // Without a memory type, the limits are applied to each memory type. + // Parameters are for each context, except MaxPoolSize, which is overall + // pool size for all contexts. + // Duplicate specifications will result in the right-most taking effect. + // + // Current defaults are to match pre-2021.3 pooling. + // EnableBuffers: Apply chunking/pooling to SYCL buffers. + // Default 0 (false). + // MaxPoolSize: Limit on overall unfreed memory. + // Default 0MB. + // MaxPoolableSize: Maximum allocation size subject to chunking/pooling. + // Default 32KB. + // Capacity: Maximum number of unfreed allocations in each bucket. + // Default 0. + // SlabMinSize: Minimum allocation size requested from USM. + // Default 64KB. 
+ // + // Example of usage: + // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1;32M;host:1M,4,64K;device:1M,4,64K;shared:0,0,2M + + auto GetValue = [](std::string &Param, size_t Length) { + size_t Multiplier = 1; + if (tolower(Param[Length - 1]) == 'k') { + Length--; + Multiplier = 1024; + } + if (tolower(Param[Length - 1]) == 'm') { + Length--; + Multiplier = 1024 * 1024; + } + if (tolower(Param[Length - 1]) == 'g') { + Length--; + Multiplier = 1024 * 1024 * 1024; + } + std::string TheNumber = Param.substr(0, Length); + assert(TheNumber.find_first_not_of("0123456789") == std::string::npos); + return std::stoi(TheNumber) * Multiplier; + }; + + auto ParamParser = [=](std::string &Params, size_t &Setting, + bool &ParamWasSet) { + bool More; + if (Params.size() == 0) { + ParamWasSet = false; + return false; + } + size_t Pos = Params.find(','); + if (Pos != std::string::npos) { + if (Pos > 0) { + Setting = GetValue(Params, Pos); + ParamWasSet = true; + } + Params.erase(0, Pos + 1); + More = true; + } else { + Setting = GetValue(Params, Params.size()); + ParamWasSet = true; + More = false; + } + return More; + }; + + auto MemParser = [=](std::string &Params, SystemMemory::MemType M) { + bool ParamWasSet; + SystemMemory::MemType LM = M; + if (M == SystemMemory::All) + LM = SystemMemory::Host; + + bool More = ParamParser(Params, MaxPoolableSize[LM], ParamWasSet); + if (ParamWasSet && M == SystemMemory::All) { + MaxPoolableSize[SystemMemory::Shared] = + MaxPoolableSize[SystemMemory::Device] = + MaxPoolableSize[SystemMemory::Host]; + } + if (More) { + More = ParamParser(Params, Capacity[LM], ParamWasSet); + if (ParamWasSet && M == SystemMemory::All) { + Capacity[SystemMemory::Shared] = Capacity[SystemMemory::Device] = + Capacity[SystemMemory::Host]; + } + } + if (More) { + ParamParser(Params, SlabMinSize[LM], ParamWasSet); + if (ParamWasSet && M == SystemMemory::All) { + SlabMinSize[SystemMemory::Shared] = + SlabMinSize[SystemMemory::Device] = + SlabMinSize[SystemMemory::Host]; + } + } + }; + + auto MemTypeParser = [=](std::string &Params) { + int Pos = 0; + SystemMemory::MemType M = SystemMemory::All; + if (Params.compare(0, 5, "host:") == 0) { + Pos = 5; + M = SystemMemory::Host; + } else if (Params.compare(0, 7, "device:") == 0) { + Pos = 7; + M = SystemMemory::Device; + } else if (Params.compare(0, 7, "shared:") == 0) { + Pos = 7; + M = SystemMemory::Shared; + } + if (Pos > 0) + Params.erase(0, Pos); + MemParser(Params, M); + }; + // Update pool settings if specified in environment. 
char *PoolParams = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR"); if (PoolParams != nullptr) { std::string Params(PoolParams); - size_t Pos = Params.find(','); + size_t Pos = Params.find(';'); if (Pos != std::string::npos) { - if (Pos > 0) - MaxPoolableSize = std::stoi(Params.substr(0, Pos)); + if (Pos > 0) { + EnableBuffers = GetValue(Params, Pos); + } Params.erase(0, Pos + 1); - Pos = Params.find(','); + size_t Pos = Params.find(';'); if (Pos != std::string::npos) { - if (Pos > 0) - Capacity = std::stoi(Params.substr(0, Pos)); + if (Pos > 0) { + MaxPoolSize = GetValue(Params, Pos); + } Params.erase(0, Pos + 1); - if (Pos != std::string::npos) - MaxPoolSize = std::stoi(Params); + do { + size_t Pos = Params.find(';'); + if (Pos != std::string::npos) { + if (Pos > 0) { + std::string MemParams = Params.substr(0, Pos); + MemTypeParser(MemParams); + } + Params.erase(0, Pos + 1); + if (Params.size() == 0) + break; + } else { + MemTypeParser(Params); + break; + } + } while (true); } else { - Capacity = std::stoi(Params); + MaxPoolSize = GetValue(Params, Params.size()); } - } else - MaxPoolableSize = std::stoi(Params); + } else { + EnableBuffers = GetValue(Params, Params.size()); + } + } + + char *PoolTraceVal = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_TRACE"); + if (PoolTraceVal != nullptr) { + PoolTrace = std::atoi(PoolTraceVal); } - MaxPoolableSize *= (1 << 20); - MaxPoolSize *= (1 << 20); + if (PoolTrace < 1) + return; + + std::cout << "USM Pool Settings (Built-in or Adjusted by Environment " + "Variable)\n"; + + std::cout << std::setw(15) << "Parameter" << std::setw(12) << "Host" + << std::setw(12) << "Device" << std::setw(12) << "Shared" + << std::endl; + std::cout << std::setw(15) << "SlabMinSize" << std::setw(12) + << SlabMinSize[0] << std::setw(12) << SlabMinSize[1] + << std::setw(12) << SlabMinSize[2] << std::endl; + std::cout << std::setw(15) << "MaxPoolableSize" << std::setw(12) + << MaxPoolableSize[0] << std::setw(12) << MaxPoolableSize[1] + << std::setw(12) << MaxPoolableSize[2] << std::endl; + std::cout << std::setw(15) << "Capacity" << std::setw(12) << Capacity[0] + << std::setw(12) << Capacity[1] << std::setw(12) << Capacity[2] + << std::endl; + std::cout << std::setw(15) << "MaxPoolSize" << std::setw(12) << MaxPoolSize + << std::endl; + std::cout << std::setw(15) << "EnableBuffers" << std::setw(12) + << EnableBuffers << std::endl + << std::endl; } -} USMPoolSettings; +} USMSettings; } // namespace settings +using namespace settings; + +static const char *MemTypeNames[3] = {"Host", "Device", "Shared"}; + // Aligns the pointer down to the specified alignment // (e.g. returns 8 for Size = 13, Alignment = 8) static void *AlignPtrDown(void *Ptr, const size_t Alignment) { @@ -178,7 +339,7 @@ static size_t AlignUp(size_t Val, size_t Alignment) { class Bucket; -// Represents the allocated memory block of size 'settings::SlabMinSize' +// Represents the allocated memory block of size 'SlabMinSize' // Internally, it splits the memory block into chunks. The number of // chunks depends of the size of a Bucket which created the Slab. // The chunks @@ -213,8 +374,8 @@ class Slab { size_t FindFirstAvailableChunkIdx() const; // Register/Unregister the slab in the global slab address map. 
- static void regSlab(Slab &); - static void unregSlab(Slab &); + void regSlab(Slab &); + void unregSlab(Slab &); static void regSlabByAddr(void *, Slab &); static void unregSlabByAddr(void *, Slab &); @@ -234,9 +395,7 @@ class Slab { void *getSlab(); void *getPtr() const { return MemPtr; } - void *getEnd() const { - return static_cast(getPtr()) + settings::SlabMinSize; - } + void *getEnd() const; size_t getChunkSize() const; size_t getNumChunks() const { return Chunks.size(); } @@ -265,41 +424,79 @@ class Bucket { // routines, slab map and etc. USMAllocContext::USMAllocImpl &OwnAllocCtx; + // Statistics + size_t allocCount; + size_t allocPoolCount; + size_t freeCount; + size_t currSlabsInUse; + size_t currSlabsInPool; + size_t maxSlabsInUse; + size_t maxSlabsInPool; + public: Bucket(size_t Sz, USMAllocContext::USMAllocImpl &AllocCtx) - : Size{Sz}, OwnAllocCtx{AllocCtx} {} + : Size{Sz}, OwnAllocCtx{AllocCtx}, allocCount(0), allocPoolCount(0), + freeCount(0), currSlabsInUse(0), currSlabsInPool(0), maxSlabsInUse(0), + maxSlabsInPool(0) {} // Get pointer to allocation that is one piece of an available slab in this // bucket. - void *getChunk(); + void *getChunk(bool &FromAllocatedSlab); // Get pointer to allocation that is a full slab in this bucket. - void *getSlab(); + void *getSlab(bool &FromPool); size_t getSize() const { return Size; } // Free an allocation that is one piece of a slab in this bucket. - void freeChunk(void *Ptr, Slab &Slab); + void freeChunk(void *Ptr, Slab &Slab, bool &ToPool); // Free an allocation that is a full slab in this bucket. - void freeSlab(Slab &Slab); + void freeSlab(Slab &Slab, bool &ToPool); SystemMemory &getMemHandle(); + + SystemMemory::MemType getMemType(); + USMAllocContext::USMAllocImpl &getUsmAllocCtx() { return OwnAllocCtx; } // Check whether an allocation to be freed can be placed in the pool. bool CanPool(); + // The minimum allocation size for a slab in this bucket. + size_t SlabMinSize(); + + // The minimum size of a chunk from this bucket's slabs. + size_t ChunkCutOff(); + + // The number of slabs in this bucket that can be in the pool. + size_t Capacity(); + + // The maximum allocation size subject to pooling. + size_t MaxPoolableSize(); + + // Update allocation count + void countAlloc(bool FromPool); + + // Update free count + void countFree(); + + // Update statistics of Available/Unavailable + void updateStats(int InUse, int InPool); + + // Print bucket statistics + void printStats(); + private: - void onFreeChunk(Slab &); + void onFreeChunk(Slab &, bool &ToPool); // Get a slab to be used for chunked allocations. // These slabs are used for allocations <= ChunkCutOff and not pooled. - decltype(AvailableSlabs.begin()) getAvailSlab(); + decltype(AvailableSlabs.begin()) getAvailSlab(bool &FromAllocatedSlab); // Get a slab that will be used as a whole for a single allocation. // These slabs are > ChunkCutOff in size and pooled. 
- decltype(AvailableSlabs.begin()) getAvailFullSlab(); + decltype(AvailableSlabs.begin()) getAvailFullSlab(bool &FromPool); }; class USMAllocContext::USMAllocImpl { @@ -318,16 +515,16 @@ class USMAllocContext::USMAllocImpl { USMAllocImpl(std::unique_ptr SystemMemHandle) : MemHandle{std::move(SystemMemHandle)} { - Buckets.reserve(settings::BucketSizes.size()); + Buckets.reserve(BucketSizes.size()); - for (auto &&Size : settings::BucketSizes) { + for (auto &&Size : BucketSizes) { Buckets.emplace_back(std::make_unique(Size, *this)); } } - void *allocate(size_t Size, size_t Alignment); - void *allocate(size_t Size); - void deallocate(void *Ptr); + void *allocate(size_t Size, size_t Alignment, bool &FromPool); + void *allocate(size_t Size, bool &FromPool); + void deallocate(void *Ptr, bool &ToPool); SystemMemory &getMemHandle() { return *MemHandle; } @@ -336,6 +533,12 @@ class USMAllocContext::USMAllocImpl { return KnownSlabs; } + size_t SlabMinSize() { + return USMSettings.SlabMinSize[(*MemHandle).getMemType()]; + }; + + void printStats(); + private: Bucket &findBucket(size_t Size); }; @@ -353,11 +556,11 @@ std::ostream &operator<<(std::ostream &Os, const Slab &Slab) { Slab::Slab(Bucket &Bkt) : // In case bucket size is not a multiple of SlabMinSize, we would have // some padding at the end of the slab. - Chunks(settings::SlabMinSize / Bkt.getSize()), NumAllocated{0}, + Chunks(Bkt.SlabMinSize() / Bkt.getSize()), NumAllocated{0}, bucket(Bkt), SlabListIter{}, FirstFreeChunkIdx{0} { size_t SlabAllocSize = Bkt.getSize(); - if (SlabAllocSize < settings::SlabMinSize) - SlabAllocSize = settings::SlabMinSize; + if (SlabAllocSize < Bkt.SlabMinSize()) + SlabAllocSize = Bkt.SlabMinSize(); MemPtr = Bkt.getMemHandle().allocate(SlabAllocSize); regSlab(*this); } @@ -433,16 +636,16 @@ void Slab::unregSlabByAddr(void *Addr, Slab &Slab) { } void Slab::regSlab(Slab &Slab) { - void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabMinSize); - void *EndAddr = static_cast(StartAddr) + settings::SlabMinSize; + void *StartAddr = AlignPtrDown(Slab.getPtr(), bucket.SlabMinSize()); + void *EndAddr = static_cast(StartAddr) + bucket.SlabMinSize(); regSlabByAddr(StartAddr, Slab); regSlabByAddr(EndAddr, Slab); } void Slab::unregSlab(Slab &Slab) { - void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabMinSize); - void *EndAddr = static_cast(StartAddr) + settings::SlabMinSize; + void *StartAddr = AlignPtrDown(Slab.getPtr(), bucket.SlabMinSize()); + void *EndAddr = static_cast(StartAddr) + bucket.SlabMinSize(); unregSlabByAddr(StartAddr, Slab); unregSlabByAddr(EndAddr, Slab); @@ -470,27 +673,40 @@ void Slab::freeChunk(void *Ptr) { FirstFreeChunkIdx = ChunkIdx; } +void *Slab::getEnd() const { + return static_cast(getPtr()) + bucket.SlabMinSize(); +} + bool Slab::hasAvail() { return NumAllocated != getNumChunks(); } -auto Bucket::getAvailFullSlab() -> decltype(AvailableSlabs.begin()) { +auto Bucket::getAvailFullSlab(bool &FromPool) + -> decltype(AvailableSlabs.begin()) { // Return a slab that will be used for a single allocation. if (AvailableSlabs.size() == 0) { auto It = AvailableSlabs.insert(AvailableSlabs.begin(), std::make_unique(*this)); (*It)->setIterator(It); + FromPool = false; + if (USMSettings.PoolTrace > 1) + updateStats(1, 0); } else { // If a slab was available in the pool then note that the current pooled // size has reduced by the size of this slab. 
- settings::USMPoolSettings.CurPoolSize -= Size; + FromPool = true; + if (USMSettings.PoolTrace > 1) { + updateStats(1, -1); + USMSettings.CurPoolSizes[getMemType()] -= Size; + } + USMSettings.CurPoolSize -= Size; } return AvailableSlabs.begin(); } -void *Bucket::getSlab() { +void *Bucket::getSlab(bool &FromPool) { std::lock_guard Lg(BucketLock); - auto SlabIt = getAvailFullSlab(); + auto SlabIt = getAvailFullSlab(FromPool); auto *FreeSlab = (*SlabIt)->getSlab(); auto It = UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt)); @@ -499,7 +715,7 @@ void *Bucket::getSlab() { return FreeSlab; } -void Bucket::freeSlab(Slab &Slab) { +void Bucket::freeSlab(Slab &Slab, bool &ToPool) { std::lock_guard Lg(BucketLock); auto SlabIter = Slab.getIterator(); assert(SlabIter != UnavailableSlabs.end()); @@ -508,25 +724,42 @@ void Bucket::freeSlab(Slab &Slab) { AvailableSlabs.insert(AvailableSlabs.begin(), std::move(*SlabIter)); UnavailableSlabs.erase(SlabIter); (*It)->setIterator(It); + + if (USMSettings.PoolTrace > 1) { + updateStats(-1, 1); + ToPool = true; + } } else { UnavailableSlabs.erase(SlabIter); + + if (USMSettings.PoolTrace > 1) { + updateStats(-1, 0); + ToPool = false; + } } } -auto Bucket::getAvailSlab() -> decltype(AvailableSlabs.begin()) { +auto Bucket::getAvailSlab(bool &FromAllocatedSlab) + -> decltype(AvailableSlabs.begin()) { + + FromAllocatedSlab = true; if (AvailableSlabs.size() == 0) { auto It = AvailableSlabs.insert(AvailableSlabs.begin(), std::make_unique(*this)); (*It)->setIterator(It); + + if (USMSettings.PoolTrace > 1) + updateStats(1, 0); + FromAllocatedSlab = false; } return AvailableSlabs.begin(); } -void *Bucket::getChunk() { +void *Bucket::getChunk(bool &FromAllocatedSlab) { std::lock_guard Lg(BucketLock); - auto SlabIt = getAvailSlab(); + auto SlabIt = getAvailSlab(FromAllocatedSlab); auto *FreeChunk = (*SlabIt)->getChunk(); // If the slab is full, move it to unavailable slabs and update its iterator @@ -540,16 +773,18 @@ void *Bucket::getChunk() { return FreeChunk; } -void Bucket::freeChunk(void *Ptr, Slab &Slab) { +void Bucket::freeChunk(void *Ptr, Slab &Slab, bool &ToPool) { std::lock_guard Lg(BucketLock); Slab.freeChunk(Ptr); - onFreeChunk(Slab); + onFreeChunk(Slab, ToPool); } // The lock must be acquired before calling this method -void Bucket::onFreeChunk(Slab &Slab) { +void Bucket::onFreeChunk(Slab &Slab, bool &ToPool) { + ToPool = true; + // In case if the slab was previously full and now has 1 available // chunk, it should be moved to the list of available slabs if (Slab.getNumAllocated() == (Slab.getNumChunks() - 1)) { @@ -563,29 +798,30 @@ void Bucket::onFreeChunk(Slab &Slab) { (*It)->setIterator(It); } - // If slab has no chunks allocated we could pool it if capacity is available - // or release it to the system. + // Remove the slab when all the chunks from it are deallocated + // Note: since the slab is stored as unique_ptr, just remove it from + // the list to remove the list to destroy the object if (Slab.getNumAllocated() == 0) { - // Pool has no space so release it. 
- if (!CanPool()) { - // Remove the slab when all the chunks from it are deallocated - // Note: since the slab is stored as unique_ptr, just remove it from - // the list to remove the list to destroy the object - auto It = Slab.getIterator(); - assert(It != AvailableSlabs.end()); - - AvailableSlabs.erase(It); - } + auto It = Slab.getIterator(); + assert(It != AvailableSlabs.end()); + + AvailableSlabs.erase(It); + + if (USMSettings.PoolTrace > 1) + updateStats(-1, 0); + + ToPool = false; } } bool Bucket::CanPool() { - std::lock_guard Lock{settings::PoolLock}; + std::lock_guard Lock{PoolLock}; size_t NewFreeSlabsInBucket = AvailableSlabs.size() + 1; - if (settings::USMPoolSettings.Capacity >= NewFreeSlabsInBucket) { - size_t NewPoolSize = settings::USMPoolSettings.CurPoolSize + Size; - if (settings::USMPoolSettings.MaxPoolSize >= NewPoolSize) { - settings::USMPoolSettings.CurPoolSize = NewPoolSize; + if (Capacity() >= NewFreeSlabsInBucket) { + size_t NewPoolSize = USMSettings.CurPoolSize + Size; + if (USMSettings.MaxPoolSize >= NewPoolSize) { + USMSettings.CurPoolSize = NewPoolSize; + USMSettings.CurPoolSizes[getMemType()] += Size; return true; } } @@ -594,49 +830,105 @@ bool Bucket::CanPool() { SystemMemory &Bucket::getMemHandle() { return OwnAllocCtx.getMemHandle(); } -void *USMAllocContext::USMAllocImpl::allocate(size_t Size) { +SystemMemory::MemType Bucket::getMemType() { + return getMemHandle().getMemType(); +} + +size_t Bucket::SlabMinSize() { return USMSettings.SlabMinSize[getMemType()]; } + +size_t Bucket::Capacity() { return USMSettings.Capacity[getMemType()]; } + +size_t Bucket::MaxPoolableSize() { + return USMSettings.MaxPoolableSize[getMemType()]; +} + +size_t Bucket::ChunkCutOff() { return SlabMinSize() / 2; } + +void Bucket::countAlloc(bool FromPool) { + ++allocCount; + if (FromPool) + ++allocPoolCount; +} + +void Bucket::countFree() { ++freeCount; } + +void Bucket::updateStats(int InUse, int InPool) { + currSlabsInUse += InUse; + maxSlabsInUse = std::max(currSlabsInUse, maxSlabsInUse); + currSlabsInPool += InPool; + maxSlabsInPool = std::max(currSlabsInPool, maxSlabsInPool); +} + +void Bucket::printStats() { + if (allocCount) { + std::cout << std::setw(14) << getSize() << std::setw(12) << allocCount + << std::setw(12) << freeCount << std::setw(18) << allocPoolCount + << std::setw(20) << maxSlabsInUse << std::setw(21) + << maxSlabsInPool << std::endl; + } +} + +// SystemMemory &Bucket::getMemHandle() { return OwnAllocCtx.getMemHandle(); } + +void *USMAllocContext::USMAllocImpl::allocate(size_t Size, bool &FromPool) { + void *Ptr; + if (Size == 0) return nullptr; - if (Size > settings::USMPoolSettings.MaxPoolableSize) { + FromPool = false; + if (Size > USMSettings.MaxPoolableSize[getMemHandle().getMemType()]) { return getMemHandle().allocate(Size); } auto &Bucket = findBucket(Size); - if (Size > settings::ChunkCutOff) { - return Bucket.getSlab(); - } - return Bucket.getChunk(); + if (Size > Bucket.ChunkCutOff()) + Ptr = Bucket.getSlab(FromPool); + else + Ptr = Bucket.getChunk(FromPool); + + if (USMSettings.PoolTrace > 1) + Bucket.countAlloc(FromPool); + + return Ptr; } -void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment) { +void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment, + bool &FromPool) { + void *Ptr; + if (Size == 0) return nullptr; if (Alignment <= 1) - return allocate(Size); + return allocate(Size, FromPool); size_t AlignedSize = (Size > 1) ? 
AlignUp(Size, Alignment) : Alignment; // Check if requested allocation size is within pooling limit. // If not, just request aligned pointer from the system. - if (AlignedSize > settings::USMPoolSettings.MaxPoolableSize) { + FromPool = false; + if (AlignedSize > USMSettings.MaxPoolableSize[getMemHandle().getMemType()]) { return getMemHandle().allocate(Size, Alignment); } - void *Ptr; auto &Bucket = findBucket(AlignedSize); - if (AlignedSize > settings::ChunkCutOff) { - Ptr = Bucket.getSlab(); + + if (AlignedSize > Bucket.ChunkCutOff()) { + Ptr = Bucket.getSlab(FromPool); } else { - Ptr = Bucket.getChunk(); + Ptr = Bucket.getChunk(FromPool); } + + if (USMSettings.PoolTrace > 1) + Bucket.countAlloc(FromPool); + return AlignPtrUp(Ptr, Alignment); } Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) { - assert(Size <= settings::CutOff && "Unexpected size"); + assert(Size <= CutOff && "Unexpected size"); auto It = std::find_if( Buckets.begin(), Buckets.end(), @@ -647,12 +939,13 @@ Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) { return *(*It); } -void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) { - auto *SlabPtr = AlignPtrDown(Ptr, settings::SlabMinSize); +void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool) { + auto *SlabPtr = AlignPtrDown(Ptr, SlabMinSize()); // Lock the map on read std::shared_lock Lk(getKnownSlabsMapLock()); + ToPool = false; auto Slabs = getKnownSlabs().equal_range(SlabPtr); if (Slabs.first == Slabs.second) { Lk.unlock(); @@ -669,11 +962,16 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) { // there Lk.unlock(); auto &Bucket = Slab.getBucket(); - if (Bucket.getSize() <= settings::ChunkCutOff) { - Bucket.freeChunk(Ptr, Slab); + + if (USMSettings.PoolTrace > 1) + Bucket.countFree(); + + if (Bucket.getSize() <= Bucket.ChunkCutOff()) { + Bucket.freeChunk(Ptr, Slab, ToPool); } else { - Bucket.freeSlab(Slab); + Bucket.freeSlab(Slab, ToPool); } + return; } } @@ -688,13 +986,70 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) { USMAllocContext::USMAllocContext(std::unique_ptr MemHandle) : pImpl(std::make_unique(std::move(MemHandle))) {} -void *USMAllocContext::allocate(size_t size) { return pImpl->allocate(size); } +void *USMAllocContext::allocate(size_t size) { + bool FromPool; + auto Ptr = pImpl->allocate(size, FromPool); + + if (USMSettings.PoolTrace > 2) { + auto MT = pImpl->getMemHandle().getMemType(); + std::cout << "Allocated " << std::setw(8) << size << " " << MemTypeNames[MT] + << " USM bytes from " << (FromPool ? "Pool" : "USM") << " ->" + << Ptr << std::endl; + } + return Ptr; +} void *USMAllocContext::allocate(size_t size, size_t alignment) { - return pImpl->allocate(size, alignment); + bool FromPool; + auto Ptr = pImpl->allocate(size, alignment, FromPool); + + if (USMSettings.PoolTrace > 2) { + auto MT = pImpl->getMemHandle().getMemType(); + std::cout << "Allocated " << std::setw(8) << size << " " << MemTypeNames[MT] + << " USM bytes aligned at " << alignment << " from " + << (FromPool ? "Pool" : "USM") << " ->" << Ptr << std::endl; + } + return Ptr; } -void USMAllocContext::deallocate(void *ptr) { return pImpl->deallocate(ptr); } +void USMAllocContext::deallocate(void *ptr) { + bool ToPool; + pImpl->deallocate(ptr, ToPool); + + if (USMSettings.PoolTrace > 2) { + auto MT = pImpl->getMemHandle().getMemType(); + std::cout << "Freed " << MemTypeNames[MT] << " USM " << ptr << " to " + << (ToPool ? 
"Pool" : "USM") << ", Current total pool size " + << USMSettings.CurPoolSize << ", Current pool sizes [" + << USMSettings.CurPoolSizes[SystemMemory::Host] << ", " + << USMSettings.CurPoolSizes[SystemMemory::Device] << ", " + << USMSettings.CurPoolSizes[SystemMemory::Shared] << "]\n"; + } + return; +} // Define destructor for its usage with unique_ptr -USMAllocContext::~USMAllocContext() = default; +USMAllocContext::~USMAllocContext() { + if (USMSettings.PoolTrace > 1) { + auto Label = "Shared"; + if (pImpl->getMemHandle().getMemType() == SystemMemory::Host) + Label = "Host"; + if (pImpl->getMemHandle().getMemType() == SystemMemory::Device) + Label = "Device"; + std::cout << Label << " memory statistics\n"; + pImpl->printStats(); + std::cout << "Current Pool Size " << USMSettings.CurPoolSize << std::endl; + } +} + +void USMAllocContext::USMAllocImpl::printStats() { + std::cout << std::setw(14) << "Bucket Size" << std::setw(12) << "Allocs" + << std::setw(12) << "Frees" << std::setw(18) << "Allocs From Pool" + << std::setw(20) << "Peak Slabs In Use" << std::setw(21) + << "Peak Slabs in Pool" << std::endl; + for (auto &B : Buckets) { + (*B).printStats(); + } +} + +bool enableBufferPooling() { return USMSettings.EnableBuffers; } diff --git a/sycl/plugins/level_zero/usm_allocator.hpp b/sycl/plugins/level_zero/usm_allocator.hpp index b72ca77d41538..e47ea090724f6 100644 --- a/sycl/plugins/level_zero/usm_allocator.hpp +++ b/sycl/plugins/level_zero/usm_allocator.hpp @@ -14,9 +14,11 @@ // USM system memory allocation/deallocation interface. class SystemMemory { public: + enum MemType { Host, Device, Shared, All }; virtual void *allocate(size_t size) = 0; virtual void *allocate(size_t size, size_t aligned) = 0; virtual void deallocate(void *ptr) = 0; + virtual MemType getMemType() = 0; virtual ~SystemMemory() = default; }; @@ -36,4 +38,7 @@ class USMAllocContext { std::unique_ptr pImpl; }; +// Temporary interface to allow pooling to be reverted, i.e., no buffer support +bool enableBufferPooling(); + #endif