Skip to content

Commit 9a919f3

Browse files
AlnisMfacebook-github-bot
authored andcommitted
Add option to aggregate pool stats to reduce ODS counter inflation
Summary: Implements configurable aggregated pool stats for cachelib to reduce ODS counter inflation. Adds an `aggregatePoolStats` config option that combines all pool statistics into a single "aggregated" entry before export. Maintains backward compatibility (disabled by default) and includes comprehensive test coverage.
Key changes:
- Added `CacheAllocatorConfig::enableAggregatePoolStats()` configuration
- Implemented `updateAggregatedPoolStats()` method using `PoolStats::operator+=`
- Adapted `PoolStats::operator+=` to allow aggregation of PoolStats with different allocation classes
- Modified `exportStats()` to conditionally aggregate based on config
- Enhanced PoolStats aggregation for pool-level metrics
Reviewed By: rlyerly, pbhandar2
Differential Revision: D79768148
fbshipit-source-id: 29a9fb80b5c946be7da0d9b22ec3ee8b629380b3
1 parent 73bd8c0 commit 9a919f3

File tree

11 files changed

+708
-49
lines changed

11 files changed

+708
-49
lines changed

cachelib/allocator/Cache.cpp

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
#include "cachelib/allocator/Cache.h"
1818

19+
#include <folly/logging/xlog.h>
20+
1921
#include <mutex>
2022

2123
#include "cachelib/allocator/RebalanceStrategy.h"
@@ -89,6 +91,11 @@ void CacheBase::updateObjectCacheStats(const std::string& statPrefix) const {
8991
void CacheBase::updatePoolStats(const std::string& statPrefix,
9092
PoolId pid) const {
9193
const PoolStats stats = getPoolStats(pid);
94+
updatePoolStats(statPrefix, stats);
95+
}
96+
97+
void CacheBase::updatePoolStats(const std::string& statPrefix,
98+
const PoolStats& stats) const {
9299
const std::string prefix = statPrefix + "pool." + stats.poolName + ".";
93100

94101
counters_.updateCount(prefix + "size", stats.poolSize);
@@ -531,6 +538,12 @@ CacheBase::CacheHitRate CacheBase::calculateCacheHitRate(
531538
return {overall, ram, nvm};
532539
}
533540

541+
void CacheBase::updateIndividualPoolStats(const std::string& statPrefix) const {
542+
for (const auto pid : getRegularPoolIds()) {
543+
updatePoolStats(statPrefix, pid);
544+
}
545+
}
546+
534547
void CacheBase::exportStats(
535548
const std::string& statPrefix,
536549
std::chrono::seconds aggregationInterval,
@@ -539,8 +552,15 @@ void CacheBase::exportStats(
539552
updateNvmCacheStats(statPrefix);
540553
updateEventTrackerStats(statPrefix);
541554

542-
for (const auto pid : getRegularPoolIds()) {
543-
updatePoolStats(statPrefix, pid);
555+
if (aggregatePoolStats_ && canAggregatePoolStats()) {
556+
updateAggregatedPoolStats(statPrefix);
557+
} else {
558+
// Log warning when aggregation is enabled but not possible
559+
if (aggregatePoolStats_) {
560+
XLOG(WARN) << "Pool stats aggregation is enabled but cannot be performed "
561+
"due to too many allocation classes";
562+
}
563+
updateIndividualPoolStats(statPrefix);
544564
}
545565

546566
for (const auto pid : getCCachePoolIds()) {
@@ -556,4 +576,37 @@ void CacheBase::exportStats(
556576

557577
return counters_.exportStats(aggregationInterval, cb);
558578
}
579+
580+
bool CacheBase::canAggregatePoolStats() const {
581+
const auto poolIds = getRegularPoolIds();
582+
XDCHECK(!poolIds.empty(), "Regular pool IDs should not be empty");
583+
584+
// Collect all unique allocation sizes from all pools
585+
std::unordered_set<uint32_t> allAllocSizes;
586+
for (const auto pid : poolIds) {
587+
const auto& pool = getPool(pid);
588+
for (const auto& allocSize : pool.getAllocSizes()) {
589+
allAllocSizes.insert(allocSize);
590+
}
591+
}
592+
return allAllocSizes.size() <= MemoryAllocator::kMaxClasses;
593+
}
594+
595+
void CacheBase::updateAggregatedPoolStats(const std::string& statPrefix) const {
596+
const auto poolIds = getRegularPoolIds();
597+
XDCHECK(!poolIds.empty(), "Regular pool IDs should not be empty");
598+
// Get the first pool stats to initialize the aggregated stats
599+
auto poolIdsIter = poolIds.begin();
600+
PoolStats aggregatedStats = getPoolStats(*poolIdsIter);
601+
++poolIdsIter;
602+
603+
// Aggregate all remaining pool stats
604+
for (; poolIdsIter != poolIds.end(); ++poolIdsIter) {
605+
const PoolStats stats = getPoolStats(*poolIdsIter);
606+
aggregatedStats += stats;
607+
}
608+
aggregatedStats.poolName = "aggregated";
609+
updatePoolStats(statPrefix, aggregatedStats);
610+
}
611+
559612
} // namespace facebook::cachelib

cachelib/allocator/Cache.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,9 @@ class CacheBase {
216216
// <Stat -> Count/Delta> maps
217217
mutable RateMap counters_;
218218

219+
// Whether to aggregate pool stats to reduce ODS counter inflation
220+
bool aggregatePoolStats_{false};
221+
219222
protected:
220223
// move bytes from one pool to another. The source pool should be at least
221224
// _bytes_ in size.
@@ -304,6 +307,21 @@ class CacheBase {
304307
// @param pid the poolId that needs updating
305308
void updatePoolStats(const std::string& statPrefix, PoolId pid) const;
306309

310+
// Update pool stats with a PoolStats object directly
311+
// @param stats the PoolStats object to update
312+
void updatePoolStats(const std::string& statPrefix,
313+
const PoolStats& stats) const;
314+
315+
// Update individual pool stats (each pool reported separately)
316+
void updateIndividualPoolStats(const std::string& statPrefix) const;
317+
318+
// Update aggregated pool stats (all pools combined into one stat)
319+
void updateAggregatedPoolStats(const std::string& statPrefix) const;
320+
321+
// Returns true if the number of distinct allocation sizes across all pools is
322+
// at most the maximum number of allocation sizes allowed
// (MemoryAllocator::kMaxClasses).
323+
bool canAggregatePoolStats() const;
324+
307325
// Update stats specific to compact caches
308326
void updateCompactCacheStats(const std::string& statPrefix,
309327
const ICompactCache& c) const;
@@ -342,6 +360,12 @@ class CacheBase {
342360
poolResizeStrategies_;
343361
std::shared_ptr<PoolOptimizeStrategy> poolOptimizeStrategy_;
344362

363+
// Enable aggregating pool stats. Once set, exportStats() publishes a single
// "aggregated" pool entry instead of one entry per regular pool, provided
// canAggregatePoolStats() returns true; otherwise it logs a warning and
// falls back to per-pool entries.
364+
void enableAggregatePoolStats() { aggregatePoolStats_ = true; }
365+
366+
// Check if pool stats aggregation is enabled.
// @return the current value of aggregatePoolStats_ (false by default)
367+
bool isAggregatePoolStatsEnabled() const { return aggregatePoolStats_; }
368+
345369
friend PoolResizer;
346370
friend PoolRebalancer;
347371
friend PoolOptimizer;

cachelib/allocator/CacheAllocator.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2539,6 +2539,9 @@ CacheAllocator<CacheTrait>::restoreCCacheManager() {
25392539

25402540
template <typename CacheTrait>
25412541
void CacheAllocator<CacheTrait>::initCommon(bool dramCacheAttached) {
2542+
// Initialize aggregate pool stats from config
2543+
aggregatePoolStats_ = config_.isAggregatePoolStatsEnabled();
2544+
25422545
if (config_.nvmConfig.has_value()) {
25432546
if (config_.nvmCacheAP) {
25442547
nvmAdmissionPolicy_ = config_.nvmCacheAP;

cachelib/allocator/CacheAllocatorConfig.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,35 @@ class CacheAllocatorConfig {
356356
return skipPromoteChildrenWhenParentFailed;
357357
}
358358

359+
// Enable aggregating pool stats to a single stat
360+
//
361+
// When enabled, pool stats from all pools will be aggregated into a single
362+
// "aggregated" stat to reduce ODS counter inflation. For example, with two
363+
// pools and this option disabled, you will have separate stats like:
364+
// -
365+
// cachelib.cache_name.pool.cache_name_0.items
366+
// -
367+
// cachelib.cache_name.pool.cache_name_1.items
368+
// With this option enabled, it will be aggregated to:
369+
// - cachelib.cache_name.pool.aggregated.items
370+
//
371+
// LIMITATIONS:
372+
// 1. If the cache is using more than 128 distinct allocation sizes across
373+
// all pools, pool stats cannot be aggregated and will fall back to
374+
// separate stat logging.
375+
// 2. Some statistics such as evictionAgeSecs (avg and quantiles) may not be
376+
// mathematically precise. These stats are aggregated using weighted
377+
// averages based on the relative number of evictions from each pool.
378+
// While this provides a reasonable approximation, it may not represent
379+
// the exact distribution that would result from treating all pools as
380+
// a single entity.
381+
CacheAllocatorConfig& enableAggregatePoolStats();
382+
383+
// @return whether pool stats aggregation is enabled; the underlying flag
//         defaults to false and is set by enableAggregatePoolStats()
384+
bool isAggregatePoolStatsEnabled() const noexcept {
385+
return aggregatePoolStats;
386+
}
387+
359388
// @return whether compact cache is enabled
360389
bool isCompactCacheEnabled() const noexcept { return enableZeroedSlabAllocs; }
361390

@@ -666,6 +695,9 @@ class CacheAllocatorConfig {
666695

667696
size_t numShards{8192};
668697

698+
// If true, aggregate pool stats into a single stat before exporting
699+
bool aggregatePoolStats{false};
700+
669701
friend CacheT;
670702

671703
private:
@@ -1147,6 +1179,12 @@ CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::setNumShards(size_t shards) {
11471179
return *this;
11481180
}
11491181

1182+
// Turns on pool stats aggregation for this config.
// @return a reference to this config, so setter calls can be chained
template <typename T>
1183+
CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableAggregatePoolStats() {
1184+
aggregatePoolStats = true;
1185+
return *this;
1186+
}
1187+
11501188
template <typename T>
11511189
const CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::validate() const {
11521190
// we can track tail hits only if MMType is MM2Q
@@ -1300,6 +1338,7 @@ std::map<std::string, std::string> CacheAllocatorConfig<T>::serialize() const {
13001338
configMap["delayCacheWorkersStart"] =
13011339
delayCacheWorkersStart ? "true" : "false";
13021340
configMap["numShards"] = std::to_string(numShards);
1341+
configMap["aggregatePoolStats"] = aggregatePoolStats ? "true" : "false";
13031342
mergeWithPrefix(configMap, throttleConfig.serialize(), "throttleConfig");
13041343
mergeWithPrefix(configMap,
13051344
chainedItemAccessConfig.serialize(),

0 commit comments

Comments
 (0)