Skip to content

Commit

Permalink
Add estimateCompact size API
Browse files Browse the repository at this point in the history
  • Loading branch information
kewang1024 committed Apr 1, 2024
1 parent 8aceb24 commit 1ab9854
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 14 deletions.
8 changes: 8 additions & 0 deletions velox/vector/BaseVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,14 @@ uint64_t BaseVector::estimateFlatSize() const {
return length_ * avgRowSize;
}

/// Default per-row size estimate: the estimated flat size divided by the
/// number of rows. Returns 0 for an empty vector instead of dividing by zero.
double BaseVector::estimateRowSize() const {
  if (length_ == 0) {
    return 0;
  }
  // Divide in floating point; integer division would truncate the average
  // before it is converted to the double return type.
  return static_cast<double>(estimateFlatSize()) / length_;
}

/// Default compact-size estimate: falls back to the flat-size estimate.
/// Encodings that can do better override this.
uint64_t BaseVector::estimateCompactSize() const {
  const uint64_t flatSizeEstimate = estimateFlatSize();
  return flatSizeEstimate;
}

namespace {
bool isReusableEncoding(VectorEncoding::Simple encoding) {
return encoding == VectorEncoding::Simple::FLAT ||
Expand Down
9 changes: 9 additions & 0 deletions velox/vector/BaseVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,15 @@ class BaseVector {
/// hasn't been loaded yet.
virtual uint64_t estimateFlatSize() const;

/// Returns an estimate of the average size of a row in this vector.
virtual double estimateRowSize() const;

/// Returns an estimate of the size of this vector as if it were compacted,
/// ignoring any over-allocation. For example:
/// (1) For a dictionary vector, each referenced dictionary entry is counted
///     once, rather than once per row that references it.
/// (2) For a flat vector, returns its retained size.
/// (3) For a lazy vector that hasn't been loaded, returns zero.
virtual uint64_t estimateCompactSize() const;

/// To safely reuse a vector one needs to (1) ensure that the vector as well
/// as all its buffers and child vectors are singly-referenced and mutable
/// (for buffers); (2) clear append-only string buffers and child vectors
Expand Down
8 changes: 8 additions & 0 deletions velox/vector/ConstantVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ class ConstantVector final : public SimpleVector<T> {
return sizeof(T);
}

/// Per-row size estimate. Delegates to the wrapped value vector when one is
/// present; otherwise uses this vector's own retained size.
double estimateRowSize() const override {
  if (valueVector_) {
    return valueVector_->estimateRowSize();
  }
  return retainedSize();
}

uint64_t estimateCompactSize() const override {
return 1 * estimateRowSize();
}

BaseVector* loadedVector() override {
if (!valueVector_) {
return this;
Expand Down
19 changes: 19 additions & 0 deletions velox/vector/DictionaryVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,25 @@ class DictionaryVector : public SimpleVector<T> {
indices_->capacity();
}

/// Per-row size estimate, taken from the underlying dictionary values.
double estimateRowSize() const override {
  const double baseRowSize = dictionaryValues_->estimateRowSize();
  return baseRowSize;
}

/// Estimates the size of this vector if each referenced dictionary entry were
/// stored exactly once: (number of distinct indices used by non-null rows) *
/// (estimated row size of the dictionary values). Returns 0 when there is no
/// indices buffer or when no non-null row references the dictionary.
uint64_t estimateCompactSize() const override {
  if (indices() == nullptr) {
    return 0;
  }

  // Gather the distinct dictionary positions referenced by non-null rows.
  std::unordered_set<int32_t> distinctIndices;
  const int32_t* rawIndices = indices_->as<int32_t>();
  for (int row = 0; row < BaseVector::length_; ++row) {
    if (!BaseVector::isNullAt(row)) {
      distinctIndices.insert(rawIndices[row]);
    }
  }

  // Nothing is referenced (empty or all-null vector): skip the row-size query
  // so an empty dictionary cannot feed a division by zero or a NaN into the
  // integer conversion below.
  if (distinctIndices.empty()) {
    return 0;
  }
  return distinctIndices.size() * estimateRowSize();
}

bool isScalar() const override {
return dictionaryValues_->isScalar();
}
Expand Down
11 changes: 11 additions & 0 deletions velox/vector/FlatVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,17 @@ class FlatVector final : public SimpleVector<T> {
return size;
}

/// Average retained size per row; 0 when the vector has no rows.
double estimateRowSize() const override {
  const auto numRows = BaseVector::length_;
  return numRows == 0 ? 0.0 : static_cast<double>(retainedSize()) / numRows;
}

uint64_t estimateCompactSize() const override {
return retainedSize();
}

/**
* Used for vectors of type VARCHAR and VARBINARY to hold data referenced by
* StringView's. It is safe to share these among multiple vectors. These
Expand Down
8 changes: 8 additions & 0 deletions velox/vector/SequenceVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ class SequenceVector : public SimpleVector<T> {
return sequenceValues_->retainedSize() + sequenceLengths_->capacity();
}

/// Per-row size estimate, taken from the underlying sequence values.
double estimateRowSize() const override {
  const double valueRowSize = sequenceValues_->estimateRowSize();
  return valueRowSize;
}

uint64_t estimateCompactSize() const override {
return sequenceLengths_->size() * estimateRowSize();
}

bool isScalar() const override {
return sequenceValues_->isScalar();
}
Expand Down
Loading

0 comments on commit 1ab9854

Please sign in to comment.