Skip to content

Commit

Permalink
Add estimateCompactSize API
Browse files Browse the repository at this point in the history
  • Loading branch information
kewang1024 committed Apr 9, 2024
1 parent 8aceb24 commit 8db0b71
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 18 deletions.
8 changes: 8 additions & 0 deletions velox/vector/BaseVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,14 @@ uint64_t BaseVector::estimateFlatSize() const {
return length_ * avgRowSize;
}

// Returns the estimated size of a single row: estimateFlatSize() averaged
// over the number of rows in this vector. Returns 0 for an empty vector.
double BaseVector::estimateRowSize() const {
  // An empty vector has no rows to average over, and dividing by a zero
  // length_ would be an integer division by zero.
  if (length_ == 0) {
    return 0;
  }
  // Convert to double before dividing so the quotient is not truncated by
  // integer division.
  return static_cast<double>(estimateFlatSize()) / length_;
}

// Default compact-size estimate: reports the same value as
// estimateFlatSize().
uint64_t BaseVector::estimateCompactSize() const {
  const uint64_t flatSize = estimateFlatSize();
  return flatSize;
}

namespace {
bool isReusableEncoding(VectorEncoding::Simple encoding) {
return encoding == VectorEncoding::Simple::FLAT ||
Expand Down
11 changes: 11 additions & 0 deletions velox/vector/BaseVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,17 @@ class BaseVector {
/// hasn't been loaded yet.
virtual uint64_t estimateFlatSize() const;

/// Returns the estimated size of a single row in this vector.
virtual double estimateRowSize() const;

/// Returns the estimated size of this vector as if it were compacted,
/// ignoring any over-allocations. For example:
/// (1) For a dictionary vector, it counts each dictionary entry only once,
/// rather than each time a value is referenced.
/// (2) For a flat vector, returns its retained size.
/// (3) For a lazy vector that hasn't been loaded, returns zero.
virtual uint64_t estimateCompactSize() const;

/// To safely reuse a vector one needs to (1) ensure that the vector as well
/// as all its buffers and child vectors are singly-referenced and mutable
/// (for buffers); (2) clear append-only string buffers and child vectors
Expand Down
8 changes: 8 additions & 0 deletions velox/vector/ConstantVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ class ConstantVector final : public SimpleVector<T> {
return sizeof(T);
}

/// Per-row size estimate. Delegates to the wrapped value vector when one is
/// set; otherwise reports this vector's retainedSize().
double estimateRowSize() const override {
  if (valueVector_) {
    return valueVector_->estimateRowSize();
  }
  return retainedSize();
}

uint64_t estimateCompactSize() const override {
return 1 * estimateRowSize();
}

BaseVector* loadedVector() override {
if (!valueVector_) {
return this;
Expand Down
21 changes: 21 additions & 0 deletions velox/vector/DictionaryVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,27 @@ class DictionaryVector : public SimpleVector<T> {
indices_->capacity();
}

/// Per-row size estimate, taken from the wrapped dictionary values vector.
double estimateRowSize() const override {
  const double valueRowSize = dictionaryValues_->estimateRowSize();
  return valueRowSize;
}

/// Compact size estimate: the number of distinct dictionary entries referenced
/// by non-null rows, multiplied by estimateRowSize(). Returns 0 when there is
/// no indices buffer.
uint64_t estimateCompactSize() const override {
  if (indices() == nullptr) {
    return 0;
  }

  // One flag per dictionary entry; set the first time a row references it.
  std::vector<bool> seen(dictionaryValues_->size());
  const int32_t* rowToEntry = indices_->as<int32_t>();
  int distinctEntries = 0;
  for (int row = 0; row < BaseVector::length_; ++row) {
    // Null rows are skipped and do not contribute an entry.
    if (BaseVector::isNullAt(row)) {
      continue;
    }
    const auto entry = rowToEntry[row];
    if (!seen[entry]) {
      ++distinctEntries;
    }
    seen[entry] = true;
  }
  return distinctEntries * estimateRowSize();
}

/// Reports whether the wrapped dictionary values vector is scalar.
bool isScalar() const override {
  const bool valuesAreScalar = dictionaryValues_->isScalar();
  return valuesAreScalar;
}
Expand Down
11 changes: 11 additions & 0 deletions velox/vector/FlatVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,17 @@ class FlatVector final : public SimpleVector<T> {
return size;
}

/// Per-row size estimate: retainedSize() averaged over the number of rows.
/// Returns 0 for an empty vector.
double estimateRowSize() const override {
  const auto rowCount = BaseVector::length_;
  // retainedSize() is only evaluated when there is at least one row.
  return rowCount == 0 ? 0.0
                       : static_cast<double>(retainedSize()) / rowCount;
}

uint64_t estimateCompactSize() const override {
return retainedSize();
}

/**
* Used for vectors of type VARCHAR and VARBINARY to hold data referenced by
* StringView's. It is safe to share these among multiple vectors. These
Expand Down
8 changes: 8 additions & 0 deletions velox/vector/SequenceVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ class SequenceVector : public SimpleVector<T> {
return sequenceValues_->retainedSize() + sequenceLengths_->capacity();
}

/// Per-row size estimate, taken from the wrapped sequence values vector.
double estimateRowSize() const override {
  const double valueRowSize = sequenceValues_->estimateRowSize();
  return valueRowSize;
}

uint64_t estimateCompactSize() const override {
return sequenceLengths_->size() * estimateRowSize();
}

bool isScalar() const override {
return sequenceValues_->isScalar();
}
Expand Down
Loading

0 comments on commit 8db0b71

Please sign in to comment.