Update vendored DuckDB sources to f52a41f
duckdblabs-bot committed Jan 14, 2025
1 parent f52a41f commit 442deb3
Showing 10 changed files with 92 additions and 43 deletions.
66 changes: 48 additions & 18 deletions src/duckdb/src/common/types/varint.cpp
@@ -1,5 +1,7 @@
#include "duckdb/common/types/varint.hpp"
#include "duckdb/common/exception/conversion_exception.hpp"
#include "duckdb/common/numeric_utils.hpp"
#include "duckdb/common/typedefs.hpp"
#include <cmath>

namespace duckdb {
@@ -34,7 +36,7 @@ void Varint::Verify(const string_t &input) {
// No bytes between 4 and end can be 0, unless total size == 4
if (varint_bytes > 4) {
if (is_negative) {
if (~varint_ptr[3] == 0) {
if (static_cast<data_t>(~varint_ptr[3]) == 0) {
throw InternalException("Invalid top data bytes set to 0 for VARINT values");
}
} else {
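For context, data_t is DuckDB's byte type (uint8_t), so the added static_cast truncates the complemented value back to a single byte: the check now fires exactly when the top data byte of a negative VARINT is 0xFF (all bits set), rather than comparing the integer-promoted result of ~ against zero.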
@@ -159,10 +161,13 @@ void Varint::GetByteArray(vector<uint8_t> &byte_array, bool &is_negative, const

// Determine if the number is negative
is_negative = (blob_ptr[0] & 0x80) == 0;
for (idx_t i = 3; i < blob.GetSize(); i++) {
if (is_negative) {
byte_array.reserve(blob.GetSize() - 3);
if (is_negative) {
for (idx_t i = 3; i < blob.GetSize(); i++) {
byte_array.push_back(static_cast<uint8_t>(~blob_ptr[i]));
} else {
}
} else {
for (idx_t i = 3; i < blob.GetSize(); i++) {
byte_array.push_back(static_cast<uint8_t>(blob_ptr[i]));
}
}
@@ -184,28 +189,53 @@ string Varint::FromByteArray(uint8_t *data, idx_t size, bool is_negative) {
return result;
}

// Following CPython and Knuth (TAOCP, Volume 2 (3rd edn), section 4.4, Method 1b).
string Varint::VarIntToVarchar(const string_t &blob) {
string decimal_string;
vector<uint8_t> byte_array;
bool is_negative;
GetByteArray(byte_array, is_negative, blob);
while (!byte_array.empty()) {
string quotient;
uint8_t remainder = 0;
for (uint8_t byte : byte_array) {
int new_value = remainder * 256 + byte;
quotient += DigitToChar(new_value / 10);
remainder = static_cast<uint8_t>(new_value % 10);
vector<digit_t> digits;
// Rounding byte_array to digit_bytes multiple size, so that we can process every digit_bytes bytes
// at a time without if check in the for loop
idx_t padding_size = (-byte_array.size()) & (DIGIT_BYTES - 1);
byte_array.insert(byte_array.begin(), padding_size, 0);
for (idx_t i = 0; i < byte_array.size(); i += DIGIT_BYTES) {
digit_t hi = 0;
for (idx_t j = 0; j < DIGIT_BYTES; j++) {
hi |= UnsafeNumericCast<digit_t>(byte_array[i + j]) << (8 * (DIGIT_BYTES - j - 1));
}
decimal_string += DigitToChar(remainder);
// Remove leading zeros from the quotient
byte_array.clear();
for (char digit : quotient) {
if (digit != '0' || !byte_array.empty()) {
byte_array.push_back(static_cast<uint8_t>(CharToDigit(digit)));
}

for (idx_t j = 0; j < digits.size(); j++) {
twodigit_t tmp = UnsafeNumericCast<twodigit_t>(digits[j]) << DIGIT_BITS | hi;
hi = static_cast<digit_t>(tmp / UnsafeNumericCast<twodigit_t>(DECIMAL_BASE));
digits[j] = static_cast<digit_t>(tmp - UnsafeNumericCast<twodigit_t>(DECIMAL_BASE * hi));
}

while (hi) {
digits.push_back(hi % DECIMAL_BASE);
hi /= DECIMAL_BASE;
}
}

if (digits.empty()) {
digits.push_back(0);
}

for (idx_t i = 0; i < digits.size() - 1; i++) {
auto remain = digits[i];
for (idx_t j = 0; j < DECIMAL_SHIFT; j++) {
decimal_string += DigitToChar(static_cast<int>(remain % 10));
remain /= 10;
}
}

auto remain = digits.back();
do {
decimal_string += DigitToChar(static_cast<int>(remain % 10));
remain /= 10;
} while (remain != 0);

if (is_negative) {
decimal_string += '-';
}
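For context (not part of the commit): the rewritten VarIntToVarchar follows the cited CPython/Knuth scheme. The magnitude bytes are packed big-endian into 32-bit limbs, the accumulated value is repeatedly divided by DECIMAL_BASE = 10^9 (the largest power of ten that fits in a 32-bit digit), and each remainder contributes DECIMAL_SHIFT = 9 decimal digits, replacing the old byte-at-a-time division by 10. A minimal standalone sketch of the same idea, with simplified names and none of DuckDB's types:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Convert a big-endian magnitude (most significant byte first) to a decimal string.
static std::string BytesToDecimal(std::vector<uint8_t> bytes) {
    using digit_t = uint32_t;
    using twodigit_t = uint64_t;
    constexpr digit_t DECIMAL_BASE = 1000000000; // 10^9, the largest power of ten in a digit_t
    constexpr int DECIMAL_SHIFT = 9;             // decimal digits produced per limb
    constexpr std::size_t DIGIT_BYTES = sizeof(digit_t);

    // Pad with leading zero bytes so every limb is built from exactly DIGIT_BYTES bytes.
    std::size_t padding = (DIGIT_BYTES - bytes.size() % DIGIT_BYTES) % DIGIT_BYTES;
    bytes.insert(bytes.begin(), padding, 0);

    // digits holds the value in base 10^9, least significant limb first.
    std::vector<digit_t> digits;
    for (std::size_t i = 0; i < bytes.size(); i += DIGIT_BYTES) {
        // Pack four big-endian bytes into one binary limb.
        digit_t hi = (digit_t(bytes[i]) << 24) | (digit_t(bytes[i + 1]) << 16) |
                     (digit_t(bytes[i + 2]) << 8) | digit_t(bytes[i + 3]);
        // value = value * 2^32 + hi, carried through the base-10^9 limbs.
        for (auto &limb : digits) {
            twodigit_t tmp = (twodigit_t(limb) << 32) | hi;
            hi = digit_t(tmp / DECIMAL_BASE);
            limb = digit_t(tmp % DECIMAL_BASE);
        }
        while (hi != 0) {
            digits.push_back(hi % DECIMAL_BASE);
            hi /= DECIMAL_BASE;
        }
    }
    if (digits.empty()) {
        digits.push_back(0);
    }

    // Emit nine digits per limb, except the most significant limb, which is not zero-padded.
    std::string out;
    for (std::size_t i = 0; i + 1 < digits.size(); i++) {
        digit_t remain = digits[i];
        for (int j = 0; j < DECIMAL_SHIFT; j++) {
            out += char('0' + remain % 10);
            remain /= 10;
        }
    }
    digit_t remain = digits.back();
    do {
        out += char('0' + remain % 10);
        remain /= 10;
    } while (remain != 0);
    std::reverse(out.begin(), out.end()); // digits were appended least significant first
    return out;
}

int main() {
    std::cout << BytesToDecimal({0x01, 0x00, 0x00}) << "\n"; // prints 65536
}

The leading-zero padding up to a multiple of DIGIT_BYTES mirrors the padding_size computation in the diff and lets the packing loop run without bounds checks.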
@@ -141,12 +141,17 @@ void ColumnCountScanner::FinalizeChunkProcess() {
cur_buffer_handle = buffer_manager->GetBuffer(++iterator.pos.buffer_idx);
if (!cur_buffer_handle) {
buffer_handle_ptr = nullptr;
if (states.IsQuotedCurrent() && !states.IsUnquoted()) {
// We are finishing our file on a quoted value that is never unquoted, straight to jail.
result.error = true;
return;
}
if (states.EmptyLine() || states.NewRow() || states.IsCurrentNewRow() || states.IsNotSet()) {
return;
}
// This means we reached the end of the file, we must add a last line if there is any to be added
if (result.comment) {
// If it's a comment we add the last line via unsetcomment
// If it's a comment we add the last line via unset comment
result.UnsetComment(result, NumericLimits<idx_t>::Maximum());
} else {
// OW, we do a regular AddRow
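For context, the added guard covers inputs whose last value opens a quote that is never closed (for example, a file whose final line ends in a field like "still open with no terminating quote): the scanner is still in a quoted state at end of file, so the result is now flagged as an error instead of reaching the end-of-file row-adding logic that follows.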
6 changes: 3 additions & 3 deletions src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,5 +1,5 @@
#ifndef DUCKDB_PATCH_VERSION
#define DUCKDB_PATCH_VERSION "4-dev4483"
#define DUCKDB_PATCH_VERSION "4-dev4516"
#endif
#ifndef DUCKDB_MINOR_VERSION
#define DUCKDB_MINOR_VERSION 1
@@ -8,10 +8,10 @@
#define DUCKDB_MAJOR_VERSION 1
#endif
#ifndef DUCKDB_VERSION
#define DUCKDB_VERSION "v1.1.4-dev4483"
#define DUCKDB_VERSION "v1.1.4-dev4516"
#endif
#ifndef DUCKDB_SOURCE_ID
#define DUCKDB_SOURCE_ID "1c6ea28bdc"
#define DUCKDB_SOURCE_ID "2e533ec9df"
#endif
#include "duckdb/function/table/system_functions.hpp"
#include "duckdb/main/database.hpp"
11 changes: 11 additions & 0 deletions src/duckdb/src/include/duckdb/common/types/varint.hpp
@@ -16,11 +16,22 @@
#include "duckdb/function/cast/default_casts.hpp"

namespace duckdb {
using digit_t = uint32_t;
using twodigit_t = uint64_t;

//! The Varint class is a static class that holds helper functions for the Varint type.
class Varint {
public:
//! Header size of a Varint is always 3 bytes.
DUCKDB_API static constexpr uint8_t VARINT_HEADER_SIZE = 3;
//! Max(e such that 10**e fits in a digit_t)
DUCKDB_API static constexpr uint8_t DECIMAL_SHIFT = 9;
//! 10 ** DECIMAL_SHIFT
DUCKDB_API static constexpr digit_t DECIMAL_BASE = 1000000000;
//! Bytes of a digit_t
DUCKDB_API static constexpr uint8_t DIGIT_BYTES = sizeof(digit_t);
//! Bits of a digit_t
DUCKDB_API static constexpr uint8_t DIGIT_BITS = DIGIT_BYTES * 8;
//! Verifies if a Varint is valid. i.e., if it has 3 header bytes. The header correctly represents the number of
//! data bytes, and the data bytes has no leading zero bytes.
DUCKDB_API static void Verify(const string_t &input);
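For context, the new constants in this header are tied together: DECIMAL_SHIFT is the largest exponent e such that 10^e still fits in a digit_t, DECIMAL_BASE is that power of ten, and DIGIT_BITS is the shift used when packing bytes into a limb. A small compile-time sanity check of those relationships (illustrative only, not part of the header):

#include <cstdint>

using digit_t = uint32_t;
using twodigit_t = uint64_t;

constexpr uint8_t DECIMAL_SHIFT = 9;
constexpr digit_t DECIMAL_BASE = 1000000000; // 10 ** DECIMAL_SHIFT
constexpr uint8_t DIGIT_BYTES = sizeof(digit_t);
constexpr uint8_t DIGIT_BITS = DIGIT_BYTES * 8;

constexpr uint64_t Pow10(unsigned e) {
    return e == 0 ? 1 : 10 * Pow10(e - 1);
}

static_assert(DECIMAL_BASE == Pow10(DECIMAL_SHIFT), "DECIMAL_BASE is 10^DECIMAL_SHIFT");
static_assert(Pow10(DECIMAL_SHIFT) <= UINT32_MAX, "10^DECIMAL_SHIFT fits in a digit_t");
static_assert(Pow10(DECIMAL_SHIFT + 1) > UINT32_MAX, "and DECIMAL_SHIFT is the largest such exponent");
static_assert(DIGIT_BITS == 32 && sizeof(twodigit_t) == 2 * sizeof(digit_t),
              "twodigit_t is wide enough for (limb << DIGIT_BITS) | carry");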
@@ -244,7 +244,9 @@ struct RoaringAnalyzeState : public AnalyzeState {
//! Flushed analyze data

//! The space used by the current segment
idx_t space_used = 0;
idx_t data_size = 0;
idx_t metadata_size = 0;

//! The total amount of segments to write
idx_t segment_count = 0;
//! The amount of values in the current segment;
2 changes: 2 additions & 0 deletions src/duckdb/src/logging/log_manager.cpp
@@ -143,6 +143,8 @@ void LogManager::SetLogStorage(DatabaseInstance &db, const string &storage_name)
throw NotImplementedException("File log storage is not yet implemented");
} else if (registered_log_storages.find(storage_name_to_lower) != registered_log_storages.end()) {
log_storage = registered_log_storages[storage_name_to_lower];
} else {
throw InvalidInputException("Log storage '%s' is not yet registered", storage_name);
}
config.storage = storage_name_to_lower;
}
11 changes: 2 additions & 9 deletions src/duckdb/src/storage/compression/dictionary/decompression.cpp
@@ -43,18 +43,15 @@ void CompressedStringScanState::Initialize(ColumnSegment &segment, bool initiali
block_size = segment.GetBlockManager().GetBlockSize();

dict = DictionaryCompression::GetDictionary(segment, *handle);
dictionary = make_buffer<Vector>(segment.type, index_buffer_count);
dictionary_size = index_buffer_count;

if (!initialize_dictionary) {
// Used by fetch, as fetch will never produce a DictionaryVector
return;
}

dictionary = make_buffer<Vector>(segment.type, index_buffer_count);
dictionary_size = index_buffer_count;
auto dict_child_data = FlatVector::GetData<string_t>(*(dictionary));
auto &validity = FlatVector::Validity(*dictionary);
D_ASSERT(index_buffer_count >= 1);
validity.SetInvalid(0);
for (uint32_t i = 0; i < index_buffer_count; i++) {
// NOTE: the passing of dict_child_vector, will not be used, its for big strings
uint16_t str_len = GetStringLength(i);
@@ -64,7 +61,6 @@

void CompressedStringScanState::ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count) {
auto result_data = FlatVector::GetData<string_t>(result);
auto &validity = FlatVector::Validity(result);

// Handling non-bitpacking-group-aligned start values;
idx_t start_offset = start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
@@ -86,9 +82,6 @@ void CompressedStringScanState::ScanToFlatVector(Vector &result, idx_t result_of
for (idx_t i = 0; i < scan_count; i++) {
// Lookup dict offset in index buffer
auto string_number = sel_vec->get_index(i + start_offset);
if (string_number == 0) {
validity.SetInvalid(result_offset + i);
}
auto dict_offset = index_buffer_ptr[string_number];
auto str_len = GetStringLength(UnsafeNumericCast<sel_t>(string_number));
result_data[result_offset + i] = FetchStringFromDict(UnsafeNumericCast<int32_t>(dict_offset), str_len);
@@ -157,16 +157,14 @@ void DictionaryCompressionStorage::StringFetchRow(ColumnSegment &segment, Column
// Get Function
//===--------------------------------------------------------------------===//
CompressionFunction DictionaryCompressionFun::GetFunction(PhysicalType data_type) {
auto res = CompressionFunction(
return CompressionFunction(
CompressionType::COMPRESSION_DICTIONARY, data_type, DictionaryCompressionStorage ::StringInitAnalyze,
DictionaryCompressionStorage::StringAnalyze, DictionaryCompressionStorage::StringFinalAnalyze,
DictionaryCompressionStorage::InitCompression, DictionaryCompressionStorage::Compress,
DictionaryCompressionStorage::FinalizeCompress, DictionaryCompressionStorage::StringInitScan,
DictionaryCompressionStorage::StringScan, DictionaryCompressionStorage::StringScanPartial<false>,
DictionaryCompressionStorage::StringFetchRow, UncompressedFunctions::EmptySkip,
UncompressedStringStorage::StringInitSegment);
res.validity = CompressionValidity::NO_VALIDITY_REQUIRED;
return res;
}

bool DictionaryCompressionFun::TypeIsSupported(const PhysicalType physical_type) {
12 changes: 7 additions & 5 deletions src/duckdb/src/storage/compression/roaring/analyze.cpp
@@ -108,6 +108,7 @@ void RoaringAnalyzeState::Flush(RoaringAnalyzeState &state) {
}

bool RoaringAnalyzeState::HasEnoughSpaceInSegment(idx_t required_space) {
auto space_used = data_size + metadata_size;
D_ASSERT(space_used <= info.GetBlockSize());
idx_t remaining_space = info.GetBlockSize() - space_used;
if (required_space > remaining_space) {
@@ -117,13 +118,15 @@ bool RoaringAnalyzeState::HasEnoughSpaceInSegment(idx_t required_space) {
}

void RoaringAnalyzeState::FlushSegment() {
auto space_used = data_size + metadata_size;
if (!current_count) {
D_ASSERT(!space_used);
return;
}
metadata_collection.FlushSegment();
total_size += space_used;
space_used = 0;
data_size = 0;
metadata_size = 0;
current_count = 0;
segment_count++;
}
@@ -146,15 +149,14 @@ void RoaringAnalyzeState::FlushContainer() {
arrays_count++;
}

idx_t required_space = metadata_collection.GetMetadataSize(runs_count + arrays_count, runs_count, arrays_count);
metadata_size = metadata_collection.GetMetadataSize(runs_count + arrays_count, runs_count, arrays_count);

required_space += metadata.GetDataSizeInBytes(count);
if (!HasEnoughSpaceInSegment(required_space)) {
data_size += metadata.GetDataSizeInBytes(count);
if (!HasEnoughSpaceInSegment(metadata_size + data_size)) {
FlushSegment();
}
container_metadata.push_back(metadata);
metadata_collection.AddMetadata(metadata);
space_used += required_space;
current_count += count;

// Reset the container analyze state
12 changes: 9 additions & 3 deletions src/duckdb/src/storage/table/column_data_checkpointer.cpp
@@ -5,6 +5,7 @@
#include "duckdb/storage/data_table.hpp"
#include "duckdb/parser/column_definition.hpp"
#include "duckdb/storage/table/scan_state.hpp"
#include "duckdb/main/database.hpp"

namespace duckdb {

@@ -240,14 +241,19 @@ vector<CheckpointAnalyzeResult> ColumnDataCheckpointer::DetectBestCompressionMet
}
}

auto &checkpoint_state = checkpoint_states[i];
auto &col_data = checkpoint_state.get().column_data;
if (!chosen_state) {
auto &checkpoint_state = checkpoint_states[i];
auto &col_data = checkpoint_state.get().column_data;
throw FatalException("No suitable compression/storage method found to store column of type %s",
col_data.type.ToString());
}
D_ASSERT(compression_idx != DConstants::INVALID_INDEX);
result[i] = CheckpointAnalyzeResult(std::move(chosen_state), *functions[compression_idx]);

auto &best_function = *functions[compression_idx];
Logger::Info(db, "FinalAnalyze(%s) result for %s.%s.%d(%s): %d", EnumUtil::ToString(best_function.type),
col_data.info.GetSchemaName(), col_data.info.GetTableName(), col_data.column_index,
col_data.type.ToString(), best_score);
result[i] = CheckpointAnalyzeResult(std::move(chosen_state), best_function);
}
return result;
}
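For context, the new Logger::Info call emits one line per column describing which compression function won the final analyze step. With purely hypothetical values (schema main, table my_table, column index 2 of type VARCHAR, dictionary compression winning with a score of 4096), the format string above would render roughly as:

FinalAnalyze(COMPRESSION_DICTIONARY) result for main.my_table.2(VARCHAR): 4096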