From 1420aef3342012e0b25472fc569de2e34b74556f Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 7 Jan 2025 21:52:55 +0000
Subject: [PATCH 01/21] vendor: Update vendored sources to
 duckdb/duckdb@acdbf60889033d2701a5fef360a19963cafea471 (#974)

Annotate errors in table macros with the call position of the table macro
(duckdb/duckdb#15590)

Co-authored-by: krlmlr
---
 src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++---
 .../src/planner/binder/tableref/bind_table_function.cpp  | 9 ++++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp
index 77e744b6b..6bb466621 100644
--- a/src/duckdb/src/function/table/version/pragma_version.cpp
+++ b/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,5 +1,5 @@
 #ifndef DUCKDB_PATCH_VERSION
-#define DUCKDB_PATCH_VERSION "4-dev4106"
+#define DUCKDB_PATCH_VERSION "4-dev4108"
 #endif
 #ifndef DUCKDB_MINOR_VERSION
 #define DUCKDB_MINOR_VERSION 1
@@ -8,10 +8,10 @@
 #define DUCKDB_MAJOR_VERSION 1
 #endif
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "v1.1.4-dev4106"
+#define DUCKDB_VERSION "v1.1.4-dev4108"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "2edfde3071"
+#define DUCKDB_SOURCE_ID "acdbf60889"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
diff --git a/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp b/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp
index 2d222b4fc..3f4c249bd 100644
--- a/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp
+++ b/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp
@@ -283,7 +283,14 @@ unique_ptr<BoundTableRef> Binder::Bind(TableFunctionRef &ref) {
 	binder->can_contain_nulls = true;
 	binder->alias = ref.alias.empty() ? "unnamed_query" : ref.alias;
-	auto query = binder->BindNode(*query_node);
+	unique_ptr<BoundQueryNode> query;
+	try {
+		query = binder->BindNode(*query_node);
+	} catch (std::exception &ex) {
+		ErrorData error(ex);
+		error.AddQueryLocation(ref);
+		error.Throw();
+	}
 	idx_t bind_index = query->GetRootIndex();
 	// string alias;

From 924d64963e5f7f722e60f7511dc3dcd83af381cb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 8 Jan 2025 01:17:58 +0000
Subject: [PATCH 02/21] fledge: Bump version to 1.1.3.9034 (#975)

Co-authored-by: krlmlr
---
 DESCRIPTION |  2 +-
 NEWS.md     | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f0e516db7..87528928b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: duckdb
 Title: DBI Package for the DuckDB Database Management System
-Version: 1.1.3.9033
+Version: 1.1.3.9034
 Authors@R: c(
     person("Hannes", "Mühleisen", , "hannes@cwi.nl", role = "aut",
            comment = c(ORCID = "0000-0001-8552-0029")),
diff --git a/NEWS.md b/NEWS.md
index 452fe023a..2f6f554ee 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,36 @@
+# duckdb 1.1.3.9034
+
+## vendor
+
+- Update vendored sources to duckdb/duckdb@acdbf60889033d2701a5fef360a19963cafea471 (#974).
+
+- Update vendored sources to duckdb/duckdb@2edfde3071cdc3ccc6047773e8229fb80238443d (#973).
+
+- Update vendored sources to duckdb/duckdb@3c0db29b2bf182105c1537ddf2fa8b12d186941d (#972).
+
+- Update vendored sources to duckdb/duckdb@5705d13dbf1d4f3db3e4f36d3194f669aaf99bc0 (#971).
+
+- Update vendored sources to duckdb/duckdb@41d13ad1612b3e91ff7a9a5c17edd6fdf51073f1 (#970).
+
+- Update vendored sources to duckdb/duckdb@52032a5b00d4f11376f4ba106753629aea87c0a8 (#969).
+
+- Update vendored sources to duckdb/duckdb@add512094d022fec50354208f908c3d60f3755a5 (#968).
+
+- Update vendored sources to duckdb/duckdb@d42c34cbd62061e5088fc1b81ced0c1008fa58ea (#967).
+
+- Update vendored sources to duckdb/duckdb@35bf611cc4832ceb2f20ec70a2597d0765618610 (duckdb/duckdb#15526, #966).
+
+- Update vendored sources to duckdb/duckdb@22de1ef5310136803f2caab0c9cc063b4fad52e5 (#965).
+
+- Update vendored sources to duckdb/duckdb@a3636bdd6203f4e4c47c5191264b9209e4f2a516 (#964).
+
+- Update vendored sources to duckdb/duckdb@50de06cec8de9c8da1b6fd9fcec3f514694a9b6a (#963).
+
+- Update vendored sources to duckdb/duckdb@4b163ff7c8cb9693fafe4d822dff54b31cfd2adf (#962).
+
+
 # duckdb 1.1.3.9033
 
 ## vendor

From 693ded16b4f2dbd4961b8afba31b3639e30c0228 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 8 Jan 2025 12:20:36 +0000
Subject: [PATCH 03/21] vendor: Update vendored sources to
 duckdb/duckdb@c8fa9aee7858909c625b5c3abcc3a257c5d9d934 (#977)

Line dependent buffer (duckdb/duckdb#14512)

Co-authored-by: krlmlr
---
 .../csv_scanner/buffer_manager/csv_buffer.cpp |   4 +-
 .../buffer_manager/csv_buffer_manager.cpp     |   9 +-
 .../buffer_manager/csv_file_handle.cpp        |   2 +-
 .../csv_scanner/scanner/scanner_boundary.cpp  |  46 +++--
 .../scanner/string_value_scanner.cpp          |  56 +++---
 .../csv_scanner/sniffer/csv_sniffer.cpp       |  11 +-
 .../csv_scanner/sniffer/type_detection.cpp    |  26 +--
 .../table_function/global_csv_state.cpp       | 115 +----------
 .../operator/csv_scanner/util/csv_error.cpp   | 163 ++++++++++++++--
 .../csv_scanner/util/csv_reader_options.cpp   |  21 ++-
 .../function/table/version/pragma_version.cpp |   6 +-
 .../operator/csv_scanner/csv_buffer.hpp       |  14 +-
 .../operator/csv_scanner/csv_error.hpp        |  37 ++--
 .../csv_scanner/csv_reader_options.hpp        |   5 +-
 .../operator/csv_scanner/scanner_boundary.hpp |  18 +-
 .../csv_scanner/string_value_scanner.hpp      |  13 +-
 .../storage/serialization/serialize_nodes.cpp |  11 +-
 17 files changed, 314 insertions(+), 243 deletions(-)

diff --git a/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp
index 3677e0a75..c18c5f61e 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp
@@ -58,7 +58,7 @@ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
 	block = handle.GetBlockHandle();
 }
 
-idx_t CSVBuffer::GetBufferSize() {
+idx_t CSVBuffer::GetBufferSize() const {
 	return actual_buffer_size;
 }
 
@@ -87,7 +87,7 @@ void CSVBuffer::Unpin() {
 	}
 }
 
-bool CSVBuffer::IsCSVFileLastBuffer() {
+bool CSVBuffer::IsCSVFileLastBuffer() const {
 	return last_buffer;
 }
 
diff --git a/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp b/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp
index 5f2823292..811689917 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp
@@ -7,7 +7,7 @@ CSVBufferManager::CSVBufferManager(ClientContext &context_p, const CSVReaderOpti
                                    const idx_t file_idx_p, bool per_file_single_threaded_p,
                                    unique_ptr<CSVFileHandle> file_handle_p)
     : context(context_p), per_file_single_threaded(per_file_single_threaded_p), file_idx(file_idx_p),
-      file_path(file_path_p), buffer_size(CSVBuffer::CSV_BUFFER_SIZE) {
+      file_path(file_path_p), buffer_size(options.buffer_size_option.GetValue()) {
 	D_ASSERT(!file_path.empty());
 	if (file_handle_p) {
 		file_handle = std::move(file_handle_p);
@@ -16,13 +16,6 @@
 	is_pipe = file_handle->IsPipe();
 	skip_rows = options.dialect_options.skip_rows.GetValue();
-	auto file_size = file_handle->FileSize();
-	if (file_size > 0 && file_size < buffer_size) {
-		buffer_size = CSVBuffer::CSV_MINIMUM_BUFFER_SIZE;
-	}
-	if (options.buffer_size < buffer_size) {
-		buffer_size = options.buffer_size;
-	}
 	Initialize();
 }
 
diff --git a/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp b/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp
index 0a8c1e20c..73fbb4ed0 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp
@@ -10,7 +10,7 @@ namespace duckdb {
 CSVFileHandle::CSVFileHandle(DBConfig &config, unique_ptr<FileHandle> file_handle_p, const string &path_p,
                              const CSVReaderOptions &options)
     : compression_type(options.compression), file_handle(std::move(file_handle_p)),
-      encoder(config, options.encoding, options.buffer_size), path(path_p) {
+      encoder(config, options.encoding, options.buffer_size_option.GetValue()), path(path_p) {
 	can_seek = file_handle->CanSeek();
 	on_disk_file = file_handle->OnDiskFile();
 	file_size = file_handle->GetFileSize();
diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp
index 0a577d779..b0511bec6 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/scanner_boundary.cpp
@@ -16,7 +16,7 @@ CSVBoundary::CSVBoundary() : buffer_idx(0), buffer_pos(0), boundary_idx(0), end_
 CSVIterator::CSVIterator() : is_set(false) {
 }
 
-void CSVBoundary::Print() {
+void CSVBoundary::Print() const {
 #ifndef DUCKDB_DISABLE_PRINT
 	std::cout << "---Boundary: " << boundary_idx << " ---" << '\n';
 	std::cout << "Buffer Index: " << buffer_idx << '\n';
@@ -26,26 +26,39 @@
 #endif
 }
 
-void CSVIterator::Print() {
+void CSVIterator::Print() const {
 #ifndef DUCKDB_DISABLE_PRINT
 	boundary.Print();
 	std::cout << "Is set: " << is_set << '\n';
 #endif
 }
 
-bool CSVIterator::Next(CSVBufferManager &buffer_manager) {
+idx_t CSVIterator::BytesPerThread(const CSVReaderOptions &reader_options) {
+	const idx_t buffer_size = reader_options.buffer_size_option.GetValue();
+	const idx_t max_row_size = reader_options.maximum_line_size.GetValue();
+	const idx_t bytes_per_thread = buffer_size / CSVBuffer::ROWS_PER_BUFFER * ROWS_PER_THREAD;
+	if (bytes_per_thread < max_row_size) {
+		// If we are setting up the buffer size directly, we must make sure each thread will read the full buffer.
+		return max_row_size;
+	}
+	return bytes_per_thread;
+}
+
+bool CSVIterator::Next(CSVBufferManager &buffer_manager, const CSVReaderOptions &reader_options) {
 	if (!is_set) {
 		return false;
 	}
+	const auto bytes_per_thread = BytesPerThread(reader_options);
+
 	// If we are calling next this is not the first one anymore
 	first_one = false;
 	boundary.boundary_idx++;
 	// This is our start buffer
 	auto buffer = buffer_manager.GetBuffer(boundary.buffer_idx);
-	if (buffer->is_last_buffer && boundary.buffer_pos + CSVIterator::BYTES_PER_THREAD > buffer->actual_size) {
+	if (buffer->is_last_buffer && boundary.buffer_pos + bytes_per_thread > buffer->actual_size) {
 		// 1) We are done with the current file
 		return false;
-	} else if (boundary.buffer_pos + BYTES_PER_THREAD >= buffer->actual_size) {
+	} else if (boundary.buffer_pos + bytes_per_thread >= buffer->actual_size) {
 		// 2) We still have data to scan in this file, we set the iterator accordingly.
 		// We must move the buffer
 		boundary.buffer_idx++;
@@ -58,9 +71,9 @@
 	} else {
 		// 3) We are not done with the current buffer, hence we just move where we start within the buffer
-		boundary.buffer_pos += BYTES_PER_THREAD;
+		boundary.buffer_pos += bytes_per_thread;
 	}
-	boundary.end_pos = boundary.buffer_pos + BYTES_PER_THREAD;
+	boundary.end_pos = boundary.buffer_pos + bytes_per_thread;
 	SetCurrentPositionToBoundary();
 	return true;
 }
@@ -85,20 +98,21 @@ void CSVIterator::SetCurrentPositionToBoundary() {
 	pos.buffer_pos = boundary.buffer_pos;
 }
 
-void CSVIterator::SetCurrentBoundaryToPosition(bool single_threaded) {
+void CSVIterator::SetCurrentBoundaryToPosition(bool single_threaded, const CSVReaderOptions &reader_options) {
 	if (single_threaded) {
 		is_set = false;
 		return;
 	}
+	const auto bytes_per_thread = BytesPerThread(reader_options);
+
 	boundary.buffer_idx = pos.buffer_idx;
 	if (pos.buffer_pos == 0) {
-		boundary.end_pos = CSVIterator::BYTES_PER_THREAD;
+		boundary.end_pos = bytes_per_thread;
 	} else {
-		boundary.end_pos = ((pos.buffer_pos + CSVIterator::BYTES_PER_THREAD - 1) / CSVIterator::BYTES_PER_THREAD) *
-		                   CSVIterator::BYTES_PER_THREAD;
+		boundary.end_pos = ((pos.buffer_pos + bytes_per_thread - 1) / bytes_per_thread) * bytes_per_thread;
 	}
-	boundary.buffer_pos = boundary.end_pos - CSVIterator::BYTES_PER_THREAD;
+	boundary.buffer_pos = boundary.end_pos - bytes_per_thread;
 	is_set = true;
 }
 
@@ -110,7 +124,13 @@ void CSVIterator::SetEnd(idx_t pos) {
 	boundary.end_pos = pos;
 }
 
-idx_t CSVIterator::GetGlobalCurrentPos() {
+void CSVIterator::CheckIfDone() {
+	if (IsBoundarySet() && (pos.buffer_idx > boundary.buffer_idx || pos.buffer_pos > boundary.buffer_pos)) {
+		done = true;
+	}
+}
+
+idx_t CSVIterator::GetGlobalCurrentPos() const {
 	return pos.buffer_pos + buffer_size * pos.buffer_idx;
 }
 
diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 967660a8e..aeb305a55 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -14,19 +14,22 @@
 #include <algorithm>
 
 namespace duckdb {
+
+constexpr idx_t StringValueScanner::LINE_FINDER_ID;
+
 StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_machine,
                                      const shared_ptr<CSVBufferHandle> &buffer_handle, Allocator &buffer_allocator,
                                      idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_hander_p,
                                     CSVIterator &iterator_p, bool store_line_size_p,
                                     shared_ptr<CSVFileScan> csv_file_scan_p, idx_t &lines_read_p, bool sniffing_p,
-                                     string path_p)
+                                     string path_p, idx_t scan_id)
     : ScannerResult(states, state_machine, result_size_p),
       number_of_columns(NumericCast(state_machine.dialect_options.num_cols)),
       null_padding(state_machine.options.null_padding),
       ignore_errors(state_machine.options.ignore_errors.GetValue()),
       extra_delimiter_bytes(state_machine.dialect_options.state_machine_options.delimiter.GetValue().size() - 1),
       error_handler(error_hander_p), iterator(iterator_p), store_line_size(store_line_size_p),
       csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p),
-      current_errors(state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(std::move(path_p)) {
+      current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(std::move(path_p)) {
 	// Vector information
 	D_ASSERT(number_of_columns > 0);
 	if (!buffer_handle) {
@@ -681,7 +684,7 @@ bool LineError::HandleErrors(StringValueResult &result) {
 			break;
 		case MAXIMUM_LINE_SIZE:
 			csv_error = CSVError::LineSizeError(
-			    result.state_machine.options, cur_error.current_line_size, lines_per_batch, borked_line,
+			    result.state_machine.options, lines_per_batch, borked_line,
 			    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl), result.path);
 			break;
 		case INVALID_STATE:
@@ -702,7 +705,7 @@
 		}
 		result.error_handler.Error(csv_error);
 	}
-	if (is_error_in_line) {
+	if (is_error_in_line && scan_id != StringValueScanner::LINE_FINDER_ID) {
 		if (result.sniffing) {
 			// If we are sniffing we just remove the line
 			result.RemoveLastLine();
@@ -780,7 +783,7 @@ bool StringValueResult::AddRowInternal() {
 	}
 	current_line_position.begin = current_line_position.end;
 	current_line_position.end = current_line_start;
-	if (current_line_size > state_machine.options.maximum_line_size) {
+	if (current_line_size > state_machine.options.maximum_line_size.GetValue()) {
 		current_errors.Insert(MAXIMUM_LINE_SIZE, 1, chunk_col_id, last_position, current_line_size);
 	}
 	if (!state_machine.options.null_padding) {
@@ -827,7 +830,7 @@
 		} else {
 			// If we are not null-padding this is an error
 			if (!state_machine.options.IgnoreErrors()) {
-				bool first_nl;
+				bool first_nl = false;
 				auto borked_line =
 				    current_line_position.ReconstructCurrentLine(first_nl, buffer_handles, PrintErrorLine());
 				LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
@@ -938,8 +941,8 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptr
       result(states, *state_machine, cur_buffer_handle, BufferAllocator::Get(buffer_manager->context), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-            buffer_manager->GetFilePath()) {
-	iterator.buffer_size = state_machine->options.buffer_size;
+            buffer_manager->GetFilePath(), scanner_idx_p) {
+	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
 
@@ -950,8 +953,8 @@ StringValueScanner::StringValueScanner(const shared_ptr &buffe
       result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size,
             iterator.pos.buffer_pos, *error_handler, iterator,
             buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing,
-            buffer_manager->GetFilePath()) {
-	iterator.buffer_size = state_machine->options.buffer_size;
+            buffer_manager->GetFilePath(), 0) {
+	iterator.buffer_size = state_machine->options.buffer_size_option.GetValue();
 }
 
 unique_ptr<StringValueScanner> StringValueScanner::GetCSVScanner(ClientContext &context, CSVReaderOptions &options) {
@@ -1064,7 +1067,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) {
 		}
 		result.borked_rows.insert(line_error++);
 		D_ASSERT(state_machine->options.ignore_errors.GetValue());
-		// We are ignoring errors. We must continue but ignoring borked rows
+		// We are ignoring errors. We must continue but ignoring borked-rows
 		for (; line_error < parse_chunk.size(); line_error++) {
 			if (!inserted_column_data.validity.RowIsValid(line_error) &&
 			    parse_column_data.validity.RowIsValid(line_error)) {
@@ -1622,14 +1625,19 @@ bool StringValueScanner::IsRowValid(CSVIterator &current_iterator) const {
 		return false;
 	}
 	constexpr idx_t result_size = 1;
-	auto scan_finder =
-	    make_uniq<StringValueScanner>(0U, buffer_manager, state_machine_strict, make_shared_ptr<CSVErrorHandler>(),
-	                                  csv_file_scan, false, current_iterator, result_size);
+	auto scan_finder = make_uniq<StringValueScanner>(StringValueScanner::LINE_FINDER_ID, buffer_manager,
+	                                                 state_machine_strict, make_shared_ptr<CSVErrorHandler>(),
+	                                                 csv_file_scan, false, current_iterator, result_size);
 	auto &tuples = scan_finder->ParseChunk();
 	current_iterator.pos = scan_finder->GetIteratorPosition();
-	return (tuples.number_of_rows == 1 || tuples.first_line_is_comment) && tuples.borked_rows.empty() &&
-	       !tuples.current_errors.HasError();
-	;
+	bool has_error = false;
+	if (tuples.current_errors.HasError()) {
+		if (tuples.current_errors.Size() != 1 || !tuples.current_errors.HasErrorType(MAXIMUM_LINE_SIZE)) {
+			// We ignore maximum line size errors
+			has_error = true;
+		}
+	}
+	return (tuples.number_of_rows == 1 || tuples.first_line_is_comment) && !has_error && tuples.borked_rows.empty();
 }
 
@@ -1641,9 +1649,6 @@ ValidRowInfo StringValueScanner::TryRow(CSVState state, idx_t start_pos, idx_t e
 	auto iterator_start = current_iterator;
 	idx_t current_pos = current_iterator.pos.buffer_pos;
 	current_iterator.SetEnd(iterator.GetEndPos());
-	if (iterator.GetEndPos() == current_pos) {
-		return {false, current_pos, current_iterator.pos.buffer_idx, current_iterator.pos.buffer_pos, quoted};
-	}
 	if (IsRowValid(current_iterator)) {
 		if (!quoted) {
 			quoted = FirstValueEndsOnQuote(iterator_start);
@@ -1715,12 +1720,19 @@ void StringValueScanner::SetStart() {
 			iterator.done = true;
 		} else {
 			bool mock;
-			SkipUntilState(CSVState::STANDARD_NEWLINE, CSVState::RECORD_SEPARATOR, iterator, mock);
+			if (!SkipUntilState(CSVState::STANDARD_NEWLINE, CSVState::RECORD_SEPARATOR, iterator, mock)) {
+				iterator.CheckIfDone();
+			}
 		}
 	} else {
 		iterator.pos.buffer_pos = best_row.start_pos;
-		iterator.done = iterator.pos.buffer_pos == cur_buffer_handle->actual_size;
+		bool is_this_the_end =
+		    best_row.start_pos >= cur_buffer_handle->actual_size && cur_buffer_handle->is_last_buffer;
+		if (is_this_the_end) {
+			iterator.done = true;
+		}
 	}
+
 	// 4. We have an error, if we have an error, we let life go on, the scanner will either ignore it
 	// or throw.
 	result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
 
diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
index 08311cfe3..c8740ebde 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
@@ -205,15 +205,8 @@ SnifferResult CSVSniffer::SniffCSV(const bool force_match) {
 		buffer_manager->ResetBufferManager();
 	}
 	buffer_manager->sniffing = false;
-	if (!best_candidate->error_handler->errors.empty() && !options.ignore_errors.GetValue()) {
-		for (auto &error_vector : best_candidate->error_handler->errors) {
-			for (auto &error : error_vector.second) {
-				if (error.type == MAXIMUM_LINE_SIZE) {
-					// If it's a maximum line size error, we can do it now.
-					error_handler->Error(error);
-				}
-			}
-		}
+	if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue()) {
+		best_candidate->error_handler->ErrorIfTypeExists(MAXIMUM_LINE_SIZE);
 	}
 	D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
 	// We are done, Set the CSV Options in the reference. Construct and return the result.
diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp
index e7a5237af..49bf105a9 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp
@@ -425,19 +425,9 @@ void CSVSniffer::DetectTypes() {
 		SetUserDefinedDateTimeFormat(*candidate->state_machine);
 		// Parse chunk and read csv with info candidate
 		auto &data_chunk = candidate->ParseChunk().ToChunk();
-		if (!candidate->error_handler->errors.empty()) {
-			bool break_loop = false;
-			for (auto &errors : candidate->error_handler->errors) {
-				for (auto &error : errors.second) {
-					if (error.type != CSVErrorType::MAXIMUM_LINE_SIZE) {
-						break_loop = true;
-						break;
-					}
-				}
-			}
-			if (break_loop && !candidate->state_machine->options.ignore_errors.GetValue()) {
-				continue;
-			}
+		if (candidate->error_handler->AnyErrors() && !candidate->error_handler->HasError(MAXIMUM_LINE_SIZE) &&
+		    !candidate->state_machine->options.ignore_errors.GetValue()) {
+			continue;
 		}
 		idx_t start_idx_detection = 0;
 		idx_t chunk_size = data_chunk.size();
@@ -463,11 +453,11 @@
 			// it's good if the dialect creates more non-varchar columns, but only if we sacrifice < 30% of
 			// best_num_cols.
-			if (!best_candidate ||
-			    (varchar_cols(info_sql_types_candidates.size())>(
-			         static_cast(max_columns_found) * 0.7) &&
-			     (!options.ignore_errors.GetValue() || candidate->error_handler->errors.size() < min_errors))) {
-				min_errors = candidate->error_handler->errors.size();
+			const idx_t number_of_errors = candidate->error_handler->GetSize();
+			if (!best_candidate || (varchar_cols(info_sql_types_candidates.size())>(
+			                            static_cast(max_columns_found) * 0.7) &&
+			                        (!options.ignore_errors.GetValue() || number_of_errors < min_errors))) {
+				min_errors = number_of_errors;
 				best_header_row.clear();
 				// we have a new best_options candidate
 				best_candidate = std::move(candidate);
diff --git a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index f6d0d386e..25691076d 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -1,5 +1,4 @@
 #include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp"
-
 #include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
 #include "duckdb/execution/operator/csv_scanner/scanner_boundary.hpp"
 #include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"
@@ -38,7 +37,7 @@ CSVGlobalState::CSVGlobalState(ClientContext &context_p, const shared_ptr
 	current_boundary = file_scans.back()->start_iterator;
-	current_boundary.SetCurrentBoundaryToPosition(single_threaded);
+	current_boundary.SetCurrentBoundaryToPosition(single_threaded, options);
 	if (current_boundary.done && context.client_data->debug_set_max_line_length) {
 		context.client_data->debug_max_line_length = current_boundary.pos.buffer_pos;
 	}
@@ -124,7 +123,7 @@ unique_ptr CSVGlobalState::Next(optional_ptr
 			current_boundary = file_scans.back()->start_iterator;
-			current_boundary.SetCurrentBoundaryToPosition(single_threaded);
+			current_boundary.SetCurrentBoundaryToPosition(single_threaded, bind_data.options);
 			current_buffer_in_use =
 			    make_shared_ptr<CSVBufferUsage>(*file_scans.back()->buffer_manager, current_boundary.GetBufferIdx());
 
@@ -158,7 +157,7 @@
 	previous_scanner->buffer_tracker = current_buffer_in_use;
 
 	// We then produce the next boundary
-	if (!current_boundary.Next(*current_file.buffer_manager)) {
+	if (!current_boundary.Next(*current_file.buffer_manager, bind_data.options)) {
 		// This means we are done scanning the current file
 		do {
 			auto current_file_idx = file_scans.back()->file_idx + 1;
@@ -169,7 +168,7 @@
 				current_boundary = file_scans.back()->start_iterator;
-				current_boundary.SetCurrentBoundaryToPosition(single_threaded);
+				current_boundary.SetCurrentBoundaryToPosition(single_threaded, bind_data.options);
 				current_buffer_in_use = make_shared_ptr<CSVBufferUsage>(*file_scans.back()->buffer_manager,
 				                                                        current_boundary.GetBufferIdx());
 			} else {
@@ -188,8 +187,8 @@ idx_t CSVGlobalState::MaxThreads() const {
 	if (single_threaded || !file_scans.front()->on_disk_file) {
 		return system_threads;
 	}
-	const idx_t total_threads = file_scans.front()->file_size / CSVIterator::BYTES_PER_THREAD + 1;
-
+	const idx_t bytes_per_thread = CSVIterator::BytesPerThread(file_scans.front()->options);
+	const idx_t total_threads = file_scans.front()->file_size / bytes_per_thread + 1;
 	if (total_threads < system_threads) {
 		return total_threads;
 	}
@@ -217,42 +216,6 @@ void CSVGlobalState::DecrementThread() {
 	}
 }
 
-bool IsCSVErrorAcceptedReject(CSVErrorType type) {
-	switch (type) {
-	case CSVErrorType::CAST_ERROR:
-	case CSVErrorType::TOO_MANY_COLUMNS:
-	case CSVErrorType::TOO_FEW_COLUMNS:
-	case CSVErrorType::MAXIMUM_LINE_SIZE:
-	case CSVErrorType::UNTERMINATED_QUOTES:
-	case CSVErrorType::INVALID_UNICODE:
-	case CSVErrorType::INVALID_STATE:
-		return true;
-	default:
-		return false;
-	}
-}
-
-string CSVErrorTypeToEnum(CSVErrorType type) {
-	switch (type) {
-	case CSVErrorType::CAST_ERROR:
-		return "CAST";
-	case CSVErrorType::TOO_FEW_COLUMNS:
-		return "MISSING COLUMNS";
-	case CSVErrorType::TOO_MANY_COLUMNS:
-		return "TOO MANY COLUMNS";
-	case CSVErrorType::MAXIMUM_LINE_SIZE:
-		return "LINE SIZE OVER MAXIMUM";
-	case CSVErrorType::UNTERMINATED_QUOTES:
-		return "UNQUOTED VALUE";
-	case CSVErrorType::INVALID_UNICODE:
-		return "INVALID UNICODE";
-	case CSVErrorType::INVALID_STATE:
-		return "INVALID STATE";
-	default:
-		throw InternalException("CSV Error is not valid to be stored in a Rejects Table");
-	}
-}
-
 void FillScanErrorTable(InternalAppender &scan_appender, idx_t scan_idx, idx_t file_idx, CSVFileScan &file) {
 	CSVReaderOptions &options = file.options;
 	// Add the row to the rejects table
@@ -326,70 +289,10 @@ void CSVGlobalState::FillRejectsTable() const {
 		InternalAppender scans_appender(context, scans_table);
 		idx_t scan_idx = context.transaction.GetActiveQuery();
 		for (auto &file : file_scans) {
-			idx_t file_idx = rejects->GetCurrentFileIndex(scan_idx);
+			const idx_t file_idx = rejects->GetCurrentFileIndex(scan_idx);
 			auto file_name = file->file_path;
-			auto &errors = file->error_handler->errors;
-			// We first insert the file into the file scans table
-			for (auto &error_vector : errors) {
-				for (auto &error : error_vector.second) {
-					if (!IsCSVErrorAcceptedReject(error.type)) {
-						continue;
-					}
-					// short circuit if we already have too many rejects
-					if (limit == 0 || rejects->count < limit) {
-						if (limit != 0 && rejects->count >= limit) {
-							break;
-						}
-						rejects->count++;
-						auto row_line = file->error_handler->GetLine(error.error_info);
-						auto col_idx = error.column_idx;
-						// Add the row to the rejects table
-						errors_appender.BeginRow();
-						// 1. Scan Id
-						errors_appender.Append(scan_idx);
-						// 2. File Id
-						errors_appender.Append(file_idx);
-						// 3. Row Line
-						errors_appender.Append(row_line);
-						// 4. Byte Position of the row error
-						errors_appender.Append(error.row_byte_position + 1);
-						// 5. Byte Position where error occurred
-						if (!error.byte_position.IsValid()) {
-							// This means this error comes from a flush, and we don't support this yet, so we give it
-							// a null
-							errors_appender.Append(Value());
-						} else {
-							errors_appender.Append(error.byte_position.GetIndex() + 1);
-						}
-						// 6. Column Index
-						if (error.type == CSVErrorType::MAXIMUM_LINE_SIZE) {
-							errors_appender.Append(Value());
-						} else {
-							errors_appender.Append(col_idx + 1);
-						}
-						// 7. Column Name (If Applicable)
-						switch (error.type) {
-						case CSVErrorType::TOO_MANY_COLUMNS:
-						case CSVErrorType::MAXIMUM_LINE_SIZE:
-							errors_appender.Append(Value());
-							break;
-						case CSVErrorType::TOO_FEW_COLUMNS:
-							D_ASSERT(bind_data.return_names.size() > col_idx + 1);
-							errors_appender.Append(string_t(bind_data.return_names[col_idx + 1]));
-							break;
-						default:
-							errors_appender.Append(string_t(bind_data.return_names[col_idx]));
-						}
-						// 8. Error Type
-						errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type)));
-						// 9. Original CSV Line
-						errors_appender.Append(string_t(error.csv_row));
-						// 10. Full Error Message
-						errors_appender.Append(string_t(error.error_message));
-						errors_appender.EndRow();
-					}
-				}
-			}
+			file->error_handler->FillRejectsTable(errors_appender, file_idx, scan_idx, *file, *rejects, bind_data,
+			                                      limit);
 			if (rejects->count != 0) {
 				rejects->count = 0;
 				FillScanErrorTable(scans_appender, scan_idx, file_idx, *file);
diff --git a/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp
index 5c01d1fe9..f77a93ac0 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -1,7 +1,10 @@
 #include "duckdb/execution/operator/csv_scanner/csv_error.hpp"
 #include "duckdb/common/exception/conversion_exception.hpp"
 #include "duckdb/common/string_util.hpp"
-
+#include "duckdb/function/table/read_csv.hpp"
+#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp"
+#include "duckdb/main/appender.hpp"
 #include <sstream>
 
 namespace duckdb {
@@ -41,11 +44,11 @@ void CSVErrorHandler::ThrowError(const CSVError &csv_error) {
 	}
 }
 
-void CSVErrorHandler::Error(CSVError csv_error, bool force_error) {
+void CSVErrorHandler::Error(const CSVError &csv_error, bool force_error) {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	if ((ignore_errors && !force_error) || (PrintLineNumber(csv_error) && !CanGetLine(csv_error.GetBoundaryIndex()))) {
 		// We store this error, we can't throw it now, or we are ignoring it
-		errors[csv_error.error_info].push_back(std::move(csv_error));
+		errors.push_back(csv_error);
 		return;
 	}
 	// Otherwise we can throw directly
@@ -58,10 +61,19 @@ void CSVErrorHandler::ErrorIfNeeded() {
 		// Nothing to error
 		return;
 	}
-	CSVError first_error = errors.begin()->second[0];
-	if (CanGetLine(first_error.error_info.boundary_idx)) {
-		ThrowError(first_error);
+	if (CanGetLine(errors[0].error_info.boundary_idx)) {
+		ThrowError(errors[0]);
+	}
+}
+
+void CSVErrorHandler::ErrorIfTypeExists(CSVErrorType error_type) {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	for (auto &error : errors) {
+		if (error.type == error_type) {
+			// If it's a maximum line size error, we can do it now.
+			ThrowError(error);
+		}
+	}
 }
 
@@ -84,6 +96,131 @@ bool CSVErrorHandler::AnyErrors() {
 	return !errors.empty();
 }
 
+bool CSVErrorHandler::HasError(const CSVErrorType error_type) {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	for (const auto &er : errors) {
+		if (er.type == error_type) {
+			return true;
+		}
+	}
+	return false;
+}
+
+idx_t CSVErrorHandler::GetSize() {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	return errors.size();
+}
+
+bool IsCSVErrorAcceptedReject(CSVErrorType type) {
+	switch (type) {
+	case CSVErrorType::INVALID_STATE:
+	case CSVErrorType::CAST_ERROR:
+	case CSVErrorType::TOO_MANY_COLUMNS:
+	case CSVErrorType::TOO_FEW_COLUMNS:
+	case CSVErrorType::MAXIMUM_LINE_SIZE:
+	case CSVErrorType::UNTERMINATED_QUOTES:
+	case CSVErrorType::INVALID_UNICODE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+string CSVErrorTypeToEnum(CSVErrorType type) {
+	switch (type) {
+	case CSVErrorType::CAST_ERROR:
+		return "CAST";
+	case CSVErrorType::TOO_FEW_COLUMNS:
+		return "MISSING COLUMNS";
+	case CSVErrorType::TOO_MANY_COLUMNS:
+		return "TOO MANY COLUMNS";
+	case CSVErrorType::MAXIMUM_LINE_SIZE:
+		return "LINE SIZE OVER MAXIMUM";
+	case CSVErrorType::UNTERMINATED_QUOTES:
+		return "UNQUOTED VALUE";
+	case CSVErrorType::INVALID_UNICODE:
+		return "INVALID UNICODE";
+	case CSVErrorType::INVALID_STATE:
+		return "INVALID STATE";
+	default:
+		throw InternalException("CSV Error is not valid to be stored in a Rejects Table");
+	}
+}
+
+void CSVErrorHandler::FillRejectsTable(InternalAppender &errors_appender, const idx_t file_idx, const idx_t scan_idx,
+                                       const CSVFileScan &file, CSVRejectsTable &rejects, const ReadCSVData &bind_data,
+                                       const idx_t limit) {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	// We first insert the file into the file scans table
+	for (auto &error : file.error_handler->errors) {
+		if (!IsCSVErrorAcceptedReject(error.type)) {
+			continue;
+		}
+		// short circuit if we already have too many rejects
+		if (limit == 0 || rejects.count < limit) {
+			if (limit != 0 && rejects.count >= limit) {
+				break;
+			}
+			rejects.count++;
+			const auto row_line = file.error_handler->GetLineInternal(error.error_info);
+			const auto col_idx = error.column_idx;
+			// Add the row to the rejects table
+			errors_appender.BeginRow();
+			// 1. Scan ID
+			errors_appender.Append(scan_idx);
+			// 2. File ID
+			errors_appender.Append(file_idx);
+			// 3. Row Line
+			errors_appender.Append(row_line);
+			// 4. Byte Position of the row error
+			errors_appender.Append(error.row_byte_position + 1);
+			// 5. Byte Position where error occurred
+			if (!error.byte_position.IsValid()) {
+				// This means this error comes from a flush, and we don't support this yet, so we give it
+				// a null
+				errors_appender.Append(Value());
+			} else {
+				errors_appender.Append(error.byte_position.GetIndex() + 1);
+			}
+			// 6. Column Index
+			if (error.type == CSVErrorType::MAXIMUM_LINE_SIZE) {
+				errors_appender.Append(Value());
+			} else {
+				errors_appender.Append(col_idx + 1);
+			}
+			// 7. Column Name (If Applicable)
+			switch (error.type) {
+			case CSVErrorType::TOO_MANY_COLUMNS:
+			case CSVErrorType::MAXIMUM_LINE_SIZE:
+				errors_appender.Append(Value());
+				break;
+			case CSVErrorType::TOO_FEW_COLUMNS:
+				D_ASSERT(bind_data.return_names.size() > col_idx + 1);
+				errors_appender.Append(string_t(bind_data.return_names[col_idx + 1]));
+				break;
+			default:
+				errors_appender.Append(string_t(bind_data.return_names[col_idx]));
+			}
+			// 8. Error Type
+			errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type)));
+			// 9. Original CSV Line
+			errors_appender.Append(string_t(error.csv_row));
+			// 10. Full Error Message
+			errors_appender.Append(string_t(error.error_message));
+			errors_appender.EndRow();
+		}
+	}
+}
+
+idx_t CSVErrorHandler::GetMaxLineLength() {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	return max_line_length;
+}
+
+void CSVErrorHandler::DontPrintErrorLine() {
+	lock_guard<mutex> parallel_lock(main_mutex);
+	print_line = false;
+}
+
 void CSVErrorHandler::SetIgnoreErrors(bool ignore_errors_p) {
 	lock_guard<mutex> parallel_lock(main_mutex);
 	ignore_errors = ignore_errors_p;
@@ -115,7 +252,6 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_
 		auto it = sql_types_per_column.find(names[i]);
 		if (it != sql_types_per_column.end()) {
 			sql_types_per_column.erase(names[i]);
-			continue;
 		}
 	}
 	if (sql_types_per_column.empty()) {
@@ -165,14 +301,14 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam
 	                how_to_fix_it.str(), current_path);
 }
 
-CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info,
-                                 string &csv_row, idx_t byte_position, const string &current_path) {
+CSVError CSVError::LineSizeError(const CSVReaderOptions &options, LinesPerBoundary error_info, string &csv_row,
+                                 idx_t byte_position, const string &current_path) {
 	std::ostringstream error;
-	error << "Maximum line size of " << options.maximum_line_size << " bytes exceeded. ";
-	error << "Actual Size:" << actual_size << " bytes." << '\n';
+	error << "Maximum line size of " << options.maximum_line_size.GetValue() << " bytes exceeded. ";
+	error << "Actual Size:" << csv_row.size() << " bytes." << '\n';
 
 	std::ostringstream how_to_fix_it;
-	how_to_fix_it << "Possible Solution: Change the maximum length size, e.g., max_line_size=" << actual_size + 1
+	how_to_fix_it << "Possible Solution: Change the maximum length size, e.g., max_line_size=" << csv_row.size() + 2
 	              << "\n";
 
 	return CSVError(error.str(), MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, byte_position, options,
@@ -301,6 +437,9 @@ CSVError CSVError::SniffingError(const CSVReaderOptions &options, const string &
 	}
 	error << "* Check you are using the correct file compression, otherwise set it (e.g., compression = \'zstd\')"
 	      << '\n';
+	error << "* Be sure that the maximum line size is set to an appropriate value, otherwise set it (e.g., "
+	         "max_line_size=10000000)"
+	      << "\n";
 	if (options.dialect_options.state_machine_options.rfc_4180.GetValue() != false ||
 	    !options.dialect_options.state_machine_options.rfc_4180.IsSetByUser()) {
diff --git a/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
index 1a2ee65da..ac18d42eb 100644
--- a/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
+++ b/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
@@ -250,7 +250,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 		if (line_size < 0) {
 			throw BinderException("Invalid value for MAX_LINE_SIZE parameter: it cannot be smaller than 0");
 		}
-		maximum_line_size = NumericCast<idx_t>(line_size);
+		maximum_line_size.Set(NumericCast<idx_t>(line_size));
 	} else if (loption == "date_format" || loption == "dateformat") {
 		string format = ParseString(value, loption);
 		SetDateFormat(LogicalTypeId::DATE, format, true);
@@ -260,8 +260,8 @@
 	} else if (loption == "ignore_errors") {
 		ignore_errors.Set(ParseBoolean(value, loption));
 	} else if (loption == "buffer_size") {
-		buffer_size = NumericCast<idx_t>(ParseInteger(value, loption));
-		if (buffer_size == 0) {
+		buffer_size_option.Set(NumericCast<idx_t>(ParseInteger(value, loption)));
+		if (buffer_size_option == 0) {
 			throw InvalidInputException("Buffer Size option must be higher than 0");
 		}
 	} else if (loption == "decimal_separator") {
@@ -546,6 +546,18 @@ void CSVReaderOptions::Verify() {
 	if (rejects_limit != 0 && !store_rejects.GetValue()) {
 		throw BinderException("REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name");
 	}
+	// Validate CSV Buffer and max_line_size do not conflict.
+	if (buffer_size_option.IsSetByUser() && maximum_line_size.IsSetByUser()) {
+		if (buffer_size_option.GetValue() < maximum_line_size.GetValue()) {
+			throw BinderException("BUFFER_SIZE option was set to %d, while MAX_LINE_SIZE was set to %d. BUFFER_SIZE "
+			                      "must always be set to a value bigger than MAX_LINE_SIZE",
+			                      buffer_size_option.GetValue(), maximum_line_size.GetValue());
+		}
+	} else if (maximum_line_size.IsSetByUser() && maximum_line_size.GetValue() > max_line_size_default) {
+		// If the max line size is set by the user and bigger than we have by default, we make it part of our buffer
+		// size decision.
+		buffer_size_option.Set(CSVBuffer::ROWS_PER_BUFFER * maximum_line_size.GetValue(), false);
+	}
 }
 
 bool GetBooleanValue(const pair &option) {
@@ -727,7 +739,7 @@
 	if (rfc_4180.IsSetByUser()) {
 		named_params["rfc_4180"] = Value(GetRFC4180());
 	}
-	named_params["max_line_size"] = Value::BIGINT(NumericCast<int64_t>(maximum_line_size));
+	named_params["max_line_size"] = Value::BIGINT(NumericCast<int64_t>(maximum_line_size.GetValue()));
 	if (dialect_options.skip_rows.IsSetByUser()) {
 		named_params["skip"] = Value::UBIGINT(GetSkipRows());
 	}
@@ -748,7 +760,6 @@
 		named_params["column_names"] = StringVectorToValue(name_list);
 	}
 	named_params["all_varchar"] = Value::BOOLEAN(all_varchar);
-	named_params["maximum_line_size"] = Value::BIGINT(NumericCast<int64_t>(maximum_line_size));
 }
 
 } // namespace duckdb
diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp
index 6bb466621..30ff6eca3 100644
--- a/src/duckdb/src/function/table/version/pragma_version.cpp
+++ b/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,5 +1,5 @@
 #ifndef DUCKDB_PATCH_VERSION
-#define DUCKDB_PATCH_VERSION "4-dev4108"
+#define DUCKDB_PATCH_VERSION "4-dev4147"
 #endif
 #ifndef DUCKDB_MINOR_VERSION
 #define DUCKDB_MINOR_VERSION 1
@@ -8,10 +8,10 @@
 #define DUCKDB_MAJOR_VERSION 1
 #endif
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "v1.1.4-dev4108"
+#define DUCKDB_VERSION "v1.1.4-dev4147"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "acdbf60889"
+#define DUCKDB_SOURCE_ID "c8fa9aee78"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp
index 039c06abe..99ce3c5ba 100644
--- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp
+++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp
@@ -55,10 +55,10 @@ class CSVBuffer {
 	shared_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number, bool &has_seaked);
 
 	//! Gets the buffer actual size
-	idx_t GetBufferSize();
+	idx_t GetBufferSize() const;
 
 	//! If this buffer is the last buffer of the CSV File
-	bool IsCSVFileLastBuffer();
+	bool IsCSVFileLastBuffer() const;
 
 	//! Allocates internal buffer, sets 'block' and 'handle' variables.
 	void AllocateBuffer(idx_t buffer_size);
@@ -77,13 +77,9 @@ class CSVBuffer {
 	}
 
 	//! By default, we use CSV_BUFFER_SIZE to allocate each buffer
-	//! TODO: Should benchmarks other values
-	static constexpr idx_t CSV_BUFFER_SIZE = 32000000; // 32MB
-	//! In case the file has a size < 32MB, we will use this size instead
-	//! This is to avoid mallocing a lot of memory for a small file
-	//! And if it's a compressed file we can't use the actual size of the file
-	static constexpr idx_t CSV_MINIMUM_BUFFER_SIZE = 8000000; // 8MB
-	//! If this is the last buffer of the CSV File
+	static constexpr idx_t ROWS_PER_BUFFER = 16;
+	static constexpr idx_t MIN_ROWS_PER_BUFFER = 4;
+
 	bool last_buffer = false;
 
 private:
diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index 6dd8a22ec..a31fd6190 100644
--- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -17,9 +17,12 @@
 #include "duckdb/execution/operator/csv_scanner/header_value.hpp"
 
 namespace duckdb {
+class InternalAppender;
+class CSVFileScan;
+class CSVRejectsTable;
+struct ReadCSVData;
 
 //! Object that holds information on how many lines each csv batch read.
-
 class LinesPerBoundary {
 public:
 	LinesPerBoundary();
@@ -27,13 +30,6 @@ class LinesPerBoundary {
 
 	idx_t boundary_idx = 0;
 	idx_t lines_in_batch = 0;
-
-	bool operator<(const LinesPerBoundary &other) const {
-		if (boundary_idx < other.boundary_idx) {
-			return true;
-		}
-		return lines_in_batch < other.lines_in_batch;
-	}
 };
 
 enum CSVErrorType : uint8_t {
@@ -63,8 +59,8 @@ class CSVError {
 	                           idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t row_byte_position,
 	                           optional_idx byte_position, LogicalTypeId type, const string &current_path);
 	//! Produces error for when the line size exceeds the maximum line size option
-	static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info,
-	                              string &csv_row, idx_t byte_position, const string &current_path);
+	static CSVError LineSizeError(const CSVReaderOptions &options, LinesPerBoundary error_info, string &csv_row,
+	                              idx_t byte_position, const string &current_path);
 	//! Produces error for when the state machine reaches an invalid state
 	static CSVError InvalidState(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info,
 	                             string &csv_row, idx_t row_byte_position, optional_idx byte_position,
@@ -121,26 +117,27 @@ class CSVErrorHandler {
 public:
 	explicit CSVErrorHandler(bool ignore_errors = false);
 	//! Throws the error
-	void Error(CSVError csv_error, bool force_error = false);
+	void Error(const CSVError &csv_error, bool force_error = false);
 	//! If we have a cached error, and we can now error, we error.
 	void ErrorIfNeeded();
+	//! Throws an error if a given type exists
+	void ErrorIfTypeExists(CSVErrorType error_type);
 	//! Inserts a finished error info
 	void Insert(idx_t boundary_idx, idx_t rows);
 	idx_t GetLine(const LinesPerBoundary &error_info);
 	void NewMaxLineSize(idx_t scan_line_size);
 	//! Returns true if there are any errors
 	bool AnyErrors();
-	//! Set of errors
-	map<LinesPerBoundary, vector<CSVError>> errors;
+	bool HasError(CSVErrorType error_type);
+	idx_t GetMaxLineLength();
 
-	idx_t GetMaxLineLength() const {
-		return max_line_length;
-	}
-	void DontPrintErrorLine() {
-		print_line = false;
-	}
+	void DontPrintErrorLine();
 	void SetIgnoreErrors(bool ignore_errors);
+	idx_t GetSize();
+
+	void FillRejectsTable(InternalAppender &errors_appender, idx_t file_idx, idx_t scan_idx, const CSVFileScan &file,
+	                      CSVRejectsTable &rejects, const ReadCSVData &bind_data, idx_t limit);
 
 private:
 	//! Private methods should always be locked by parent method.
@@ -159,6 +156,8 @@ class CSVErrorHandler {
 	idx_t max_line_length = 0;
 	bool ignore_errors = false;
 	bool print_line = true;
+	//! Set of errors
+	vector<CSVError> errors;
 };
 
 } // namespace duckdb
diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
index 0fc506659..4e0ee86fd 100644
--- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
+++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
@@ -87,7 +87,8 @@ struct CSVReaderOptions {
 	//===--------------------------------------------------------------------===//
 	//! Maximum CSV line size: specified because if we reach this amount, we likely have wrong delimiters (default: 2MB)
 	//! note that this is the guaranteed line length that will succeed, longer lines may be accepted if slightly above
-	idx_t maximum_line_size = 2097152;
+	static constexpr idx_t max_line_size_default = 2000000;
+	CSVOption<idx_t> maximum_line_size = max_line_size_default;
 	//! Whether header names shall be normalized
 	bool normalize_names = false;
 	//! True, if column with that index must skip null check
@@ -108,7 +109,7 @@
 	//! Multi-file reader options
 	MultiFileReaderOptions file_options;
 	//! Buffer Size (Parallel Scan)
-	idx_t buffer_size = CSVBuffer::CSV_BUFFER_SIZE;
+	CSVOption<idx_t> buffer_size_option = CSVBuffer::ROWS_PER_BUFFER * max_line_size_default;
 	//! Decimal separator when reading as numeric
 	string decimal_separator = ".";
 	//! Whether to pad rows that do not have enough columns with NULL values
diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/scanner_boundary.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/scanner_boundary.hpp
index 9f0d97820..da796d79e 100644
--- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/scanner_boundary.hpp
+++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/scanner_boundary.hpp
@@ -27,7 +27,7 @@ namespace duckdb {
 struct CSVBoundary {
 	CSVBoundary(idx_t buffer_idx, idx_t buffer_pos, idx_t boundary_idx, idx_t end_pos);
 	CSVBoundary();
-	void Print();
+	void Print() const;
 	//! Start Buffer index of the file where we start scanning
 	idx_t buffer_idx = 0;
 	//! Start Buffer position of the buffer of the file where we start scanning
@@ -53,10 +53,10 @@ struct CSVIterator {
 public:
 	CSVIterator();
 
-	void Print();
+	void Print() const;
 	//! Moves the boundary to the next one to be scanned, if there are no next boundaries, it returns False
 	//! Otherwise, if there are boundaries, it returns True
-	bool Next(CSVBufferManager &buffer_manager);
+	bool Next(CSVBufferManager &buffer_manager, const CSVReaderOptions &reader_options);
 	//! If boundary is set
 	bool IsBoundarySet() const;
 
@@ -67,16 +67,20 @@ struct CSVIterator {
 	void SetCurrentPositionToBoundary();
 
-	void SetCurrentBoundaryToPosition(bool single_threaded);
+	void SetCurrentBoundaryToPosition(bool single_threaded, const CSVReaderOptions &reader_options);
 
 	void SetStart(idx_t pos);
 	void SetEnd(idx_t pos);
 
 	// Gets the current position for the file
-	idx_t GetGlobalCurrentPos();
+	idx_t GetGlobalCurrentPos() const;
 
-	//! 8 MB TODO: Should benchmarks other values
-	static constexpr idx_t BYTES_PER_THREAD = 8000000;
+	//! Checks if we are done with this iterator
+	void CheckIfDone();
+
+	static idx_t BytesPerThread(const CSVReaderOptions &reader_options);
+
+	static constexpr idx_t ROWS_PER_THREAD = 4;
 
 	CSVPosition pos;
 
diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
index ad2250660..a2a3d5372 100644
--- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
+++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
@@ -96,7 +96,8 @@ class CurrentError {
 
 class LineError {
 public:
-	explicit LineError(bool ignore_errors_p) : is_error_in_line(false), ignore_errors(ignore_errors_p) {};
+	explicit LineError(const idx_t scan_id_p, const bool ignore_errors_p)
+	    : is_error_in_line(false), ignore_errors(ignore_errors_p), scan_id(scan_id_p) {};
 	//! We clear up our CurrentError Vector
 	void Reset() {
 		current_errors.clear();
@@ -136,10 +137,15 @@
 		return !current_errors.empty();
 	}
 
+	idx_t Size() const {
+		return current_errors.size();
+	}
+
 private:
 	vector<CurrentError> current_errors;
 	bool is_error_in_line;
 	bool ignore_errors;
+	idx_t scan_id;
 };
 
 struct ParseTypeInfo {
@@ -159,13 +165,14 @@ struct ParseTypeInfo {
 	uint8_t scale;
 	uint8_t width;
 };
+
 class StringValueResult : public ScannerResult {
 public:
 	StringValueResult(CSVStates &states, CSVStateMachine &state_machine,
 	                  const shared_ptr<CSVBufferHandle> &buffer_handle, Allocator &buffer_allocator,
 	                  idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_handler, CSVIterator &iterator,
 	                  bool store_line_size, shared_ptr<CSVFileScan> csv_file_scan, idx_t &lines_read, bool sniffing,
-	                  string path);
+	                  string path, idx_t scan_id);
 
 	~StringValueResult();
@@ -324,6 +331,8 @@ class StringValueScanner : public BaseScanner {
 	ValidatorLine GetValidationLine();
 
 	const idx_t scanner_idx;
+	//! We use the max of idx_t to signify this is a line finder scanner.
+	static constexpr idx_t LINE_FINDER_ID = NumericLimits<idx_t>::Maximum();
 	//! Variable that manages buffer tracking
 	shared_ptr<CSVBufferUsage> buffer_tracker;
 
diff --git a/src/duckdb/src/storage/serialization/serialize_nodes.cpp b/src/duckdb/src/storage/serialization/serialize_nodes.cpp
index 59fc2681d..6c0bc4ab1 100644
--- a/src/duckdb/src/storage/serialization/serialize_nodes.cpp
+++ b/src/duckdb/src/storage/serialization/serialize_nodes.cpp
@@ -174,7 +174,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const {
 	serializer.WritePropertyWithDefault<vector<string>>(102, "null_str", null_str);
 	serializer.WriteProperty(103, "compression", compression);
 	serializer.WritePropertyWithDefault(104, "allow_quoted_nulls", allow_quoted_nulls);
-	serializer.WritePropertyWithDefault(105, "maximum_line_size", maximum_line_size);
+	serializer.WriteProperty<CSVOption<idx_t>>(105, "maximum_line_size", maximum_line_size);
 	serializer.WritePropertyWithDefault(106, "normalize_names", normalize_names);
 	serializer.WritePropertyWithDefault<vector<bool>>(107, "force_not_null", force_not_null);
 	serializer.WritePropertyWithDefault(108, "all_varchar", all_varchar);
@@ -183,7 +183,7 @@
 	serializer.WritePropertyWithDefault(111, "file_path", file_path);
 	serializer.WritePropertyWithDefault(112, "decimal_separator", decimal_separator);
 	serializer.WritePropertyWithDefault(113, "null_padding", null_padding);
-	serializer.WritePropertyWithDefault(114, "buffer_size", buffer_size);
+	/* [Deleted] (idx_t) "buffer_size" */
 	serializer.WriteProperty(115, "file_options", file_options);
 	serializer.WritePropertyWithDefault<vector<bool>>(116, "force_quote", force_quote);
 	serializer.WritePropertyWithDefault(117, "rejects_table_name", rejects_table_name, "reject_errors");
@@ -212,6 +212,7 @@
 	serializer.WriteProperty<CSVOption<bool>>(140, "rfc_4180", dialect_options.state_machine_options.rfc_4180);
 	serializer.WriteProperty<CSVOption<string>>(141, "multi_byte_delimiter", GetMultiByteDelimiter());
 	serializer.WritePropertyWithDefault(142, "multi_file_reader", multi_file_reader);
+	serializer.WriteProperty<CSVOption<idx_t>>(143, "buffer_size_option", buffer_size_option);
 }
 
 CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) {
@@ -220,7 +221,7 @@
 	auto null_str = deserializer.ReadPropertyWithDefault<vector<string>>(102, "null_str");
 	auto compression = deserializer.ReadProperty(103, "compression");
 	auto allow_quoted_nulls = deserializer.ReadPropertyWithDefault(104, "allow_quoted_nulls");
-	auto maximum_line_size = deserializer.ReadPropertyWithDefault(105, "maximum_line_size");
+	auto maximum_line_size = deserializer.ReadProperty<CSVOption<idx_t>>(105, "maximum_line_size");
 	auto normalize_names = deserializer.ReadPropertyWithDefault(106, "normalize_names");
 	auto force_not_null = deserializer.ReadPropertyWithDefault<vector<bool>>(107, "force_not_null");
 	auto all_varchar = deserializer.ReadPropertyWithDefault(108, "all_varchar");
@@ -229,7 +230,7 @@
 	auto file_path = deserializer.ReadPropertyWithDefault(111, "file_path");
 	auto decimal_separator = deserializer.ReadPropertyWithDefault(112, "decimal_separator");
 	auto null_padding = deserializer.ReadPropertyWithDefault(113, "null_padding");
-	auto buffer_size = deserializer.ReadPropertyWithDefault(114, "buffer_size");
+	deserializer.ReadDeletedProperty(114, "buffer_size");
 	auto file_options = deserializer.ReadProperty(115, "file_options");
 	auto force_quote = deserializer.ReadPropertyWithDefault<vector<bool>>(116, "force_quote");
 	auto rejects_table_name = deserializer.ReadPropertyWithExplicitDefault(117, "rejects_table_name", "reject_errors");
@@ -272,7 +273,6 @@
 	result.file_path = std::move(file_path);
 	result.decimal_separator = std::move(decimal_separator);
 	result.null_padding = null_padding;
-	result.buffer_size = buffer_size;
 	result.file_options = file_options;
 	result.force_quote = std::move(force_quote);
 	result.rejects_table_name = std::move(rejects_table_name);
@@ -297,6 +297,7 @@
 	result.encoding = std::move(encoding);
 	result.dialect_options.state_machine_options.rfc_4180 = dialect_options_state_machine_options_rfc_4180;
 	deserializer.ReadPropertyWithDefault(142, "multi_file_reader", result.multi_file_reader);
+	deserializer.ReadProperty<CSVOption<idx_t>>(143, "buffer_size_option", result.buffer_size_option);
 	return result;
 }

From fb495c09af6e743744fd0f978be7411a62f2de64 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:10:25 +0000
Subject: [PATCH 04/21] vendor: Update vendored sources to
 duckdb/duckdb@ce33de966a521d1a6e86ec9579e133ff2b2534f4 (#978)

More bugfixes (duckdb/duckdb#15605)

Co-authored-by: krlmlr
---
 src/duckdb/extension/parquet/column_reader.cpp         |  8 ++++++++
 src/duckdb/extension/parquet/include/column_reader.hpp |  4 ++++
 src/duckdb/extension/parquet/parquet_extension.cpp     |  8 +++++++-
 src/duckdb/extension/parquet/parquet_reader.cpp        |  5 +++--
 src/duckdb/src/execution/aggregate_hashtable.cpp       |  3 +++
 src/duckdb/src/function/scalar/sequence/nextval.cpp    |  3 +++
 src/duckdb/src/function/scalar/string/length.cpp       | 10 ++++++----
 .../src/function/table/version/pragma_version.cpp      |  6 +++---
 .../binder/expression/bind_operator_expression.cpp     |  3 +++
 .../src/planner/binder/statement/bind_create.cpp       |  2 +-
 src/duckdb/src/storage/arena_allocator.cpp             |  9 +++++----
 11 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/duckdb/extension/parquet/column_reader.cpp b/src/duckdb/extension/parquet/column_reader.cpp
index cf1f5ba50..8a2112c35 100644
--- a/src/duckdb/extension/parquet/column_reader.cpp
+++ b/src/duckdb/extension/parquet/column_reader.cpp
@@ -135,6 +135,14 @@ const SchemaElement &ColumnReader::Schema() const {
 	return schema;
 }
 
+optional_ptr<const SchemaElement> ColumnReader::GetParentSchema() const {
+	return parent_schema;
+}
+
+void ColumnReader::SetParentSchema(const SchemaElement &parent_schema_p) {
+	parent_schema = &parent_schema_p;
+}
+
 idx_t ColumnReader::FileIdx() const {
 	return file_idx;
 }
diff --git a/src/duckdb/extension/parquet/include/column_reader.hpp b/src/duckdb/extension/parquet/include/column_reader.hpp
index dd89cadd5..23d4fc3d4 100644
--- a/src/duckdb/extension/parquet/include/column_reader.hpp
+++ b/src/duckdb/extension/parquet/include/column_reader.hpp
@@ -57,6 +57,9 @@ class ColumnReader {
 	ParquetReader &Reader();
 	const LogicalType &Type() const;
 	const SchemaElement &Schema() const;
+	optional_ptr<const SchemaElement> GetParentSchema() const;
+	void SetParentSchema(const SchemaElement &parent_schema);
+
 	idx_t FileIdx() const;
 	idx_t MaxDefine() const;
 	idx_t MaxRepeat() const;
@@ -140,6 +143,7 @@ class ColumnReader {
 
 protected:
 	const SchemaElement &schema;
+	optional_ptr<const SchemaElement> parent_schema;
 
 	idx_t file_idx;
 	idx_t max_define;
diff --git a/src/duckdb/extension/parquet/parquet_extension.cpp b/src/duckdb/extension/parquet/parquet_extension.cpp
index 8cbea8090..500b89919 100644
a/src/duckdb/extension/parquet/parquet_extension.cpp +++ b/src/duckdb/extension/parquet/parquet_extension.cpp @@ -313,9 +313,15 @@ static void InitializeParquetReader(ParquetReader &reader, const ParquetReadBind unordered_map field_id_to_column_index; auto &column_readers = reader.root_reader->Cast().child_readers; for (idx_t column_index = 0; column_index < column_readers.size(); column_index++) { - auto &column_schema = column_readers[column_index]->Schema(); + auto &column_reader = *column_readers[column_index]; + auto &column_schema = column_reader.Schema(); if (column_schema.__isset.field_id) { field_id_to_column_index[column_schema.field_id] = column_index; + } else if (column_reader.GetParentSchema()) { + auto &parent_column_schema = *column_reader.GetParentSchema(); + if (parent_column_schema.__isset.field_id) { + field_id_to_column_index[parent_column_schema.field_id] = column_index; + } } } diff --git a/src/duckdb/extension/parquet/parquet_reader.cpp b/src/duckdb/extension/parquet/parquet_reader.cpp index e6ce7aa92..1d2565a2d 100644 --- a/src/duckdb/extension/parquet/parquet_reader.cpp +++ b/src/duckdb/extension/parquet/parquet_reader.cpp @@ -418,9 +418,10 @@ unique_ptr ParquetReader::CreateReaderRecursive(ClientContext &con } if (is_repeated) { result_type = LogicalType::LIST(result_type); - return make_uniq(*this, result_type, s_ele, this_idx, max_define, max_repeat, - std::move(result)); + result = make_uniq(*this, result_type, s_ele, this_idx, max_define, max_repeat, + std::move(result)); } + result->SetParentSchema(s_ele); return result; } else { // leaf node if (!s_ele.__isset.type) { diff --git a/src/duckdb/src/execution/aggregate_hashtable.cpp b/src/duckdb/src/execution/aggregate_hashtable.cpp index 8a2dd448c..198b77fac 100644 --- a/src/duckdb/src/execution/aggregate_hashtable.cpp +++ b/src/duckdb/src/execution/aggregate_hashtable.cpp @@ -131,6 +131,9 @@ void GroupedAggregateHashTable::Abandon() { // Start over ClearPointerTable(); count = 0; + + // Resetting the id ensures the dict state is reset properly when needed + state.dict_state.dictionary_id = string(); } void GroupedAggregateHashTable::Repartition() { diff --git a/src/duckdb/src/function/scalar/sequence/nextval.cpp b/src/duckdb/src/function/scalar/sequence/nextval.cpp index 7e19dae7d..6738383dc 100644 --- a/src/duckdb/src/function/scalar/sequence/nextval.cpp +++ b/src/duckdb/src/function/scalar/sequence/nextval.cpp @@ -91,6 +91,9 @@ static void NextValFunction(DataChunk &args, ExpressionState &state, Vector &res static unique_ptr NextValBind(ScalarFunctionBindInput &bind_input, ScalarFunction &, vector> &arguments) { + if (arguments[0]->HasParameter() || arguments[0]->return_type.id() == LogicalTypeId::UNKNOWN) { + throw ParameterNotResolvedException(); + } if (!arguments[0]->IsFoldable()) { throw NotImplementedException( "currval/nextval requires a constant sequence - non-constant sequences are no longer supported"); diff --git a/src/duckdb/src/function/scalar/string/length.cpp b/src/duckdb/src/function/scalar/string/length.cpp index f2ec8bae1..538646419 100644 --- a/src/duckdb/src/function/scalar/string/length.cpp +++ b/src/duckdb/src/function/scalar/string/length.cpp @@ -110,12 +110,14 @@ static void ArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector static unique_ptr ArrayOrListLengthBind(ClientContext &context, ScalarFunction &bound_function, vector> &arguments) { - if (arguments[0]->HasParameter()) { + if (arguments[0]->HasParameter() || arguments[0]->return_type.id() == 
LogicalTypeId::UNKNOWN) { throw ParameterNotResolvedException(); } - if (arguments[0]->return_type.id() == LogicalTypeId::ARRAY) { + + const auto &arg_type = arguments[0]->return_type.id(); + if (arg_type == LogicalTypeId::ARRAY) { bound_function.function = ArrayLengthFunction; - } else if (arguments[0]->return_type.id() == LogicalTypeId::LIST) { + } else if (arg_type == LogicalTypeId::LIST) { bound_function.function = ListLengthFunction; } else { // Unreachable @@ -183,7 +185,7 @@ static void ArrayLengthBinaryFunction(DataChunk &args, ExpressionState &state, V static unique_ptr ArrayOrListLengthBinaryBind(ClientContext &context, ScalarFunction &bound_function, vector> &arguments) { - if (arguments[0]->HasParameter()) { + if (arguments[0]->HasParameter() || arguments[0]->return_type.id() == LogicalTypeId::UNKNOWN) { throw ParameterNotResolvedException(); } auto type = arguments[0]->return_type; diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 30ff6eca3..c1d5d02cd 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4147" +#define DUCKDB_PATCH_VERSION "4-dev4157" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4147" +#define DUCKDB_VERSION "v1.1.4-dev4157" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "c8fa9aee78" +#define DUCKDB_SOURCE_ID "ce33de966a" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp b/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp index 81addd406..9326c8849 100644 --- a/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +++ b/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp @@ -134,6 +134,9 @@ BindResult ExpressionBinder::BindExpression(OperatorExpression &op, idx_t depth) D_ASSERT(op.children[0]->GetExpressionClass() == ExpressionClass::BOUND_EXPRESSION); D_ASSERT(op.children[1]->GetExpressionClass() == ExpressionClass::BOUND_EXPRESSION); auto &extract_exp = BoundExpression::GetExpression(*op.children[0]); + if (extract_exp->HasParameter() || extract_exp->return_type.id() == LogicalTypeId::UNKNOWN) { + throw ParameterNotResolvedException(); + } auto &name_exp = BoundExpression::GetExpression(*op.children[1]); const auto &extract_expr_type = extract_exp->return_type; if (extract_expr_type.id() != LogicalTypeId::STRUCT && extract_expr_type.id() != LogicalTypeId::UNION && diff --git a/src/duckdb/src/planner/binder/statement/bind_create.cpp b/src/duckdb/src/planner/binder/statement/bind_create.cpp index 495a687e6..8d53446fc 100644 --- a/src/duckdb/src/planner/binder/statement/bind_create.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_create.cpp @@ -196,7 +196,7 @@ SchemaCatalogEntry &Binder::BindCreateFunctionInfo(CreateInfo &info) { if (param.IsQualified()) { throw BinderException("Invalid parameter name '%s': must be unqualified", param.ToString()); } - dummy_types.emplace_back(LogicalType::SQLNULL); + dummy_types.emplace_back(LogicalType::UNKNOWN); dummy_names.push_back(param.GetColumnName()); } // default parameters diff --git a/src/duckdb/src/storage/arena_allocator.cpp 
b/src/duckdb/src/storage/arena_allocator.cpp index a67eeb286..000444f75 100644 --- a/src/duckdb/src/storage/arena_allocator.cpp +++ b/src/duckdb/src/storage/arena_allocator.cpp @@ -107,12 +107,13 @@ data_ptr_t ArenaAllocator::Reallocate(data_ptr_t pointer, idx_t old_size, idx_t return pointer; } - auto head_ptr = head->data.get() + head->current_position; + const auto head_ptr = head->data.get() + head->current_position - old_size; + int64_t current_position = NumericCast(head->current_position); int64_t diff = NumericCast(size) - NumericCast(old_size); - if (pointer == head_ptr && (size < old_size || NumericCast(head->current_position) + diff <= - NumericCast(head->maximum_size))) { + if (pointer == head_ptr && + (size < old_size || current_position + diff <= NumericCast(head->maximum_size))) { // passed pointer is the head pointer, and the diff fits on the current chunk - head->current_position += NumericCast(diff); + head->current_position = NumericCast(current_position + diff); return pointer; } else { // allocate new memory
From 0dc66384eab9fd22bb29dacc6d1f7060d017f362 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:33:33 +0000 Subject: [PATCH 05/21] vendor: Update vendored sources to duckdb/duckdb@69dff93d1a956c68eb9dc603d24d06719e57749d (#979) csv_scanner: fix order of evaluation of arguments to a function (duckdb/duckdb#15609) Co-authored-by: krlmlr --- .../csv_scanner/table_function/global_csv_state.cpp | 3 ++- src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 25691076d..b1a4a6165 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -27,7 +27,8 @@ CSVGlobalState::CSVGlobalState(ClientContext &context_p, const shared_ptrstart_iterator.done && file_scans.size() < files.size()) { - file_scans.emplace_back(make_uniq(context, files[++cur_file_idx], options, cur_file_idx, bind_data, + cur_file_idx++; + file_scans.emplace_back(make_uniq(context, files[cur_file_idx], options, cur_file_idx, bind_data, column_ids, file_schema, false)); } // There are situations where we only support single threaded scanning diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index c1d5d02cd..4641dd23e 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4157" +#define DUCKDB_PATCH_VERSION "4-dev4159" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 #endif @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4157" +#define DUCKDB_VERSION "v1.1.4-dev4159" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "ce33de966a" +#define DUCKDB_SOURCE_ID "69dff93d1a" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp"
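
The one-line change in global_csv_state.cpp above exists because C++ leaves the order in which function arguments are evaluated unspecified: in the original code, files[++cur_file_idx] and the bare cur_file_idx were two arguments of the same make_uniq call, so the constructed scan could see a file and an index that disagree (and before C++17 the unsequenced modification-and-read was outright undefined behaviour). A minimal standalone sketch of the pitfall; OpenScan and its arguments are illustrative stand-ins, not DuckDB API:

    #include <cstdio>

    static void OpenScan(int file_slot, int reported_idx) {
        std::printf("scanning slot %d, reported as index %d\n", file_slot, reported_idx);
    }

    int main() {
        int cur_file_idx = 0;
        // Buggy pattern: the compiler may read 'cur_file_idx' for the second
        // argument before or after '++cur_file_idx' runs, so the two arguments
        // can disagree depending on the compiler:
        //     OpenScan(++cur_file_idx, cur_file_idx);
        // The patched form sequences the increment before the call, so both
        // arguments always observe the same value:
        cur_file_idx++;
        OpenScan(cur_file_idx, cur_file_idx);
        return 0;
    }
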
From 9f5e2bb3697d06c8c692b90e9988dd8eab0b2e47 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:01:12 +0000 Subject: [PATCH 06/21] vendor: Update vendored sources to duckdb/duckdb@dcb9627543af52e4322de464003259a6b0e7fdb4 (#980) [Dev] Fix an unnecessary copy in Dictionary compression (duckdb/duckdb#15594) Co-authored-by: krlmlr --- src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++--- .../duckdb/storage/compression/dictionary/compression.hpp | 1 - .../src/storage/compression/dictionary/compression.cpp | 7 ++++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 4641dd23e..5a680f9c6 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4159" +#define DUCKDB_PATCH_VERSION "4-dev4161" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 #endif @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4159" +#define DUCKDB_VERSION "v1.1.4-dev4161" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "69dff93d1a" +#define DUCKDB_SOURCE_ID "dcb9627543" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp b/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp index b0f29dc59..09f1f44bd 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp @@ -47,7 +47,6 @@ struct DictionaryCompressionCompressState : public DictionaryCompressionState { data_ptr_t current_end_ptr; // Buffers and map for current segment - StringHeap heap; string_map_t current_string_map; vector index_buffer; vector selection_buffer; diff --git a/src/duckdb/src/storage/compression/dictionary/compression.cpp b/src/duckdb/src/storage/compression/dictionary/compression.cpp index 064697fc7..64a3db402 100644 --- a/src/duckdb/src/storage/compression/dictionary/compression.cpp +++ b/src/duckdb/src/storage/compression/dictionary/compression.cpp @@ -6,8 +6,7 @@ namespace duckdb { DictionaryCompressionCompressState::DictionaryCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p, const CompressionInfo &info) : DictionaryCompressionState(info), checkpoint_data(checkpoint_data_p), - function(checkpoint_data.GetCompressionFunction(CompressionType::COMPRESSION_DICTIONARY)), - heap(BufferAllocator::Get(checkpoint_data.GetDatabase())) { + function(checkpoint_data.GetCompressionFunction(CompressionType::COMPRESSION_DICTIONARY)) { CreateEmptySegment(checkpoint_data.GetRowGroup().start); } @@ -72,7 +71,9 @@ void DictionaryCompressionCompressState::AddNewString(string_t str) { if (str.IsInlined()) { current_string_map.insert({str, index_buffer.size() - 1}); } else { - current_string_map.insert({heap.AddBlob(str), index_buffer.size() - 1}); + string_t dictionary_string((const char *)dict_pos, UnsafeNumericCast(str.GetSize())); // NOLINT + D_ASSERT(!dictionary_string.IsInlined()); + current_string_map.insert({dictionary_string, index_buffer.size() - 1}); } DictionaryCompression::SetDictionary(*current_segment, current_handle, current_dictionary);
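
The removal of the StringHeap above works because, by the time AddNewString deduplicates a long string, its bytes have already been written into the segment's dictionary buffer at dict_pos, and that buffer stays pinned while the segment is being compressed; the map can therefore key on a string_t pointing straight into the dictionary instead of a second heap copy. The D_ASSERT(!dictionary_string.IsInlined()) guards the invariant that only non-inlined strings take this path. A standalone sketch of the idea, with std::vector and std::string_view standing in for the pinned buffer and string_t:

    #include <cstdint>
    #include <string_view>
    #include <unordered_map>
    #include <vector>

    int main() {
        std::vector<char> dictionary;
        dictionary.reserve(1024); // keeps the data pointer stable for this sketch
        std::unordered_map<std::string_view, uint32_t> current_string_map;

        auto add_new_string = [&](std::string_view str) {
            const char *dict_pos = dictionary.data() + dictionary.size();
            dictionary.insert(dictionary.end(), str.begin(), str.end());
            // Key the map on the bytes inside the dictionary: no extra heap copy.
            current_string_map.emplace(std::string_view(dict_pos, str.size()),
                                       static_cast<uint32_t>(current_string_map.size()));
        };
        add_new_string("duckdb");
        add_new_string("dictionary");
        return current_string_map.count("duckdb") == 1 ? 0 : 1;
    }
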
From f4313720f00957d4a7c1dff372a573c18a443a76 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 01:17:18 +0000 Subject: [PATCH 07/21] fledge: Bump version to 1.1.3.9035 (#982) Co-authored-by: krlmlr --- DESCRIPTION | 2 +- NEWS.md | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 87528928b..f5190378f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: duckdb Title: DBI Package for the DuckDB Database Management System -Version: 1.1.3.9034 +Version: 1.1.3.9035 Authors@R: c( person("Hannes", "Mühleisen", , "hannes@cwi.nl", role = "aut", comment = c(ORCID = "0000-0001-8552-0029")), diff --git a/NEWS.md b/NEWS.md index 2f6f554ee..0b901bc93 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,18 @@ +# duckdb 1.1.3.9035 + +## vendor + +- Update vendored sources to duckdb/duckdb@dcb9627543af52e4322de464003259a6b0e7fdb4 (#980). + +- Update vendored sources to duckdb/duckdb@69dff93d1a956c68eb9dc603d24d06719e57749d (#979). + +- Update vendored sources to duckdb/duckdb@ce33de966a521d1a6e86ec9579e133ff2b2534f4 (#978). + +- Update vendored sources to duckdb/duckdb@c8fa9aee7858909c625b5c3abcc3a257c5d9d934 (#977). + + # duckdb 1.1.3.9034 ## vendor
From d3de74adb3a1954b849f3f9f640cab395a62873a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:46:51 +0000 Subject: [PATCH 08/21] vendor: Update vendored sources to duckdb/duckdb@fbc4d92fe6239536948a49d093196534c3492656 (#983) [Compression] `Dictionary` compression data now also includes the validity data (duckdb/duckdb#15591) Update URLs (duckdb/duckdb#15617) [Julia] Fix incorrect types (duckdb/duckdb#15612) Co-authored-by: krlmlr --- src/duckdb/src/common/enum_util.cpp | 24 ++++- .../src/common/enums/compression_type.cpp | 4 + .../src/function/compression_config.cpp | 2 + .../function/table/version/pragma_version.cpp | 6 +- .../src/include/duckdb/common/enum_util.hpp | 8 ++ .../duckdb/common/enums/compression_type.hpp | 5 +- .../function/compression/compression.hpp | 5 + .../duckdb/function/compression_function.hpp | 6 ++ .../storage/compression/empty_validity.hpp | 100 ++++++++++++++++++ .../duckdb/storage/table/column_data.hpp | 1 + .../table/column_data_checkpointer.hpp | 1 + src/duckdb/src/main/database.cpp | 1 + .../compression/dictionary/decompression.cpp | 11 +- .../compression/dictionary_compression.cpp | 4 +- .../storage/compression/empty_validity.cpp | 15 +++ src/duckdb/src/storage/table/column_data.cpp | 10 ++ .../table/column_data_checkpointer.cpp | 35 ++++-- src/duckdb/ub_src_storage_compression.cpp | 2 + 18 files changed, 223 insertions(+), 17 deletions(-) create mode 100644 src/duckdb/src/include/duckdb/storage/compression/empty_validity.hpp create mode 100644 src/duckdb/src/storage/compression/empty_validity.cpp diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index 23d9c6e27..6baa6854f 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -86,6 +86,7 @@ #include "duckdb/execution/operator/csv_scanner/quote_rules.hpp" #include "duckdb/execution/reservoir_sample.hpp" #include "duckdb/function/aggregate_state.hpp" +#include "duckdb/function/compression_function.hpp" #include "duckdb/function/copy_function.hpp" #include "duckdb/function/function.hpp" #include "duckdb/function/macro_function.hpp" @@ -859,6 +860,7 @@ const StringUtil::EnumStringLiteral *GetCompressionTypeValues() { { static_cast(CompressionType::COMPRESSION_ALPRD), "COMPRESSION_ALPRD" }, {
static_cast(CompressionType::COMPRESSION_ZSTD), "COMPRESSION_ZSTD" }, { static_cast(CompressionType::COMPRESSION_ROARING), "COMPRESSION_ROARING" }, + { static_cast(CompressionType::COMPRESSION_EMPTY), "COMPRESSION_EMPTY" }, { static_cast(CompressionType::COMPRESSION_COUNT), "COMPRESSION_COUNT" } }; return values; @@ -866,12 +868,30 @@ const StringUtil::EnumStringLiteral *GetCompressionTypeValues() { template<> const char* EnumUtil::ToChars(CompressionType value) { - return StringUtil::EnumToString(GetCompressionTypeValues(), 15, "CompressionType", static_cast(value)); + return StringUtil::EnumToString(GetCompressionTypeValues(), 16, "CompressionType", static_cast(value)); } template<> CompressionType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetCompressionTypeValues(), 15, "CompressionType", value)); + return static_cast(StringUtil::StringToEnum(GetCompressionTypeValues(), 16, "CompressionType", value)); +} + +const StringUtil::EnumStringLiteral *GetCompressionValidityValues() { + static constexpr StringUtil::EnumStringLiteral values[] { + { static_cast(CompressionValidity::REQUIRES_VALIDITY), "REQUIRES_VALIDITY" }, + { static_cast(CompressionValidity::NO_VALIDITY_REQUIRED), "NO_VALIDITY_REQUIRED" } + }; + return values; +} + +template<> +const char* EnumUtil::ToChars(CompressionValidity value) { + return StringUtil::EnumToString(GetCompressionValidityValues(), 2, "CompressionValidity", static_cast(value)); +} + +template<> +CompressionValidity EnumUtil::FromString(const char *value) { + return static_cast(StringUtil::StringToEnum(GetCompressionValidityValues(), 2, "CompressionValidity", value)); } const StringUtil::EnumStringLiteral *GetConflictManagerModeValues() { diff --git a/src/duckdb/src/common/enums/compression_type.cpp b/src/duckdb/src/common/enums/compression_type.cpp index b0528a7dd..3bc66fe08 100644 --- a/src/duckdb/src/common/enums/compression_type.cpp +++ b/src/duckdb/src/common/enums/compression_type.cpp @@ -24,6 +24,8 @@ bool CompressionTypeIsDeprecated(CompressionType compression_type) { CompressionType CompressionTypeFromString(const string &str) { auto compression = StringUtil::Lower(str); + //! NOTE: this explicitly does not include 'constant' and 'empty validity', these are internal compression functions + //! 
not general purpose if (compression == "uncompressed") { return CompressionType::COMPRESSION_UNCOMPRESSED; } else if (compression == "rle") { @@ -83,6 +85,8 @@ string CompressionTypeToString(CompressionType type) { return "ALPRD"; case CompressionType::COMPRESSION_ROARING: return "Roaring"; + case CompressionType::COMPRESSION_EMPTY: + return "Empty Validity"; default: throw InternalException("Unrecognized compression type!"); } diff --git a/src/duckdb/src/function/compression_config.cpp b/src/duckdb/src/function/compression_config.cpp index c0a7f1609..deece1a85 100644 --- a/src/duckdb/src/function/compression_config.cpp +++ b/src/duckdb/src/function/compression_config.cpp @@ -28,6 +28,8 @@ static const DefaultCompressionMethod internal_compression_methods[] = { {CompressionType::COMPRESSION_FSST, FSSTFun::GetFunction, FSSTFun::TypeIsSupported}, {CompressionType::COMPRESSION_ZSTD, ZSTDFun::GetFunction, ZSTDFun::TypeIsSupported}, {CompressionType::COMPRESSION_ROARING, RoaringCompressionFun::GetFunction, RoaringCompressionFun::TypeIsSupported}, + {CompressionType::COMPRESSION_EMPTY, EmptyValidityCompressionFun::GetFunction, + EmptyValidityCompressionFun::TypeIsSupported}, {CompressionType::COMPRESSION_AUTO, nullptr, nullptr}}; static optional_ptr FindCompressionFunction(CompressionFunctionSet &set, CompressionType type, diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 5a680f9c6..29ac2f576 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4161" +#define DUCKDB_PATCH_VERSION "4-dev4188" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4161" +#define DUCKDB_VERSION "v1.1.4-dev4188" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "dcb9627543" +#define DUCKDB_SOURCE_ID "fbc4d92fe6" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/enum_util.hpp b/src/duckdb/src/include/duckdb/common/enum_util.hpp index 6057a89b2..6215e10fd 100644 --- a/src/duckdb/src/include/duckdb/common/enum_util.hpp +++ b/src/duckdb/src/include/duckdb/common/enum_util.hpp @@ -102,6 +102,8 @@ enum class CompressedMaterializationDirection : uint8_t; enum class CompressionType : uint8_t; +enum class CompressionValidity : uint8_t; + enum class ConflictManagerMode : uint8_t; enum class ConstraintType : uint8_t; @@ -484,6 +486,9 @@ const char* EnumUtil::ToChars(CompressedMate template<> const char* EnumUtil::ToChars(CompressionType value); +template<> +const char* EnumUtil::ToChars(CompressionValidity value); + template<> const char* EnumUtil::ToChars(ConflictManagerMode value); @@ -1004,6 +1009,9 @@ CompressedMaterializationDirection EnumUtil::FromString CompressionType EnumUtil::FromString(const char *value); +template<> +CompressionValidity EnumUtil::FromString(const char *value); + template<> ConflictManagerMode EnumUtil::FromString(const char *value); diff --git a/src/duckdb/src/include/duckdb/common/enums/compression_type.hpp b/src/duckdb/src/include/duckdb/common/enums/compression_type.hpp index 6a1c30d42..a8753f12b 100644 --- a/src/duckdb/src/include/duckdb/common/enums/compression_type.hpp +++ 
b/src/duckdb/src/include/duckdb/common/enums/compression_type.hpp @@ -16,7 +16,7 @@ namespace duckdb { enum class CompressionType : uint8_t { COMPRESSION_AUTO = 0, COMPRESSION_UNCOMPRESSED = 1, - COMPRESSION_CONSTANT = 2, + COMPRESSION_CONSTANT = 2, // internal only COMPRESSION_RLE = 3, COMPRESSION_DICTIONARY = 4, COMPRESSION_PFOR_DELTA = 5, @@ -28,7 +28,8 @@ enum class CompressionType : uint8_t { COMPRESSION_ALPRD = 11, COMPRESSION_ZSTD = 12, COMPRESSION_ROARING = 13, - COMPRESSION_COUNT // This has to stay the last entry of the type! + COMPRESSION_EMPTY = 14, // internal only + COMPRESSION_COUNT // This has to stay the last entry of the type! }; bool CompressionTypeIsDeprecated(CompressionType compression_type); diff --git a/src/duckdb/src/include/duckdb/function/compression/compression.hpp b/src/duckdb/src/include/duckdb/function/compression/compression.hpp index 28113c78e..adf5364b7 100644 --- a/src/duckdb/src/include/duckdb/function/compression/compression.hpp +++ b/src/duckdb/src/include/duckdb/function/compression/compression.hpp @@ -73,4 +73,9 @@ struct RoaringCompressionFun { static bool TypeIsSupported(const PhysicalType physical_type); }; +struct EmptyValidityCompressionFun { + static CompressionFunction GetFunction(PhysicalType type); + static bool TypeIsSupported(const PhysicalType physical_type); +}; + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/compression_function.hpp b/src/duckdb/src/include/duckdb/function/compression_function.hpp index d11beaa72..b0546a5a2 100644 --- a/src/duckdb/src/include/duckdb/function/compression_function.hpp +++ b/src/duckdb/src/include/duckdb/function/compression_function.hpp @@ -203,6 +203,8 @@ typedef unique_ptr (*compression_deserialize_state_t)(Deseri //! Function prototype for cleaning up the segment state when the column data is dropped typedef void (*compression_cleanup_state_t)(ColumnSegment &segment); +enum class CompressionValidity : uint8_t { REQUIRES_VALIDITY, NO_VALIDITY_REQUIRED }; + class CompressionFunction { public: CompressionFunction(CompressionType type, PhysicalType data_type, compression_init_analyze_t init_analyze, @@ -297,6 +299,10 @@ class CompressionFunction { compression_deserialize_state_t deserialize_state; //! Cleanup the segment state (optional) compression_cleanup_state_t cleanup_state; + + //! Whether the validity mask should be separately compressed + //! or this compression function can also be used to decompress the validity + CompressionValidity validity = CompressionValidity::REQUIRES_VALIDITY; }; //! 
The set of compression functions diff --git a/src/duckdb/src/include/duckdb/storage/compression/empty_validity.hpp b/src/duckdb/src/include/duckdb/storage/compression/empty_validity.hpp new file mode 100644 index 000000000..89abe1ce7 --- /dev/null +++ b/src/duckdb/src/include/duckdb/storage/compression/empty_validity.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include "duckdb/function/compression_function.hpp" +#include "duckdb/storage/table/column_data.hpp" +#include "duckdb/storage/table/scan_state.hpp" +#include "duckdb/storage/table/column_data_checkpointer.hpp" + +namespace duckdb { + +class EmptyValidityCompression { +public: + struct EmptyValidityCompressionState : public CompressionState { + public: + explicit EmptyValidityCompressionState(ColumnDataCheckpointData &checkpoint_data, const CompressionInfo &info) + : CompressionState(info), + function(checkpoint_data.GetCompressionFunction(CompressionType::COMPRESSION_EMPTY)), + checkpoint_data(checkpoint_data) { + } + ~EmptyValidityCompressionState() override { + } + + public: + optional_ptr function; + ColumnDataCheckpointData &checkpoint_data; + idx_t count = 0; + idx_t non_nulls = 0; + }; + struct EmptyValiditySegmentScanState : public SegmentScanState { + EmptyValiditySegmentScanState() { + } + }; + +public: + static CompressionFunction CreateFunction() { + return CompressionFunction(CompressionType::COMPRESSION_EMPTY, PhysicalType::BIT, nullptr, nullptr, nullptr, + InitCompression, Compress, FinalizeCompress, InitScan, Scan, ScanPartial, FetchRow, + Skip, InitSegment); + } + +public: + static unique_ptr InitCompression(ColumnDataCheckpointData &checkpoint_data, + unique_ptr state_p) { + return make_uniq(checkpoint_data, state_p->info); + } + static void Compress(CompressionState &state_p, Vector &scan_vector, idx_t count) { + auto &state = state_p.Cast(); + UnifiedVectorFormat format; + scan_vector.ToUnifiedFormat(count, format); + state.non_nulls += format.validity.CountValid(count); + state.count += count; + } + static void FinalizeCompress(CompressionState &state_p) { + auto &state = state_p.Cast(); + auto &checkpoint_data = state.checkpoint_data; + + auto &db = checkpoint_data.GetDatabase(); + auto &type = checkpoint_data.GetType(); + auto row_start = checkpoint_data.GetRowGroup().start; + + auto &info = state.info; + auto compressed_segment = ColumnSegment::CreateTransientSegment(db, *state.function, type, row_start, + info.GetBlockSize(), info.GetBlockSize()); + compressed_segment->count = state.count; + if (state.non_nulls != state.count) { + compressed_segment->stats.statistics.SetHasNullFast(); + } + if (state.non_nulls != 0) { + compressed_segment->stats.statistics.SetHasNoNullFast(); + } + + auto &buffer_manager = BufferManager::GetBufferManager(checkpoint_data.GetDatabase()); + auto handle = buffer_manager.Pin(compressed_segment->block); + + auto &checkpoint_state = checkpoint_data.GetCheckpointState(); + checkpoint_state.FlushSegment(std::move(compressed_segment), std::move(handle), 0); + } + static unique_ptr InitScan(ColumnSegment &segment) { + return make_uniq(); + } + static void ScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result, + idx_t result_offset) { + return; + } + static void Scan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result) { + return; + } + static void FetchRow(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result, + idx_t result_idx) { + return; + } + static void Skip(ColumnSegment &segment, 
ColumnScanState &state, idx_t skip_count) { + return; + } + static unique_ptr InitSegment(ColumnSegment &segment, block_id_t block_id, + optional_ptr segment_state) { + return nullptr; + } +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/table/column_data.hpp b/src/duckdb/src/include/duckdb/storage/table/column_data.hpp index cbf354f37..adb3ed056 100644 --- a/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/column_data.hpp @@ -99,6 +99,7 @@ class ColumnData { const LogicalType &RootType() const; //! Whether or not the column has any updates bool HasUpdates() const; + bool HasChanges(idx_t start_row, idx_t end_row) const; //! Whether or not we can scan an entire vector virtual ScanVectorType GetVectorScanType(ColumnScanState &state, idx_t scan_count, Vector &result); diff --git a/src/duckdb/src/include/duckdb/storage/table/column_data_checkpointer.hpp b/src/duckdb/src/include/duckdb/storage/table/column_data_checkpointer.hpp index a5bd436bb..0db0e6fbc 100644 --- a/src/duckdb/src/include/duckdb/storage/table/column_data_checkpointer.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/column_data_checkpointer.hpp @@ -74,6 +74,7 @@ class ColumnDataCheckpointer { void WritePersistentSegments(ColumnCheckpointState &state); void InitAnalyze(); void DropSegments(); + bool ValidityCoveredByBasedata(vector &result); private: vector> &checkpoint_states; diff --git a/src/duckdb/src/main/database.cpp b/src/duckdb/src/main/database.cpp index de60b1cf2..3bd387f84 100644 --- a/src/duckdb/src/main/database.cpp +++ b/src/duckdb/src/main/database.cpp @@ -26,6 +26,7 @@ #include "duckdb/storage/storage_manager.hpp" #include "duckdb/transaction/transaction_manager.hpp" #include "duckdb/main/capi/extension_api.hpp" +#include "duckdb/storage/compression/empty_validity.hpp" #ifndef DUCKDB_NO_THREADS #include "duckdb/common/thread.hpp" diff --git a/src/duckdb/src/storage/compression/dictionary/decompression.cpp b/src/duckdb/src/storage/compression/dictionary/decompression.cpp index a81c0e122..4063e9224 100644 --- a/src/duckdb/src/storage/compression/dictionary/decompression.cpp +++ b/src/duckdb/src/storage/compression/dictionary/decompression.cpp @@ -43,15 +43,18 @@ void CompressedStringScanState::Initialize(ColumnSegment &segment, bool initiali block_size = segment.GetBlockManager().GetBlockSize(); dict = DictionaryCompression::GetDictionary(segment, *handle); - dictionary = make_buffer(segment.type, index_buffer_count); - dictionary_size = index_buffer_count; if (!initialize_dictionary) { // Used by fetch, as fetch will never produce a DictionaryVector return; } + dictionary = make_buffer(segment.type, index_buffer_count); + dictionary_size = index_buffer_count; auto dict_child_data = FlatVector::GetData(*(dictionary)); + auto &validity = FlatVector::Validity(*dictionary); + D_ASSERT(index_buffer_count >= 1); + validity.SetInvalid(0); for (uint32_t i = 0; i < index_buffer_count; i++) { // NOTE: the passing of dict_child_vector, will not be used, its for big strings uint16_t str_len = GetStringLength(i); @@ -61,6 +64,7 @@ void CompressedStringScanState::Initialize(ColumnSegment &segment, bool initiali void CompressedStringScanState::ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count) { auto result_data = FlatVector::GetData(result); + auto &validity = FlatVector::Validity(result); // Handling non-bitpacking-group-aligned start values; idx_t start_offset = start % 
BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; @@ -82,6 +86,9 @@ void CompressedStringScanState::ScanToFlatVector(Vector &result, idx_t result_of for (idx_t i = 0; i < scan_count; i++) { // Lookup dict offset in index buffer auto string_number = sel_vec->get_index(i + start_offset); + if (string_number == 0) { + validity.SetInvalid(result_offset + i); + } auto dict_offset = index_buffer_ptr[string_number]; auto str_len = GetStringLength(UnsafeNumericCast(string_number)); result_data[result_offset + i] = FetchStringFromDict(UnsafeNumericCast(dict_offset), str_len); diff --git a/src/duckdb/src/storage/compression/dictionary_compression.cpp b/src/duckdb/src/storage/compression/dictionary_compression.cpp index 6f6b4beed..78915a374 100644 --- a/src/duckdb/src/storage/compression/dictionary_compression.cpp +++ b/src/duckdb/src/storage/compression/dictionary_compression.cpp @@ -157,7 +157,7 @@ void DictionaryCompressionStorage::StringFetchRow(ColumnSegment &segment, Column // Get Function //===--------------------------------------------------------------------===// CompressionFunction DictionaryCompressionFun::GetFunction(PhysicalType data_type) { - return CompressionFunction( + auto res = CompressionFunction( CompressionType::COMPRESSION_DICTIONARY, data_type, DictionaryCompressionStorage ::StringInitAnalyze, DictionaryCompressionStorage::StringAnalyze, DictionaryCompressionStorage::StringFinalAnalyze, DictionaryCompressionStorage::InitCompression, DictionaryCompressionStorage::Compress, @@ -165,6 +165,8 @@ CompressionFunction DictionaryCompressionFun::GetFunction(PhysicalType data_type DictionaryCompressionStorage::StringScan, DictionaryCompressionStorage::StringScanPartial, DictionaryCompressionStorage::StringFetchRow, UncompressedFunctions::EmptySkip, UncompressedStringStorage::StringInitSegment); + res.validity = CompressionValidity::NO_VALIDITY_REQUIRED; + return res; } bool DictionaryCompressionFun::TypeIsSupported(const PhysicalType physical_type) { diff --git a/src/duckdb/src/storage/compression/empty_validity.cpp b/src/duckdb/src/storage/compression/empty_validity.cpp new file mode 100644 index 000000000..0b6534153 --- /dev/null +++ b/src/duckdb/src/storage/compression/empty_validity.cpp @@ -0,0 +1,15 @@ +#include "duckdb/function/compression/compression.hpp" +#include "duckdb/storage/compression/empty_validity.hpp" + +namespace duckdb { + +CompressionFunction EmptyValidityCompressionFun::GetFunction(PhysicalType type) { + return EmptyValidityCompression::CreateFunction(); +} + +bool EmptyValidityCompressionFun::TypeIsSupported(const PhysicalType physical_type) { + D_ASSERT(physical_type == PhysicalType::BIT); + return true; +} + +} // namespace duckdb diff --git a/src/duckdb/src/storage/table/column_data.cpp b/src/duckdb/src/storage/table/column_data.cpp index 53e7c5459..985a59415 100644 --- a/src/duckdb/src/storage/table/column_data.cpp +++ b/src/duckdb/src/storage/table/column_data.cpp @@ -63,6 +63,16 @@ bool ColumnData::HasUpdates() const { return updates.get(); } +bool ColumnData::HasChanges(idx_t start_row, idx_t end_row) const { + if (!updates) { + return false; + } + if (updates->HasUpdates(start_row, end_row)) { + return true; + } + return false; +} + void ColumnData::ClearUpdates() { lock_guard update_guard(update_lock); updates.reset();
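
Taken together, the hunks above let dictionary compression advertise, through the new CompressionValidity::NO_VALIDITY_REQUIRED flag, that its scan can reproduce NULLs on its own (entry 0 of the rebuilt dictionary is reserved as the invalid entry). The checkpointer change that follows consumes that flag: when the base data is rewritten by such a function, the validity column is handed to COMPRESSION_EMPTY, which only records null statistics and flushes a single empty segment. Below is a hypothetical miniature of that decision; the enum mirrors the patch, while the surrounding types are simplified stand-ins for the DuckDB ones:

    #include <cassert>

    enum class CompressionValidity { REQUIRES_VALIDITY, NO_VALIDITY_REQUIRED };
    enum class ChosenFunction { REAL_VALIDITY_COMPRESSOR, COMPRESSION_EMPTY };

    struct BaseDataResult {
        bool rewritten;               // will the base data actually be written out?
        CompressionValidity validity; // can its compressor reproduce the nulls?
    };

    static ChosenFunction ChooseValidityFunction(const BaseDataResult &base) {
        if (base.rewritten && base.validity == CompressionValidity::NO_VALIDITY_REQUIRED) {
            // Compress/FinalizeCompress become no-ops: only null statistics and
            // one empty segment are recorded for the validity column.
            return ChosenFunction::COMPRESSION_EMPTY;
        }
        return ChosenFunction::REAL_VALIDITY_COMPRESSOR;
    }

    int main() {
        assert(ChooseValidityFunction({true, CompressionValidity::NO_VALIDITY_REQUIRED}) ==
               ChosenFunction::COMPRESSION_EMPTY);
        assert(ChooseValidityFunction({false, CompressionValidity::NO_VALIDITY_REQUIRED}) ==
               ChosenFunction::REAL_VALIDITY_COMPRESSOR);
        return 0;
    }
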
diff --git a/src/duckdb/src/storage/table/column_data_checkpointer.cpp b/src/duckdb/src/storage/table/column_data_checkpointer.cpp index 13658077d..ae92a6972 100644 --- a/src/duckdb/src/storage/table/column_data_checkpointer.cpp +++ b/src/duckdb/src/storage/table/column_data_checkpointer.cpp @@ -1,6 +1,7 @@ #include "duckdb/storage/table/column_data_checkpointer.hpp" #include "duckdb/main/config.hpp" #include "duckdb/storage/table/update_segment.hpp" +#include "duckdb/storage/compression/empty_validity.hpp" #include "duckdb/storage/data_table.hpp" #include "duckdb/parser/column_definition.hpp" #include "duckdb/storage/table/scan_state.hpp" @@ -273,11 +274,32 @@ void ColumnDataCheckpointer::DropSegments() { } } +bool ColumnDataCheckpointer::ValidityCoveredByBasedata(vector &result) { + if (result.size() != 2) { + return false; + } + if (!has_changes[0]) { + // The base data had no changes so it will not be rewritten + return false; + } + auto &base = result[0]; + D_ASSERT(base.function); + return base.function->validity == CompressionValidity::NO_VALIDITY_REQUIRED; +} + void ColumnDataCheckpointer::WriteToDisk() { DropSegments(); // Analyze the candidate functions to select one of them to use for compression auto analyze_result = DetectBestCompressionMethod(); + if (ValidityCoveredByBasedata(analyze_result)) { + D_ASSERT(analyze_result.size() == 2); + auto &validity = analyze_result[1]; + auto &config = DBConfig::GetConfig(db); + // Override the function to the COMPRESSION_EMPTY + // turning the compression+final compress steps into a no-op, saving a single empty segment + validity.function = config.GetCompressionFunction(CompressionType::COMPRESSION_EMPTY, PhysicalType::BIT); + } // Initialize the compression for the selected function D_ASSERT(analyze_result.size() == checkpoint_states.size()); @@ -328,13 +350,12 @@ bool ColumnDataCheckpointer::HasChanges(ColumnData &col_data) { if (segment->segment_type == ColumnSegmentType::TRANSIENT) { // transient segment: always need to write to disk return true; - } else { - // persistent segment; check if there were any updates or deletions in this segment - idx_t start_row_idx = segment->start - row_group.start; - idx_t end_row_idx = start_row_idx + segment->count; - if (col_data.updates && col_data.updates->HasUpdates(start_row_idx, end_row_idx)) { - return true; - } + } + // persistent segment; check if there were any updates or deletions in this segment + idx_t start_row_idx = segment->start - row_group.start; + idx_t end_row_idx = start_row_idx + segment->count; + if (col_data.HasChanges(start_row_idx, end_row_idx)) { + return true; } } return false; diff --git a/src/duckdb/ub_src_storage_compression.cpp b/src/duckdb/ub_src_storage_compression.cpp index 66e0303b5..491801cd7 100644 --- a/src/duckdb/ub_src_storage_compression.cpp +++ b/src/duckdb/ub_src_storage_compression.cpp @@ -24,3 +24,5 @@ #include "src/storage/compression/fsst.cpp" +#include "src/storage/compression/empty_validity.cpp" +
From 422f1ccba03cb982855f76bb7026e8a2cf18909e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:13:51 +0000 Subject: [PATCH 09/21] vendor: Update vendored sources to duckdb/duckdb@3b44a57608458b063303d3576b2e21b6f1908a76 (#984) Issue duckdb/duckdb#14996: Aggregate Secondary Orderings (duckdb/duckdb#15592) Co-authored-by: krlmlr --- .../function/table/version/pragma_version.cpp | 6 +- .../window/window_aggregate_function.cpp | 127 ++++------------ .../src/function/window/window_aggregator.cpp | 9 +- .../window/window_constant_aggregator.cpp | 137 ++++++++++++++---- .../window/window_custom_aggregator.cpp | 23 ++- .../window/window_distinct_aggregator.cpp | 16 +-
.../src/function/window/window_executor.cpp | 11 ++ .../window/window_naive_aggregator.cpp | 133 +++++++++++++++-- .../function/window/window_segment_tree.cpp | 17 ++- .../window/window_aggregate_function.hpp | 6 +- .../function/window/window_aggregator.hpp | 10 +- .../window/window_constant_aggregator.hpp | 5 +- .../window/window_custom_aggregator.hpp | 5 +- .../window/window_distinct_aggregator.hpp | 6 +- .../window/window_naive_aggregator.hpp | 10 +- .../function/window/window_segment_tree.hpp | 15 +- .../expression/bound_window_expression.hpp | 1 + .../expression/transform_function.cpp | 2 +- .../expression/bound_window_expression.cpp | 12 +- 19 files changed, 360 insertions(+), 191 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 29ac2f576..1742d5569 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4188" +#define DUCKDB_PATCH_VERSION "4-dev4200" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4188" +#define DUCKDB_VERSION "v1.1.4-dev4200" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "fbc4d92fe6" +#define DUCKDB_SOURCE_ID "3b44a57608" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/function/window/window_aggregate_function.cpp b/src/duckdb/src/function/window/window_aggregate_function.cpp index 32dc0a276..308667601 100644 --- a/src/duckdb/src/function/window/window_aggregate_function.cpp +++ b/src/duckdb/src/function/window/window_aggregate_function.cpp @@ -26,117 +26,50 @@ class WindowAggregateExecutorGlobalState : public WindowExecutorGlobalState { const Expression *filter_ref; }; -bool WindowAggregateExecutor::IsConstantAggregate() { - if (!wexpr.aggregate) { - return false; - } - // window exclusion cannot be handled by constant aggregates - if (wexpr.exclude_clause != WindowExcludeMode::NO_OTHER) { - return false; - } - - // COUNT(*) is already handled efficiently by segment trees. - if (wexpr.children.empty()) { - return false; - } - - /* - The default framing option is RANGE UNBOUNDED PRECEDING, which - is the same as RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT - ROW; it sets the frame to be all rows from the partition start - up through the current row's last peer (a row that the window's - ORDER BY clause considers equivalent to the current row; all - rows are peers if there is no ORDER BY). In general, UNBOUNDED - PRECEDING means that the frame starts with the first row of the - partition, and similarly UNBOUNDED FOLLOWING means that the - frame ends with the last row of the partition, regardless of - RANGE, ROWS or GROUPS mode. In ROWS mode, CURRENT ROW means that - the frame starts or ends with the current row; but in RANGE or - GROUPS mode it means that the frame starts or ends with the - current row's first or last peer in the ORDER BY ordering. The - offset PRECEDING and offset FOLLOWING options vary in meaning - depending on the frame mode. 
- */ - switch (wexpr.start) { - case WindowBoundary::UNBOUNDED_PRECEDING: - break; - case WindowBoundary::CURRENT_ROW_RANGE: - if (!wexpr.orders.empty()) { - return false; +static BoundWindowExpression &SimplifyWindowedAggregate(BoundWindowExpression &wexpr, ClientContext &context) { + // Remove redundant/irrelevant modifiers (they can be serious performance cliffs) + if (wexpr.aggregate && ClientConfig::GetConfig(context).enable_optimizer) { + const auto &aggr = wexpr.aggregate; + auto &arg_orders = wexpr.arg_orders; + if (aggr->distinct_dependent != AggregateDistinctDependent::DISTINCT_DEPENDENT) { + wexpr.distinct = false; } - break; - default: - return false; - } - - switch (wexpr.end) { - case WindowBoundary::UNBOUNDED_FOLLOWING: - break; - case WindowBoundary::CURRENT_ROW_RANGE: - if (!wexpr.orders.empty()) { - return false; + if (aggr->order_dependent != AggregateOrderDependent::ORDER_DEPENDENT) { + arg_orders.clear(); + } else { + // If the argument order is prefix of the partition ordering, + // then we can just use the partition ordering. + if (BoundWindowExpression::GetSharedOrders(wexpr.orders, arg_orders) == arg_orders.size()) { + arg_orders.clear(); + } } - break; - default: - return false; - } - - return true; -} - -bool WindowAggregateExecutor::IsDistinctAggregate() { - if (!wexpr.aggregate) { - return false; } - return wexpr.distinct; -} - -bool WindowAggregateExecutor::IsCustomAggregate() { - if (!wexpr.aggregate) { - return false; - } - - if (!AggregateObject(wexpr).function.window) { - return false; - } - - return (mode < WindowAggregationMode::COMBINE); -} - -void WindowExecutor::Evaluate(idx_t row_idx, DataChunk &eval_chunk, Vector &result, WindowExecutorLocalState &lstate, - WindowExecutorGlobalState &gstate) const { - auto &lbstate = lstate.Cast(); - lbstate.UpdateBounds(gstate, row_idx, eval_chunk, lstate.range_cursor); - - const auto count = eval_chunk.size(); - EvaluateInternal(gstate, lstate, eval_chunk, result, count, row_idx); - - result.Verify(count); + return wexpr; } WindowAggregateExecutor::WindowAggregateExecutor(BoundWindowExpression &wexpr, ClientContext &context, WindowSharedExpressions &shared, WindowAggregationMode mode) - : WindowExecutor(wexpr, context, shared), mode(mode) { - auto return_type = wexpr.return_type; + : WindowExecutor(SimplifyWindowedAggregate(wexpr, context), context, shared), mode(mode) { // Force naive for SEPARATE mode or for (currently!) 
unsupported functionality - const auto force_naive = - !ClientConfig::GetConfig(context).enable_optimizer || mode == WindowAggregationMode::SEPARATE; - if (force_naive || (wexpr.distinct && wexpr.exclude_clause != WindowExcludeMode::NO_OTHER)) { - aggregator = make_uniq(wexpr, wexpr.exclude_clause, shared); - } else if (IsDistinctAggregate()) { + if (!ClientConfig::GetConfig(context).enable_optimizer || mode == WindowAggregationMode::SEPARATE) { + aggregator = make_uniq(*this, shared); + } else if (WindowDistinctAggregator::CanAggregate(wexpr)) { // build a merge sort tree // see https://dl.acm.org/doi/pdf/10.1145/3514221.3526184 - aggregator = make_uniq(wexpr, wexpr.exclude_clause, shared, context); - } else if (IsConstantAggregate()) { - aggregator = make_uniq(wexpr, wexpr.exclude_clause, shared); - } else if (IsCustomAggregate()) { - aggregator = make_uniq(wexpr, wexpr.exclude_clause, shared); - } else { + aggregator = make_uniq(wexpr, shared, context); + } else if (WindowConstantAggregator::CanAggregate(wexpr)) { + aggregator = make_uniq(wexpr, shared); + } else if (WindowCustomAggregator::CanAggregate(wexpr, mode)) { + aggregator = make_uniq(wexpr, shared); + } else if (WindowSegmentTree::CanAggregate(wexpr)) { // build a segment tree for frame-adhering aggregates // see http://www.vldb.org/pvldb/vol8/p1058-leis.pdf - aggregator = make_uniq(wexpr, mode, wexpr.exclude_clause, shared); + aggregator = make_uniq(wexpr, shared); + } else { + // No accelerator can handle this combination, so fall back to naïve. + aggregator = make_uniq(*this, shared); } // Compute the FILTER with the other eval columns. diff --git a/src/duckdb/src/function/window/window_aggregator.cpp b/src/duckdb/src/function/window/window_aggregator.cpp index 60499073f..197b89e38 100644 --- a/src/duckdb/src/function/window/window_aggregator.cpp +++ b/src/duckdb/src/function/window/window_aggregator.cpp @@ -12,18 +12,17 @@ namespace duckdb { WindowAggregatorState::WindowAggregatorState() : allocator(Allocator::DefaultAllocator()) { } -WindowAggregator::WindowAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode_p) +WindowAggregator::WindowAggregator(const BoundWindowExpression &wexpr) : wexpr(wexpr), aggr(wexpr), result_type(wexpr.return_type), state_size(aggr.function.state_size(aggr.function)), - exclude_mode(exclude_mode_p) { + exclude_mode(wexpr.exclude_clause) { for (auto &child : wexpr.children) { arg_types.emplace_back(child->return_type); } } -WindowAggregator::WindowAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode_p, - WindowSharedExpressions &shared) - : WindowAggregator(wexpr, exclude_mode_p) { +WindowAggregator::WindowAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared) + : WindowAggregator(wexpr) { for (auto &child : wexpr.children) { child_idx.emplace_back(shared.RegisterCollection(child, false)); } diff --git a/src/duckdb/src/function/window/window_constant_aggregator.cpp b/src/duckdb/src/function/window/window_constant_aggregator.cpp index d94267f91..4be038229 100644 --- a/src/duckdb/src/function/window/window_constant_aggregator.cpp +++ b/src/duckdb/src/function/window/window_constant_aggregator.cpp @@ -6,7 +6,7 @@ namespace duckdb { //===--------------------------------------------------------------------===// -// WindowConstantAggregator +// WindowConstantAggregatorGlobalState //===--------------------------------------------------------------------===// class WindowConstantAggregatorGlobalState : public 
WindowAggregatorGlobalState { @@ -24,33 +24,6 @@ class WindowConstantAggregatorGlobalState : public WindowAggregatorGlobalState { unique_ptr results; }; -class WindowConstantAggregatorLocalState : public WindowAggregatorLocalState { -public: - explicit WindowConstantAggregatorLocalState(const WindowConstantAggregatorGlobalState &gstate); - ~WindowConstantAggregatorLocalState() override { - } - - void Sink(DataChunk &sink_chunk, DataChunk &coll_chunk, idx_t input_idx, optional_ptr filter_sel, - idx_t filtered); - void Combine(WindowConstantAggregatorGlobalState &gstate); - -public: - //! The global state we are sharing - const WindowConstantAggregatorGlobalState &gstate; - //! Reusable chunk for sinking - DataChunk inputs; - //! Chunk for referencing the input columns - DataChunk payload_chunk; - //! A vector of pointers to "state", used for intermediate window segment aggregation - Vector statep; - //! Reused result state container for the window functions - WindowAggregateStates statef; - //! The current result partition being read - idx_t partition; - //! Shared SV for evaluation - SelectionVector matches; -}; - WindowConstantAggregatorGlobalState::WindowConstantAggregatorGlobalState(ClientContext &context, const WindowConstantAggregator &aggregator, idx_t group_count, @@ -93,6 +66,36 @@ WindowConstantAggregatorGlobalState::WindowConstantAggregatorGlobalState(ClientC partition_offsets.emplace_back(group_count); } +//===--------------------------------------------------------------------===// +// WindowConstantAggregatorLocalState +//===--------------------------------------------------------------------===// +class WindowConstantAggregatorLocalState : public WindowAggregatorLocalState { +public: + explicit WindowConstantAggregatorLocalState(const WindowConstantAggregatorGlobalState &gstate); + ~WindowConstantAggregatorLocalState() override { + } + + void Sink(DataChunk &sink_chunk, DataChunk &coll_chunk, idx_t input_idx, optional_ptr filter_sel, + idx_t filtered); + void Combine(WindowConstantAggregatorGlobalState &gstate); + +public: + //! The global state we are sharing + const WindowConstantAggregatorGlobalState &gstate; + //! Reusable chunk for sinking + DataChunk inputs; + //! Chunk for referencing the input columns + DataChunk payload_chunk; + //! A vector of pointers to "state", used for intermediate window segment aggregation + Vector statep; + //! Reused result state container for the window functions + WindowAggregateStates statef; + //! The current result partition being read + idx_t partition; + //! 
Shared SV for evaluation + SelectionVector matches; +}; + WindowConstantAggregatorLocalState::WindowConstantAggregatorLocalState( const WindowConstantAggregatorGlobalState &gstate) : gstate(gstate), statep(Value::POINTER(0)), statef(gstate.statef.aggr), partition(0) { @@ -110,10 +113,80 @@ WindowConstantAggregatorLocalState::WindowConstantAggregatorLocalState( gstate.locals++; } -WindowConstantAggregator::WindowConstantAggregator(const BoundWindowExpression &wexpr, - const WindowExcludeMode exclude_mode_p, - WindowSharedExpressions &shared) - : WindowAggregator(wexpr, exclude_mode_p) { +//===--------------------------------------------------------------------===// +// WindowConstantAggregator +//===--------------------------------------------------------------------===// +bool WindowConstantAggregator::CanAggregate(const BoundWindowExpression &wexpr) { + if (!wexpr.aggregate) { + return false; + } + // window exclusion cannot be handled by constant aggregates + if (wexpr.exclude_clause != WindowExcludeMode::NO_OTHER) { + return false; + } + + // DISTINCT aggregation cannot be handled by constant aggregation + // TODO: Use a hash table + if (wexpr.distinct) { + return false; + } + + // ORDER BY arguments cannot be handled by constant aggregation + if (!wexpr.arg_orders.empty()) { + return false; + } + + // COUNT(*) is already handled efficiently by segment trees. + if (wexpr.children.empty()) { + return false; + } + + /* + The default framing option is RANGE UNBOUNDED PRECEDING, which + is the same as RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT + ROW; it sets the frame to be all rows from the partition start + up through the current row's last peer (a row that the window's + ORDER BY clause considers equivalent to the current row; all + rows are peers if there is no ORDER BY). In general, UNBOUNDED + PRECEDING means that the frame starts with the first row of the + partition, and similarly UNBOUNDED FOLLOWING means that the + frame ends with the last row of the partition, regardless of + RANGE, ROWS or GROUPS mode. In ROWS mode, CURRENT ROW means that + the frame starts or ends with the current row; but in RANGE or + GROUPS mode it means that the frame starts or ends with the + current row's first or last peer in the ORDER BY ordering. The + offset PRECEDING and offset FOLLOWING options vary in meaning + depending on the frame mode. 
+ */ + switch (wexpr.start) { + case WindowBoundary::UNBOUNDED_PRECEDING: + break; + case WindowBoundary::CURRENT_ROW_RANGE: + if (!wexpr.orders.empty()) { + return false; + } + break; + default: + return false; + } + + switch (wexpr.end) { + case WindowBoundary::UNBOUNDED_FOLLOWING: + break; + case WindowBoundary::CURRENT_ROW_RANGE: + if (!wexpr.orders.empty()) { + return false; + } + break; + default: + return false; + } + + return true; +} + +WindowConstantAggregator::WindowConstantAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared) + : WindowAggregator(wexpr) { // We only need these values for Sink for (auto &child : wexpr.children) { diff --git a/src/duckdb/src/function/window/window_custom_aggregator.cpp b/src/duckdb/src/function/window/window_custom_aggregator.cpp index d75143524..8416e3031 100644 --- a/src/duckdb/src/function/window/window_custom_aggregator.cpp +++ b/src/duckdb/src/function/window/window_custom_aggregator.cpp @@ -1,13 +1,30 @@ #include "duckdb/function/window/window_custom_aggregator.hpp" +#include "duckdb/planner/expression/bound_window_expression.hpp" namespace duckdb { //===--------------------------------------------------------------------===// // WindowCustomAggregator //===--------------------------------------------------------------------===// -WindowCustomAggregator::WindowCustomAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode, - WindowSharedExpressions &shared) - : WindowAggregator(wexpr, exclude_mode, shared) { +bool WindowCustomAggregator::CanAggregate(const BoundWindowExpression &wexpr, WindowAggregationMode mode) { + if (!wexpr.aggregate) { + return false; + } + + if (!wexpr.aggregate->window) { + return false; + } + + // ORDER BY arguments are not currently supported + if (!wexpr.arg_orders.empty()) { + return false; + } + + return (mode < WindowAggregationMode::COMBINE); +} + +WindowCustomAggregator::WindowCustomAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared) + : WindowAggregator(wexpr, shared) { } WindowCustomAggregator::~WindowCustomAggregator() { diff --git a/src/duckdb/src/function/window/window_distinct_aggregator.cpp b/src/duckdb/src/function/window/window_distinct_aggregator.cpp index ba466288f..1dc940f95 100644 --- a/src/duckdb/src/function/window/window_distinct_aggregator.cpp +++ b/src/duckdb/src/function/window/window_distinct_aggregator.cpp @@ -6,6 +6,7 @@ #include "duckdb/function/window/window_aggregate_states.hpp" #include "duckdb/planner/bound_result_modifier.hpp" #include "duckdb/planner/expression/bound_constant_expression.hpp" +#include "duckdb/planner/expression/bound_window_expression.hpp" #include #include @@ -15,10 +16,17 @@ namespace duckdb { //===--------------------------------------------------------------------===// // WindowDistinctAggregator //===--------------------------------------------------------------------===// -WindowDistinctAggregator::WindowDistinctAggregator(const BoundWindowExpression &wexpr, - const WindowExcludeMode exclude_mode_p, - WindowSharedExpressions &shared, ClientContext &context) - : WindowAggregator(wexpr, exclude_mode_p, shared), context(context) { +bool WindowDistinctAggregator::CanAggregate(const BoundWindowExpression &wexpr) { + if (!wexpr.aggregate) { + return false; + } + + return wexpr.distinct && wexpr.exclude_clause == WindowExcludeMode::NO_OTHER && wexpr.arg_orders.empty(); +} + +WindowDistinctAggregator::WindowDistinctAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions 
&shared, + ClientContext &context) + : WindowAggregator(wexpr, shared), context(context) { } class WindowDistinctAggregatorLocalState; diff --git a/src/duckdb/src/function/window/window_executor.cpp b/src/duckdb/src/function/window/window_executor.cpp index 40381f0e6..f4dab47af 100644 --- a/src/duckdb/src/function/window/window_executor.cpp +++ b/src/duckdb/src/function/window/window_executor.cpp @@ -47,6 +47,17 @@ WindowExecutor::WindowExecutor(BoundWindowExpression &wexpr, ClientContext &cont } } +void WindowExecutor::Evaluate(idx_t row_idx, DataChunk &eval_chunk, Vector &result, WindowExecutorLocalState &lstate, + WindowExecutorGlobalState &gstate) const { + auto &lbstate = lstate.Cast(); + lbstate.UpdateBounds(gstate, row_idx, eval_chunk, lstate.range_cursor); + + const auto count = eval_chunk.size(); + EvaluateInternal(gstate, lstate, eval_chunk, result, count, row_idx); + + result.Verify(count); +} + WindowExecutorGlobalState::WindowExecutorGlobalState(const WindowExecutor &executor, const idx_t payload_count, const ValidityMask &partition_mask, const ValidityMask &order_mask) : executor(executor), payload_count(payload_count), partition_mask(partition_mask), order_mask(order_mask) { diff --git a/src/duckdb/src/function/window/window_naive_aggregator.cpp b/src/duckdb/src/function/window/window_naive_aggregator.cpp index 8bf0d54d3..448639e77 100644 --- a/src/duckdb/src/function/window/window_naive_aggregator.cpp +++ b/src/duckdb/src/function/window/window_naive_aggregator.cpp @@ -1,14 +1,21 @@ #include "duckdb/function/window/window_naive_aggregator.hpp" +#include "duckdb/common/sort/sort.hpp" #include "duckdb/function/window/window_collection.hpp" +#include "duckdb/function/window/window_shared_expressions.hpp" +#include "duckdb/planner/expression/bound_window_expression.hpp" +#include "duckdb/function/window/window_aggregate_function.hpp" namespace duckdb { //===--------------------------------------------------------------------===// // WindowNaiveAggregator //===--------------------------------------------------------------------===// -WindowNaiveAggregator::WindowNaiveAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode, - WindowSharedExpressions &shared) - : WindowAggregator(wexpr, exclude_mode, shared) { +WindowNaiveAggregator::WindowNaiveAggregator(const WindowAggregateExecutor &executor, WindowSharedExpressions &shared) + : WindowAggregator(executor.wexpr, shared), executor(executor) { + + for (const auto &order : wexpr.arg_orders) { + arg_order_idx.emplace_back(shared.RegisterCollection(order.expression, false)); + } } WindowNaiveAggregator::~WindowNaiveAggregator() { @@ -76,6 +83,17 @@ class WindowNaiveState : public WindowAggregatorLocalState { Vector hashes; //! The state used for comparing the collection across chunk boundaries unique_ptr comparer; + + //! The state used for scanning ORDER BY values from the collection + unique_ptr arg_orderer; + //! Reusable sort key chunk + DataChunk orderby_sort; + //! Reusable sort payload chunk + DataChunk orderby_payload; + //! Reusable sort key slicer + SelectionVector orderby_sel; + //! Reusable payload layout. 
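+ //! (a single UBIGINT column of row ids, re-ordered together with the sort keys)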
+ RowLayout payload_layout; }; WindowNaiveState::WindowNaiveState(const WindowNaiveAggregator &aggregator_p) @@ -95,6 +113,13 @@ WindowNaiveState::WindowNaiveState(const WindowNaiveAggregator &aggregator_p) fdata[i] = state_ptr; state_ptr += aggregator.state_size; } + + // Initialise any ORDER BY data + if (!aggregator.arg_order_idx.empty() && !arg_orderer) { + orderby_payload.Initialize(Allocator::DefaultAllocator(), {LogicalType::UBIGINT}); + payload_layout.Initialize(orderby_payload.GetTypes()); + orderby_sel.Initialize(); + } } void WindowNaiveState::Finalize(WindowAggregatorGlobalState &gastate, CollectionPtr collection) { @@ -102,7 +127,19 @@ void WindowNaiveState::Finalize(WindowAggregatorGlobalState &gastate, Collection // Set up the comparison scanner just in case if (!comparer) { - comparer = make_uniq(*collection, gastate.aggregator.child_idx); + comparer = make_uniq(*collection, aggregator.child_idx); + } + + // Set up the argument ORDER BY scanner if needed + if (!aggregator.arg_order_idx.empty() && !arg_orderer) { + arg_orderer = make_uniq(*collection, aggregator.arg_order_idx); + orderby_sort.Initialize(BufferAllocator::Get(gastate.context), arg_orderer->chunk.GetTypes()); + } + + // Initialise the chunks + const auto types = cursor->chunk.GetTypes(); + if (leaves.ColumnCount() == 0 && !types.empty()) { + leaves.Initialize(BufferAllocator::Get(gastate.context), types); } } @@ -175,10 +212,6 @@ void WindowNaiveState::Evaluate(const WindowAggregatorGlobalState &gsink, const auto &filter_mask = gsink.filter_mask; const auto types = cursor->chunk.GetTypes(); - if (leaves.ColumnCount() == 0 && !types.empty()) { - leaves.Initialize(Allocator::DefaultAllocator(), types); - } - auto fdata = FlatVector::GetData(statef); auto pdata = FlatVector::GetData(statep); @@ -190,8 +223,90 @@ void WindowNaiveState::Evaluate(const WindowAggregatorGlobalState &gsink, const auto agg_state = fdata[rid]; aggr.function.initialize(aggr.function, agg_state); - // Just update the aggregate with the unfiltered input rows + // Reset the DISTINCT hash table row_set.clear(); + + // Sort the input rows by the argument + if (arg_orderer) { + auto &context = aggregator.executor.context; + auto &orders = aggregator.wexpr.arg_orders; + auto &buffer_manager = BufferManager::GetBufferManager(context); + GlobalSortState global_sort(buffer_manager, orders, payload_layout); + LocalSortState local_sort; + local_sort.Initialize(global_sort, global_sort.buffer_manager); + + idx_t orderby_count = 0; + auto orderby_row = FlatVector::GetData(orderby_payload.data[0]); + for (const auto &frame : frames) { + for (auto f = frame.start; f < frame.end; ++f) { + // FILTER before the ORDER BY + if (!filter_mask.RowIsValid(f)) { + continue; + } + + if (!arg_orderer->RowIsVisible(f) || orderby_count >= STANDARD_VECTOR_SIZE) { + if (orderby_count) { + orderby_sort.Reference(arg_orderer->chunk); + orderby_sort.Slice(orderby_sel, orderby_count); + orderby_payload.SetCardinality(orderby_count); + local_sort.SinkChunk(orderby_sort, orderby_payload); + } + orderby_count = 0; + arg_orderer->Seek(f); + } + orderby_row[orderby_count] = f; + orderby_sel.set_index(orderby_count++, arg_orderer->RowOffset(f)); + } + } + if (orderby_count) { + orderby_sort.Reference(arg_orderer->chunk); + orderby_sort.Slice(orderby_sel, orderby_count); + orderby_payload.SetCardinality(orderby_count); + local_sort.SinkChunk(orderby_sort, orderby_payload); + } + + global_sort.AddLocalState(local_sort); + if (global_sort.sorted_blocks.empty()) { + return; + } + 
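+ // At this point every qualifying row id has been sunk into local_sort;
+ // the merge rounds below combine the sorted runs until a single global
+ // run remains, and the payload scan that follows then replays the row
+ // ids in the argument ORDER BY order.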
global_sort.PrepareMergePhase(); + while (global_sort.sorted_blocks.size() > 1) { + global_sort.InitializeMergeRound(); + MergeSorter merge_sorter(global_sort, global_sort.buffer_manager); + merge_sorter.PerformInMergeRound(); + global_sort.CompleteMergeRound(false); + } + + PayloadScanner scanner(global_sort); + while (scanner.Remaining()) { + orderby_payload.Reset(); + scanner.Scan(orderby_payload); + orderby_row = FlatVector::GetData(orderby_payload.data[0]); + for (idx_t i = 0; i < orderby_payload.size(); ++i) { + const auto f = orderby_row[i]; + // Seek to the current position + if (!cursor->RowIsVisible(f)) { + // We need to flush when we cross a chunk boundary + FlushStates(gsink); + cursor->Seek(f); + } + + // Filter out duplicates + if (aggr.IsDistinct() && !row_set.insert(f).second) { + continue; + } + + pdata[flush_count] = agg_state; + update_sel[flush_count++] = cursor->RowOffset(f); + if (flush_count >= STANDARD_VECTOR_SIZE) { + FlushStates(gsink); + } + } + } + return; + } + + // Just update the aggregate with the unfiltered input rows for (const auto &frame : frames) { for (auto f = frame.start; f < frame.end; ++f) { if (!filter_mask.RowIsValid(f)) { diff --git a/src/duckdb/src/function/window/window_segment_tree.cpp b/src/duckdb/src/function/window/window_segment_tree.cpp index 0ccc8615b..6708ae90b 100644 --- a/src/duckdb/src/function/window/window_segment_tree.cpp +++ b/src/duckdb/src/function/window/window_segment_tree.cpp @@ -10,6 +10,18 @@ namespace duckdb { //===--------------------------------------------------------------------===// // WindowSegmentTree //===--------------------------------------------------------------------===// +bool WindowSegmentTree::CanAggregate(const BoundWindowExpression &wexpr) { + if (!wexpr.aggregate) { + return false; + } + + return !wexpr.distinct && wexpr.arg_orders.empty(); +} + +WindowSegmentTree::WindowSegmentTree(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared) + : WindowAggregator(wexpr, shared) { +} + class WindowSegmentTreeGlobalState : public WindowAggregatorGlobalState { public: using AtomicCounters = vector>; @@ -43,11 +55,6 @@ class WindowSegmentTreeGlobalState : public WindowAggregatorGlobalState { static constexpr idx_t TREE_FANOUT = 16; }; -WindowSegmentTree::WindowSegmentTree(const BoundWindowExpression &wexpr, WindowAggregationMode mode_p, - const WindowExcludeMode exclude_mode_p, WindowSharedExpressions &shared) - : WindowAggregator(wexpr, exclude_mode_p, shared), mode(mode_p) { -} - class WindowSegmentTreePart { public: //! 
Right side nodes need to be cached and processed in reverse order diff --git a/src/duckdb/src/include/duckdb/function/window/window_aggregate_function.hpp b/src/duckdb/src/include/duckdb/function/window/window_aggregate_function.hpp index b110315cd..efc782b8c 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_aggregate_function.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_aggregate_function.hpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +#pragma once + #include "duckdb/function/window/window_executor.hpp" #include "duckdb/common/enums/window_aggregation_mode.hpp" #include "duckdb/function/window/window_aggregator.hpp" @@ -17,10 +19,6 @@ class WindowAggregateExecutor : public WindowExecutor { WindowAggregateExecutor(BoundWindowExpression &wexpr, ClientContext &context, WindowSharedExpressions &shared, WindowAggregationMode mode); - bool IsConstantAggregate(); - bool IsCustomAggregate(); - bool IsDistinctAggregate(); - void Sink(DataChunk &sink_chunk, DataChunk &coll_chunk, const idx_t input_idx, WindowExecutorGlobalState &gstate, WindowExecutorLocalState &lstate) const override; void Finalize(WindowExecutorGlobalState &gstate, WindowExecutorLocalState &lstate, diff --git a/src/duckdb/src/include/duckdb/function/window/window_aggregator.hpp b/src/duckdb/src/include/duckdb/function/window/window_aggregator.hpp index c7103ce41..1caa6326d 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_aggregator.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_aggregator.hpp @@ -105,9 +105,8 @@ class WindowAggregator { } } - WindowAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode_p); - WindowAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode_p, - WindowSharedExpressions &shared); + explicit WindowAggregator(const BoundWindowExpression &wexpr); + WindowAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared); virtual ~WindowAggregator(); // Threading states @@ -144,7 +143,7 @@ class WindowAggregator { class WindowAggregatorGlobalState : public WindowAggregatorState { public: WindowAggregatorGlobalState(ClientContext &context, const WindowAggregator &aggregator_p, idx_t group_count) - : aggregator(aggregator_p), aggr(aggregator.wexpr), locals(0), finalized(0) { + : context(context), aggregator(aggregator_p), aggr(aggregator.wexpr), locals(0), finalized(0) { if (aggr.filter) { // Start with all invalid and set the ones that pass @@ -154,6 +153,9 @@ class WindowAggregatorGlobalState : public WindowAggregatorState { } } + //! The context we are in + ClientContext &context; + //! 
The aggregator data const WindowAggregator &aggregator; diff --git a/src/duckdb/src/include/duckdb/function/window/window_constant_aggregator.hpp b/src/duckdb/src/include/duckdb/function/window/window_constant_aggregator.hpp index af3c7f88c..fdf07cc7a 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_constant_aggregator.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_constant_aggregator.hpp @@ -14,8 +14,9 @@ namespace duckdb { class WindowConstantAggregator : public WindowAggregator { public: - WindowConstantAggregator(const BoundWindowExpression &wexpr, WindowExcludeMode exclude_mode_p, - WindowSharedExpressions &shared); + static bool CanAggregate(const BoundWindowExpression &wexpr); + + WindowConstantAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared); ~WindowConstantAggregator() override { } diff --git a/src/duckdb/src/include/duckdb/function/window/window_custom_aggregator.hpp b/src/duckdb/src/include/duckdb/function/window/window_custom_aggregator.hpp index dffad8483..ab664ca59 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_custom_aggregator.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_custom_aggregator.hpp @@ -14,8 +14,9 @@ namespace duckdb { class WindowCustomAggregator : public WindowAggregator { public: - WindowCustomAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode, - WindowSharedExpressions &shared); + static bool CanAggregate(const BoundWindowExpression &wexpr, WindowAggregationMode mode); + + WindowCustomAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared); ~WindowCustomAggregator() override; unique_ptr GetGlobalState(ClientContext &context, idx_t group_count, diff --git a/src/duckdb/src/include/duckdb/function/window/window_distinct_aggregator.hpp b/src/duckdb/src/include/duckdb/function/window/window_distinct_aggregator.hpp index 1cafc9252..7fe9040c0 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_distinct_aggregator.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_distinct_aggregator.hpp @@ -14,8 +14,10 @@ namespace duckdb { class WindowDistinctAggregator : public WindowAggregator { public: - WindowDistinctAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode_p, - WindowSharedExpressions &shared, ClientContext &context); + static bool CanAggregate(const BoundWindowExpression &wexpr); + + WindowDistinctAggregator(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared, + ClientContext &context); // Build unique_ptr GetGlobalState(ClientContext &context, idx_t group_count, diff --git a/src/duckdb/src/include/duckdb/function/window/window_naive_aggregator.hpp b/src/duckdb/src/include/duckdb/function/window/window_naive_aggregator.hpp index 199301bb4..1fdf497ee 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_naive_aggregator.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_naive_aggregator.hpp @@ -12,16 +12,22 @@ namespace duckdb { +class WindowAggregateExecutor; + // Used for validation class WindowNaiveAggregator : public WindowAggregator { public: - WindowNaiveAggregator(const BoundWindowExpression &wexpr, const WindowExcludeMode exclude_mode, - WindowSharedExpressions &shared); + WindowNaiveAggregator(const WindowAggregateExecutor &executor, WindowSharedExpressions &shared); ~WindowNaiveAggregator() override; unique_ptr GetLocalState(const WindowAggregatorState &gstate) const override; void Evaluate(const 
WindowAggregatorState &gsink, WindowAggregatorState &lstate, const DataChunk &bounds, Vector &result, idx_t count, idx_t row_idx) const override; + + //! The parent executor + const WindowAggregateExecutor &executor; + //! The column indices of any ORDER BY argument expressions + vector arg_order_idx; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/window/window_segment_tree.hpp b/src/duckdb/src/include/duckdb/function/window/window_segment_tree.hpp index f3a7b6f96..0b574b959 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_segment_tree.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_segment_tree.hpp @@ -13,10 +13,10 @@ namespace duckdb { class WindowSegmentTree : public WindowAggregator { - public: - WindowSegmentTree(const BoundWindowExpression &wexpr, WindowAggregationMode mode_p, - const WindowExcludeMode exclude_mode, WindowSharedExpressions &shared); + static bool CanAggregate(const BoundWindowExpression &wexpr); + + WindowSegmentTree(const BoundWindowExpression &wexpr, WindowSharedExpressions &shared); unique_ptr GetGlobalState(ClientContext &context, idx_t group_count, const ValidityMask &partition_mask) const override; @@ -26,15 +26,6 @@ class WindowSegmentTree : public WindowAggregator { void Evaluate(const WindowAggregatorState &gstate, WindowAggregatorState &lstate, const DataChunk &bounds, Vector &result, idx_t count, idx_t row_idx) const override; - -public: - //! Use the combine API, if available - inline bool UseCombineAPI() const { - return mode < WindowAggregationMode::SEPARATE; - } - - //! Use the combine API, if available - WindowAggregationMode mode; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp b/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp index adf15ead3..acd76a42f 100644 --- a/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp +++ b/src/duckdb/src/include/duckdb/planner/expression/bound_window_expression.hpp @@ -73,6 +73,7 @@ class BoundWindowExpression : public Expression { string ToString() const override; //! 
The number of ordering clauses the functions share + static idx_t GetSharedOrders(const vector &lhs, const vector &rhs); idx_t GetSharedOrders(const BoundWindowExpression &other) const; bool PartitionsAreEquivalent(const BoundWindowExpression &other) const; diff --git a/src/duckdb/src/parser/transform/expression/transform_function.cpp b/src/duckdb/src/parser/transform/expression/transform_function.cpp index ec8dad678..e410b7049 100644 --- a/src/duckdb/src/parser/transform/expression/transform_function.cpp +++ b/src/duckdb/src/parser/transform/expression/transform_function.cpp @@ -122,8 +122,8 @@ static bool IsOrderableWindowFunction(ExpressionType type) { case ExpressionType::WINDOW_CUME_DIST: case ExpressionType::WINDOW_LEAD: case ExpressionType::WINDOW_LAG: - return true; case ExpressionType::WINDOW_AGGREGATE: + return true; case ExpressionType::WINDOW_RANK_DENSE: return false; default: diff --git a/src/duckdb/src/planner/expression/bound_window_expression.cpp b/src/duckdb/src/planner/expression/bound_window_expression.cpp index 5f778db14..b189b8e6e 100644 --- a/src/duckdb/src/planner/expression/bound_window_expression.cpp +++ b/src/duckdb/src/planner/expression/bound_window_expression.cpp @@ -101,19 +101,23 @@ bool BoundWindowExpression::PartitionsAreEquivalent(const BoundWindowExpression return true; } -idx_t BoundWindowExpression::GetSharedOrders(const BoundWindowExpression &other) const { - const auto overlap = MinValue(orders.size(), other.orders.size()); +idx_t BoundWindowExpression::GetSharedOrders(const vector &lhs, const vector &rhs) { + const auto overlap = MinValue(lhs.size(), rhs.size()); idx_t result = 0; for (; result < overlap; ++result) { - if (!orders[result].Equals(other.orders[result])) { - return false; + if (!lhs.at(result).Equals(rhs.at(result))) { + return 0; } } return result; } +idx_t BoundWindowExpression::GetSharedOrders(const BoundWindowExpression &other) const { + return GetSharedOrders(orders, other.orders); +} + bool BoundWindowExpression::KeysAreCompatible(const BoundWindowExpression &other) const { if (!PartitionsAreEquivalent(other)) { return false; From 16d78de8e253570d409b0d840d8ace36764fcf78 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:42:31 +0000 Subject: [PATCH 10/21] vendor: Update vendored sources to duckdb/duckdb@c93ab6e8171b6cba652439cadc91e79d488ffd01 (#985) Backport ENTRY_VISIBILITY from duckdb/extension-template-c (duckdb/duckdb#15611) Co-authored-by: krlmlr --- .../src/function/table/version/pragma_version.cpp | 6 +++--- src/duckdb/src/include/duckdb_extension.h | 10 ++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 1742d5569..781b85416 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4200" +#define DUCKDB_PATCH_VERSION "4-dev4202" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4200" +#define DUCKDB_VERSION "v1.1.4-dev4202" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "3b44a57608" +#define DUCKDB_SOURCE_ID "c93ab6e817" #endif #include "duckdb/function/table/system_functions.hpp" #include 
"duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb_extension.h b/src/duckdb/src/include/duckdb_extension.h index 5eca823d5..6f01ef592 100644 --- a/src/duckdb/src/include/duckdb_extension.h +++ b/src/duckdb/src/include/duckdb_extension.h @@ -966,6 +966,12 @@ typedef struct { // Place in global scope of any C/C++ file that needs to access the extension API #define DUCKDB_EXTENSION_EXTERN extern duckdb_ext_api_v1 duckdb_ext_api; +#ifdef _WIN32 +#define DUCKDB_CAPI_ENTRY_VISIBILITY __declspec(dllexport) +#else +#define DUCKDB_CAPI_ENTRY_VISIBILITY __attribute__((visibility("default"))) +#endif + //===--------------------------------------------------------------------===// // Entrypoint Macros //===--------------------------------------------------------------------===// @@ -979,7 +985,7 @@ typedef struct { #define DUCKDB_EXTENSION_ENTRYPOINT \ DUCKDB_EXTENSION_GLOBAL static bool DUCKDB_EXTENSION_GLUE(DUCKDB_EXTENSION_NAME, _init_c_api_internal)( \ duckdb_connection connection, duckdb_extension_info info, struct duckdb_extension_access * access); \ - DUCKDB_EXTENSION_EXTERN_C_GUARD_OPEN DUCKDB_EXTENSION_API bool DUCKDB_EXTENSION_GLUE( \ + DUCKDB_EXTENSION_EXTERN_C_GUARD_OPEN DUCKDB_CAPI_ENTRY_VISIBILITY DUCKDB_EXTENSION_API bool DUCKDB_EXTENSION_GLUE( \ DUCKDB_EXTENSION_NAME, _init_c_api)(duckdb_extension_info info, struct duckdb_extension_access * access) { \ DUCKDB_EXTENSION_API_INIT(info, access, DUCKDB_EXTENSION_API_VERSION_STRING); \ duckdb_database *db = access->get_database(info); \ @@ -998,7 +1004,7 @@ typedef struct { #define DUCKDB_EXTENSION_ENTRYPOINT_CUSTOM \ DUCKDB_EXTENSION_GLOBAL static bool DUCKDB_EXTENSION_GLUE(DUCKDB_EXTENSION_NAME, _init_c_api_internal)( \ duckdb_extension_info info, struct duckdb_extension_access * access); \ - DUCKDB_EXTENSION_EXTERN_C_GUARD_OPEN DUCKDB_EXTENSION_API bool DUCKDB_EXTENSION_GLUE( \ + DUCKDB_EXTENSION_EXTERN_C_GUARD_OPEN DUCKDB_CAPI_ENTRY_VISIBILITY DUCKDB_EXTENSION_API bool DUCKDB_EXTENSION_GLUE( \ DUCKDB_EXTENSION_NAME, _init_c_api)(duckdb_extension_info info, struct duckdb_extension_access * access) { \ DUCKDB_EXTENSION_API_INIT(info, access, DUCKDB_EXTENSION_API_VERSION_STRING); \ return DUCKDB_EXTENSION_GLUE(DUCKDB_EXTENSION_NAME, _init_c_api_internal)(info, access); \ From 99a79b04312f06cd15605a02f6eb9b5f8b68fba3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:06:14 +0000 Subject: [PATCH 11/21] vendor: Update vendored sources to duckdb/duckdb@b48222edb439e42517392eaa2e918b625cafd09a (#986) Adjust list_reduce to use a 1-based indexing (duckdb/duckdb#15614) Co-authored-by: krlmlr --- .../extension/core_functions/scalar/list/list_reduce.cpp | 2 +- src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp b/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp index f8758515f..173b52699 100644 --- a/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp +++ b/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp @@ -101,7 +101,7 @@ static bool ExecuteReduce(idx_t loops, ReduceExecuteInfo &execute_info, LambdaFu } // create the index vector - Vector index_vector(Value::BIGINT(UnsafeNumericCast(loops + 1))); + Vector index_vector(Value::BIGINT(UnsafeNumericCast(loops + 2))); // slice the left and right slice execute_info.left_slice->Slice(*execute_info.left_slice, 
execute_info.left_sel, reduced_row_idx); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 781b85416..3f58a34d9 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4202" +#define DUCKDB_PATCH_VERSION "4-dev4204" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4202" +#define DUCKDB_VERSION "v1.1.4-dev4204" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "c93ab6e817" +#define DUCKDB_SOURCE_ID "b48222edb4" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" From b10616d051e99d9754fee521a780e76246e5bded Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:33:51 +0000 Subject: [PATCH 12/21] vendor: Update vendored sources to duckdb/duckdb@b5393f7d0f8234028f9719889d33178ef793279f (#987) window: fix nullptr dereference (duckdb/duckdb#15610) Co-authored-by: krlmlr --- src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++--- src/duckdb/src/function/window/window_index_tree.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 3f58a34d9..4f5e368f6 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4204" +#define DUCKDB_PATCH_VERSION "4-dev4206" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4204" +#define DUCKDB_VERSION "v1.1.4-dev4206" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "b48222edb4" +#define DUCKDB_SOURCE_ID "b5393f7d0f" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/function/window/window_index_tree.cpp b/src/duckdb/src/function/window/window_index_tree.cpp index c71984f50..5791b2af7 100644 --- a/src/duckdb/src/function/window/window_index_tree.cpp +++ b/src/duckdb/src/function/window/window_index_tree.cpp @@ -56,7 +56,7 @@ idx_t WindowIndexTree::SelectNth(const SubFrames &frames, idx_t n) const { if (mst32) { return mst32->NthElement(mst32->SelectNth(frames, n)); } else { - return mst64->NthElement(mst32->SelectNth(frames, n)); + return mst64->NthElement(mst64->SelectNth(frames, n)); } } From 8cf65d4b8dcf9e085affc6981353c3f1c269d69c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:10:16 +0000 Subject: [PATCH 13/21] vendor: Update vendored sources to duckdb/duckdb@b89f77c881fcdb8018bf3562c6de00b9e9849451 (#988) Implement `simple_update` for `first` aggregate function (duckdb/duckdb#15619) Build/test/distribute linux_amd64_musl core extensions (duckdb/duckdb#15607) Improve reading duplicate column names in JSON (duckdb/duckdb#15615) Co-authored-by: krlmlr --- .../aggregate/distributive/first_last_any.cpp | 16 +++++++++++++++- 
.../function/table/version/pragma_version.cpp | 6 +++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/duckdb/src/function/aggregate/distributive/first_last_any.cpp b/src/duckdb/src/function/aggregate/distributive/first_last_any.cpp index d8cb58e32..4e7455efa 100644 --- a/src/duckdb/src/function/aggregate/distributive/first_last_any.cpp +++ b/src/duckdb/src/function/aggregate/distributive/first_last_any.cpp @@ -216,9 +216,23 @@ struct FirstVectorFunction : FirstFunctionStringBase { } }; +template +static void FirstFunctionSimpleUpdate(Vector inputs[], AggregateInputData &aggregate_input_data, idx_t input_count, + data_ptr_t state, idx_t count) { + auto agg_state = reinterpret_cast *>(state); + if (LAST || !agg_state->is_set) { + // For FIRST, this skips looping over the input once the aggregate state has been set + // FIXME: for LAST we could loop from the back of the Vector instead + AggregateFunction::UnaryUpdate, T, FirstFunction>(inputs, aggregate_input_data, + input_count, state, count); + } +} + template static AggregateFunction GetFirstAggregateTemplated(LogicalType type) { - return AggregateFunction::UnaryAggregate, T, T, FirstFunction>(type, type); + auto result = AggregateFunction::UnaryAggregate, T, T, FirstFunction>(type, type); + result.simple_update = FirstFunctionSimpleUpdate; + return result; } template diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 4f5e368f6..a79fca7dd 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4206" +#define DUCKDB_PATCH_VERSION "4-dev4217" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4206" +#define DUCKDB_VERSION "v1.1.4-dev4217" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "b5393f7d0f" +#define DUCKDB_SOURCE_ID "b89f77c881" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" From afa1140fa101a35162a156533f1d38840af53537 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:44:51 +0000 Subject: [PATCH 14/21] vendor: Update vendored sources to duckdb/duckdb@5ec2adafa54af246814bd869bb6f48073a2f660b (#989) Issue duckdb/duckdb#15596: Infinite Value Checks (duckdb/duckdb#15620) Co-authored-by: krlmlr --- .../extension/core_functions/scalar/date/make_date.cpp | 4 ++++ src/duckdb/src/common/types/timestamp.cpp | 6 +++++- src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/duckdb/extension/core_functions/scalar/date/make_date.cpp b/src/duckdb/extension/core_functions/scalar/date/make_date.cpp index 71591a227..0fe00a920 100644 --- a/src/duckdb/extension/core_functions/scalar/date/make_date.cpp +++ b/src/duckdb/extension/core_functions/scalar/date/make_date.cpp @@ -103,6 +103,10 @@ struct MakeTimestampOperator { template static RESULT_TYPE Operation(T value) { + const auto result = RESULT_TYPE(value); + if (!Timestamp::IsFinite(result)) { + throw ConversionException("Timestamp microseconds out of range: %ld", value); + } return RESULT_TYPE(value); } }; diff --git a/src/duckdb/src/common/types/timestamp.cpp 
b/src/duckdb/src/common/types/timestamp.cpp index dbc40a6e8..f90c0f77f 100644 --- a/src/duckdb/src/common/types/timestamp.cpp +++ b/src/duckdb/src/common/types/timestamp.cpp @@ -176,7 +176,11 @@ bool Timestamp::TryFromTimestampNanos(timestamp_t input, int32_t nanos, timestam return false; } - return TryAddOperator::Operation(result.value, int64_t(nanos), result.value); + if (!TryAddOperator::Operation(result.value, int64_t(nanos), result.value)) { + return false; + } + + return IsFinite(result); } TimestampCastResult Timestamp::TryConvertTimestamp(const char *str, idx_t len, timestamp_ns_t &result) { diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index a79fca7dd..ebc810e14 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4217" +#define DUCKDB_PATCH_VERSION "4-dev4220" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4217" +#define DUCKDB_VERSION "v1.1.4-dev4220" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "b89f77c881" +#define DUCKDB_SOURCE_ID "5ec2adafa5" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" From 3b818fc7f736662875b6b763eae7b88a391da328 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:07:22 +0000 Subject: [PATCH 15/21] vendor: Update vendored sources to duckdb/duckdb@bdf0f8c7231c44ed17bc2ede63498b3ef8d85d35 (#990) Issue duckdb/duckdb#15610: Wide Secondary Ordering (duckdb/duckdb#15625) Co-authored-by: krlmlr --- src/duckdb/src/function/table/version/pragma_version.cpp | 6 +++--- src/duckdb/src/function/window/window_merge_sort_tree.cpp | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index ebc810e14..d38747335 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4220" +#define DUCKDB_PATCH_VERSION "4-dev4222" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4220" +#define DUCKDB_VERSION "v1.1.4-dev4222" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "5ec2adafa5" +#define DUCKDB_SOURCE_ID "bdf0f8c723" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/function/window/window_merge_sort_tree.cpp b/src/duckdb/src/function/window/window_merge_sort_tree.cpp index b2d7db4d9..ef22694c4 100644 --- a/src/duckdb/src/function/window/window_merge_sort_tree.cpp +++ b/src/duckdb/src/function/window/window_merge_sort_tree.cpp @@ -11,8 +11,9 @@ WindowMergeSortTree::WindowMergeSortTree(ClientContext &context, const vector::max()) { + if (count < std::numeric_limits::max() && !force_external) { index_type = LogicalType::INTEGER; mst32 = make_uniq(); } else { @@ -40,7 +41,7 @@ WindowMergeSortTree::WindowMergeSortTree(ClientContext &context, const vector(buffer_manager, 
orders, payload_layout); } - global_sort->external = ClientConfig::GetConfig(context).force_external; + global_sort->external = force_external; } optional_ptr WindowMergeSortTree::AddLocalSort() { From 4b3abdc73e7046f7a8d692111e3937a215518203 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:37:11 +0000 Subject: [PATCH 16/21] vendor: Update vendored sources to duckdb/duckdb@00cbb18d634352d1ae0cdf4dd42d587d187f5a04 (#991) [C API] Expose the DB instance cache in the C API (duckdb/duckdb#15579) Issue template updates (duckdb/duckdb#15618) Co-authored-by: krlmlr --- .../function/table/version/pragma_version.cpp | 6 +- src/duckdb/src/include/duckdb.h | 53 ++++++++++++--- .../duckdb/main/capi/capi_internal.hpp | 9 ++- .../duckdb/main/capi/extension_api.hpp | 10 +++ src/duckdb/src/include/duckdb_extension.h | 14 ++++ src/duckdb/src/main/capi/duckdb-c.cpp | 67 +++++++++++++++---- .../src/main/capi/replacement_scan-c.cpp | 2 +- src/duckdb/src/main/capi/threading-c.cpp | 8 +-- .../src/main/extension/extension_load.cpp | 6 +- 9 files changed, 141 insertions(+), 34 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index d38747335..5f0897e3d 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4222" +#define DUCKDB_PATCH_VERSION "4-dev4230" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4222" +#define DUCKDB_VERSION "v1.1.4-dev4230" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "bdf0f8c723" +#define DUCKDB_SOURCE_ID "00cbb18d63" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb.h b/src/duckdb/src/include/duckdb.h index 5efd2bd6b..ae52abd33 100644 --- a/src/duckdb/src/include/duckdb.h +++ b/src/duckdb/src/include/duckdb.h @@ -441,7 +441,12 @@ typedef struct { void *internal_data; } duckdb_result; -//! A database object. Should be closed with `duckdb_close`. +//! A database instance cache object. Must be destroyed with `duckdb_destroy_instance_cache`. +typedef struct _duckdb_instance_cache { + void *internal_ptr; +} * duckdb_instance_cache; + +//! A database object. Must be closed with `duckdb_close`. typedef struct _duckdb_database { void *internal_ptr; } * duckdb_database; @@ -680,12 +685,44 @@ struct duckdb_extension_access { // Open Connect //===--------------------------------------------------------------------===// +/*! +Creates a new database instance cache. +The instance cache is necessary if a client/program (re)opens multiple databases to the same file within the same +process. Must be destroyed with 'duckdb_destroy_instance_cache'. + +* @return The database instance cache. +*/ +DUCKDB_API duckdb_instance_cache duckdb_create_instance_cache(); + +/*! +Creates a new database instance in the instance cache, or retrieves an existing database instance. +Must be closed with 'duckdb_close'. + +* @param instance_cache The instance cache in which to create the database, or from which to take the database. +* @param path Path to the database file on disk. Both `nullptr` and `:memory:` open or retrieve an in-memory database. 
+* @param out_database The resulting cached database. +* @param config (Optional) configuration used to create the database. +* @param out_error If set and the function returns `DuckDBError`, this contains the error message. +Note that the error message must be freed using `duckdb_free`. +* @return `DuckDBSuccess` on success or `DuckDBError` on failure. +*/ +DUCKDB_API duckdb_state duckdb_get_or_create_from_cache(duckdb_instance_cache instance_cache, const char *path, + duckdb_database *out_database, duckdb_config config, + char **out_error); + +/*! +Destroys an existing database instance cache and de-allocates its memory. + +* @param instance_cache The instance cache to destroy. +*/ +DUCKDB_API void duckdb_destroy_instance_cache(duckdb_instance_cache *instance_cache); + /*! Creates a new database or opens an existing database file stored at the given path. If no path is given a new in-memory database is created instead. -The instantiated database should be closed with 'duckdb_close'. +The database must be closed with 'duckdb_close'. -* @param path Path to the database file on disk, or `nullptr` or `:memory:` to open an in-memory database. +* @param path Path to the database file on disk. Both `nullptr` and `:memory:` open an in-memory database. * @param out_database The result database object. * @return `DuckDBSuccess` on success or `DuckDBError` on failure. */ @@ -693,13 +730,13 @@ DUCKDB_API duckdb_state duckdb_open(const char *path, duckdb_database *out_datab /*! Extended version of duckdb_open. Creates a new database or opens an existing database file stored at the given path. -The instantiated database should be closed with 'duckdb_close'. +The database must be closed with 'duckdb_close'. -* @param path Path to the database file on disk, or `nullptr` or `:memory:` to open an in-memory database. +* @param path Path to the database file on disk. Both `nullptr` and `:memory:` open an in-memory database. * @param out_database The result database object. -* @param config (Optional) configuration used to start up the database system. -* @param out_error If set and the function returns DuckDBError, this will contain the reason why the start-up failed. -Note that the error must be freed using `duckdb_free`. +* @param config (Optional) configuration used to start up the database. +* @param out_error If set and the function returns `DuckDBError`, this contains the error message. +Note that the error message must be freed using `duckdb_free`. * @return `DuckDBSuccess` on success or `DuckDBError` on failure. 
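As a purely illustrative sketch (an editorial example, not part of the upstream header; `my.db` is a placeholder path), a minimal open/close sequence might look like:

    duckdb_database db;
    char *error = NULL;
    if (duckdb_open_ext("my.db", &db, NULL, &error) == DuckDBSuccess) {
        /* ... connect and run queries ... */
        duckdb_close(&db);
    } else {
        duckdb_free(error); /* the message is allocated by the library */
    }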
*/ DUCKDB_API duckdb_state duckdb_open_ext(const char *path, duckdb_database *out_database, duckdb_config config, diff --git a/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp b/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp index f2d825f29..c00425420 100644 --- a/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp +++ b/src/duckdb/src/include/duckdb/main/capi/capi_internal.hpp @@ -16,6 +16,7 @@ #include "duckdb/common/case_insensitive_map.hpp" #include "duckdb/main/client_context.hpp" #include "duckdb/planner/expression/bound_parameter_data.hpp" +#include "duckdb/main/db_instance_cache.hpp" #include #include @@ -28,8 +29,12 @@ namespace duckdb { -struct DatabaseData { - unique_ptr database; +struct DBInstanceCacheWrapper { + unique_ptr instance_cache; +}; + +struct DatabaseWrapper { + shared_ptr database; }; struct PreparedStatementWrapper { diff --git a/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp b/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp index 6a8bb4bca..71e6d18bc 100644 --- a/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp +++ b/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp @@ -460,6 +460,13 @@ typedef struct { duckdb_arrow_schema arrow_schema, duckdb_arrow_array arrow_array, duckdb_arrow_stream *out_stream); duckdb_data_chunk (*duckdb_stream_fetch_chunk)(duckdb_result result); + // Exposing the instance cache + + duckdb_instance_cache (*duckdb_create_instance_cache)(); + duckdb_state (*duckdb_get_or_create_from_cache)(duckdb_instance_cache instance_cache, const char *path, + duckdb_database *out_database, duckdb_config config, + char **out_error); + void (*duckdb_destroy_instance_cache)(duckdb_instance_cache *instance_cache); // New append functions that are added duckdb_state (*duckdb_append_default_to_chunk)(duckdb_appender appender, duckdb_data_chunk chunk, idx_t col, @@ -875,6 +882,9 @@ inline duckdb_ext_api_v1 CreateAPIv1() { result.duckdb_arrow_scan = duckdb_arrow_scan; result.duckdb_arrow_array_scan = duckdb_arrow_array_scan; result.duckdb_stream_fetch_chunk = duckdb_stream_fetch_chunk; + result.duckdb_create_instance_cache = duckdb_create_instance_cache; + result.duckdb_get_or_create_from_cache = duckdb_get_or_create_from_cache; + result.duckdb_destroy_instance_cache = duckdb_destroy_instance_cache; result.duckdb_append_default_to_chunk = duckdb_append_default_to_chunk; return result; } diff --git a/src/duckdb/src/include/duckdb_extension.h b/src/duckdb/src/include/duckdb_extension.h index 6f01ef592..47c6a7f87 100644 --- a/src/duckdb/src/include/duckdb_extension.h +++ b/src/duckdb/src/include/duckdb_extension.h @@ -527,6 +527,15 @@ typedef struct { duckdb_data_chunk (*duckdb_stream_fetch_chunk)(duckdb_result result); #endif +// Exposing the instance cache +#ifdef DUCKDB_EXTENSION_API_VERSION_UNSTABLE + duckdb_instance_cache (*duckdb_create_instance_cache)(); + duckdb_state (*duckdb_get_or_create_from_cache)(duckdb_instance_cache instance_cache, const char *path, + duckdb_database *out_database, duckdb_config config, + char **out_error); + void (*duckdb_destroy_instance_cache)(duckdb_instance_cache *instance_cache); +#endif + // New append functions that are added #ifdef DUCKDB_EXTENSION_API_VERSION_UNSTABLE duckdb_state (*duckdb_append_default_to_chunk)(duckdb_appender appender, duckdb_data_chunk chunk, idx_t col, @@ -947,6 +956,11 @@ typedef struct { #define duckdb_arrow_array_scan duckdb_ext_api.duckdb_arrow_array_scan #define duckdb_stream_fetch_chunk duckdb_ext_api.duckdb_stream_fetch_chunk +// 
Version unstable_instance_cache +#define duckdb_create_instance_cache duckdb_ext_api.duckdb_create_instance_cache +#define duckdb_get_or_create_from_cache duckdb_ext_api.duckdb_get_or_create_from_cache +#define duckdb_destroy_instance_cache duckdb_ext_api.duckdb_destroy_instance_cache + // Version unstable_new_append_functions #define duckdb_append_default_to_chunk duckdb_ext_api.duckdb_append_default_to_chunk diff --git a/src/duckdb/src/main/capi/duckdb-c.cpp b/src/duckdb/src/main/capi/duckdb-c.cpp index ac944578c..ad47d7448 100644 --- a/src/duckdb/src/main/capi/duckdb-c.cpp +++ b/src/duckdb/src/main/capi/duckdb-c.cpp @@ -1,49 +1,88 @@ #include "duckdb/main/capi/capi_internal.hpp" using duckdb::Connection; -using duckdb::DatabaseData; +using duckdb::DatabaseWrapper; using duckdb::DBConfig; +using duckdb::DBInstanceCacheWrapper; using duckdb::DuckDB; using duckdb::ErrorData; -duckdb_state duckdb_open_ext(const char *path, duckdb_database *out, duckdb_config config, char **error) { - auto wrapper = new DatabaseData(); +duckdb_instance_cache duckdb_create_instance_cache() { + auto wrapper = new DBInstanceCacheWrapper(); + wrapper->instance_cache = duckdb::make_uniq(); + return reinterpret_cast(wrapper); +} + +void duckdb_destroy_instance_cache(duckdb_instance_cache *instance_cache) { + if (instance_cache && *instance_cache) { + auto wrapper = reinterpret_cast(*instance_cache); + delete wrapper; + *instance_cache = nullptr; + } +} + +duckdb_state duckdb_open_internal(DBInstanceCacheWrapper *cache, const char *path, duckdb_database *out, + duckdb_config config, char **out_error) { + auto wrapper = new DatabaseWrapper(); try { DBConfig default_config; default_config.SetOptionByName("duckdb_api", "capi"); DBConfig *db_config = &default_config; - DBConfig *user_config = (DBConfig *)config; + DBConfig *user_config = reinterpret_cast(config); if (user_config) { db_config = user_config; } - wrapper->database = duckdb::make_uniq(path, db_config); + if (cache) { + wrapper->database = cache->instance_cache->GetOrCreateInstance(path, *db_config, true); + } else { + wrapper->database = duckdb::make_shared_ptr(path, db_config); + } + } catch (std::exception &ex) { - if (error) { + if (out_error) { ErrorData parsed_error(ex); - *error = strdup(parsed_error.Message().c_str()); + *out_error = strdup(parsed_error.Message().c_str()); } delete wrapper; return DuckDBError; + } catch (...) 
{ // LCOV_EXCL_START - if (error) { - *error = strdup("Unknown error"); + if (out_error) { + *out_error = strdup("Unknown error"); } delete wrapper; return DuckDBError; } // LCOV_EXCL_STOP - *out = (duckdb_database)wrapper; + + *out = reinterpret_cast(wrapper); return DuckDBSuccess; } +duckdb_state duckdb_get_or_create_from_cache(duckdb_instance_cache instance_cache, const char *path, + duckdb_database *out_database, duckdb_config config, char **out_error) { + if (!instance_cache) { + if (out_error) { + *out_error = strdup("instance cache cannot be nullptr"); + } + return DuckDBError; + } + auto cache = reinterpret_cast(instance_cache); + return duckdb_open_internal(cache, path, out_database, config, out_error); +} + +duckdb_state duckdb_open_ext(const char *path, duckdb_database *out, duckdb_config config, char **error) { + return duckdb_open_internal(nullptr, path, out, config, error); +} + duckdb_state duckdb_open(const char *path, duckdb_database *out) { return duckdb_open_ext(path, out, nullptr, nullptr); } void duckdb_close(duckdb_database *database) { if (database && *database) { - auto wrapper = reinterpret_cast(*database); + auto wrapper = reinterpret_cast(*database); delete wrapper; *database = nullptr; } @@ -53,14 +92,16 @@ duckdb_state duckdb_connect(duckdb_database database, duckdb_connection *out) { if (!database || !out) { return DuckDBError; } - auto wrapper = reinterpret_cast(database); + + auto wrapper = reinterpret_cast(database); Connection *connection; try { connection = new Connection(*wrapper->database); } catch (...) { // LCOV_EXCL_START return DuckDBError; } // LCOV_EXCL_STOP - *out = (duckdb_connection)connection; + + *out = reinterpret_cast(connection); return DuckDBSuccess; } diff --git a/src/duckdb/src/main/capi/replacement_scan-c.cpp b/src/duckdb/src/main/capi/replacement_scan-c.cpp index ff3eae4ff..5f23a3169 100644 --- a/src/duckdb/src/main/capi/replacement_scan-c.cpp +++ b/src/duckdb/src/main/capi/replacement_scan-c.cpp @@ -58,7 +58,7 @@ void duckdb_add_replacement_scan(duckdb_database db, duckdb_replacement_callback if (!db || !replacement) { return; } - auto wrapper = reinterpret_cast(db); + auto wrapper = reinterpret_cast(db); auto scan_info = duckdb::make_uniq(); scan_info->callback = replacement; scan_info->extra_data = extra_data; diff --git a/src/duckdb/src/main/capi/threading-c.cpp b/src/duckdb/src/main/capi/threading-c.cpp index b0a002f18..575f22c3c 100644 --- a/src/duckdb/src/main/capi/threading-c.cpp +++ b/src/duckdb/src/main/capi/threading-c.cpp @@ -1,10 +1,10 @@ #include "duckdb/main/capi/capi_internal.hpp" #include "duckdb/parallel/task_scheduler.hpp" -using duckdb::DatabaseData; +using duckdb::DatabaseWrapper; struct CAPITaskState { - CAPITaskState(duckdb::DatabaseInstance &db) + explicit CAPITaskState(duckdb::DatabaseInstance &db) : db(db), marker(duckdb::make_uniq>(true)), execute_count(0) { } @@ -17,7 +17,7 @@ void duckdb_execute_tasks(duckdb_database database, idx_t max_tasks) { if (!database) { return; } - auto wrapper = (DatabaseData *)database; + auto wrapper = reinterpret_cast(database); auto &scheduler = duckdb::TaskScheduler::GetScheduler(*wrapper->database->instance); scheduler.ExecuteTasks(max_tasks); } @@ -26,7 +26,7 @@ duckdb_task_state duckdb_create_task_state(duckdb_database database) { if (!database) { return nullptr; } - auto wrapper = (DatabaseData *)database; + auto wrapper = reinterpret_cast(database); auto state = new CAPITaskState(*wrapper->database->instance); return state; } diff --git 
a/src/duckdb/src/main/extension/extension_load.cpp b/src/duckdb/src/main/extension/extension_load.cpp index a7a7c62fe..23101f722 100644 --- a/src/duckdb/src/main/extension/extension_load.cpp +++ b/src/duckdb/src/main/extension/extension_load.cpp @@ -48,7 +48,7 @@ struct DuckDBExtensionLoadState { //! This is the duckdb_database struct that will be passed to the extension during initialization. Note that the //! extension does not need to free it. - unique_ptr database_data; + unique_ptr database_data; //! The function pointer struct passed to the extension. The extension is expected to copy this struct during //! initialization @@ -88,8 +88,8 @@ struct ExtensionAccess { try { // Create the duckdb_database - load_state.database_data = make_uniq(); - load_state.database_data->database = make_uniq(load_state.db); + load_state.database_data = make_uniq(); + load_state.database_data->database = make_shared_ptr(load_state.db); return reinterpret_cast(load_state.database_data.get()); } catch (std::exception &ex) { load_state.error_data = ErrorData(ex); From 06dd61fc1a68cc98054011bd26af17f667032f18 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:58:41 +0000 Subject: [PATCH 17/21] vendor: Update vendored sources to duckdb/duckdb@f850786ddef96177354ca8183280bc0de62a66c0 (#992) File url scheme (duckdb/duckdb#15563) Co-authored-by: krlmlr --- src/duckdb/src/common/file_system.cpp | 8 +- src/duckdb/src/common/local_file_system.cpp | 154 ++++++++++++++---- .../function/table/version/pragma_version.cpp | 6 +- .../duckdb/common/local_file_system.hpp | 3 + 4 files changed, 131 insertions(+), 40 deletions(-) diff --git a/src/duckdb/src/common/file_system.cpp b/src/duckdb/src/common/file_system.cpp index 26ac7fee3..fbcf0cede 100644 --- a/src/duckdb/src/common/file_system.cpp +++ b/src/duckdb/src/common/file_system.cpp @@ -115,7 +115,7 @@ string FileSystem::GetEnvVariable(const string &name) { bool FileSystem::IsPathAbsolute(const string &path) { auto path_separator = PathSeparator(path); - return PathMatched(path, path_separator); + return PathMatched(path, path_separator) || StringUtil::StartsWith(path, "file:/"); } string FileSystem::PathSeparator(const string &path) { @@ -237,7 +237,11 @@ string FileSystem::NormalizeAbsolutePath(const string &path) { } string FileSystem::PathSeparator(const string &path) { - return "\\"; + if (StringUtil::StartsWith(path, "file:")) { + return "/"; + } else { + return "\\"; + } } void FileSystem::SetWorkingDirectory(const string &path) { diff --git a/src/duckdb/src/common/local_file_system.cpp b/src/duckdb/src/common/local_file_system.cpp index 2827b33e3..83e242526 100644 --- a/src/duckdb/src/common/local_file_system.cpp +++ b/src/duckdb/src/common/local_file_system.cpp @@ -59,13 +59,13 @@ extern "C" WINBASEAPI BOOL QueryFullProcessImageNameW(HANDLE, DWORD, LPWSTR, PDW #endif namespace duckdb { - #ifndef _WIN32 bool LocalFileSystem::FileExists(const string &filename, optional_ptr opener) { if (!filename.empty()) { - if (access(filename.c_str(), 0) == 0) { + auto normalized_file = NormalizeLocalPath(filename); + if (access(normalized_file, 0) == 0) { struct stat status; - stat(filename.c_str(), &status); + stat(normalized_file, &status); if (S_ISREG(status.st_mode)) { return true; } @@ -77,9 +77,10 @@ bool LocalFileSystem::FileExists(const string &filename, optional_ptr opener) { if (!filename.empty()) { - if (access(filename.c_str(), 0) == 0) { + auto normalized_file = 
NormalizeLocalPath(filename); + if (access(normalized_file, 0) == 0) { struct stat status; - stat(filename.c_str(), &status); + stat(normalized_file, &status); if (S_ISFIFO(status.st_mode)) { return true; } @@ -90,8 +91,21 @@ bool LocalFileSystem::IsPipe(const string &filename, optional_ptr op } #else +static std::wstring NormalizePathAndConvertToUnicode(const string &path) { + string normalized_path_copy; + const char *normalized_path; + if (StringUtil::StartsWith(path, "file:/")) { + normalized_path_copy = LocalFileSystem::NormalizeLocalPath(path); + normalized_path_copy = LocalFileSystem().ConvertSeparators(normalized_path_copy); + normalized_path = normalized_path_copy.c_str(); + } else { + normalized_path = path.c_str(); + } + return WindowsUtil::UTF8ToUnicode(normalized_path); +} + bool LocalFileSystem::FileExists(const string &filename, optional_ptr opener) { - auto unicode_path = WindowsUtil::UTF8ToUnicode(filename.c_str()); + auto unicode_path = NormalizePathAndConvertToUnicode(filename); const wchar_t *wpath = unicode_path.c_str(); if (_waccess(wpath, 0) == 0) { struct _stati64 status; @@ -103,7 +117,7 @@ bool LocalFileSystem::FileExists(const string &filename, optional_ptr opener) { - auto unicode_path = WindowsUtil::UTF8ToUnicode(filename.c_str()); + auto unicode_path = NormalizePathAndConvertToUnicode(filename); const wchar_t *wpath = unicode_path.c_str(); if (_waccess(wpath, 0) == 0) { struct _stati64 status; @@ -180,7 +194,7 @@ static string AdditionalProcessInfo(FileSystem &fs, pid_t pid) { } string process_name, process_owner; -// macOS >= 10.7 has PROC_PIDT_SHORTBSDINFO + // macOS >= 10.7 has PROC_PIDT_SHORTBSDINFO #ifdef PROC_PIDT_SHORTBSDINFO // try to find out more about the process holding the lock struct proc_bsdshortinfo proc; @@ -261,10 +275,11 @@ static string AdditionalProcessInfo(FileSystem &fs, pid_t pid) { bool LocalFileSystem::IsPrivateFile(const string &path_p, FileOpener *opener) { auto path = FileSystem::ExpandPath(path_p, opener); + auto normalized_path = NormalizeLocalPath(path); struct stat st; - if (lstat(path.c_str(), &st) != 0) { + if (lstat(normalized_path, &st) != 0) { throw IOException( "Failed to stat '%s' when checking file permissions, file may be missing or have incorrect permissions", path.c_str()); @@ -281,6 +296,7 @@ bool LocalFileSystem::IsPrivateFile(const string &path_p, FileOpener *opener) { unique_ptr LocalFileSystem::OpenFile(const string &path_p, FileOpenFlags flags, optional_ptr opener) { auto path = FileSystem::ExpandPath(path_p, opener); + auto normalized_path = NormalizeLocalPath(path); if (flags.Compression() != FileCompressionType::UNCOMPRESSED) { throw NotImplementedException("Unsupported compression type for default file system"); } @@ -338,7 +354,7 @@ unique_ptr LocalFileSystem::OpenFile(const string &path_p, FileOpenF } // Open the file - int fd = open(path.c_str(), open_flags, filesec); + int fd = open(normalized_path, open_flags, filesec); if (fd == -1) { if (flags.ReturnNullIfNotExists() && errno == ENOENT) { @@ -560,9 +576,10 @@ void LocalFileSystem::Truncate(FileHandle &handle, int64_t new_size) { bool LocalFileSystem::DirectoryExists(const string &directory, optional_ptr opener) { if (!directory.empty()) { - if (access(directory.c_str(), 0) == 0) { + auto normalized_dir = NormalizeLocalPath(directory); + if (access(normalized_dir, 0) == 0) { struct stat status; - stat(directory.c_str(), &status); + stat(normalized_dir, &status); if (status.st_mode & S_IFDIR) { return true; } @@ -575,9 +592,10 @@ bool 
LocalFileSystem::DirectoryExists(const string &directory, optional_ptr opener) { struct stat st; - if (stat(directory.c_str(), &st) != 0) { + auto normalized_dir = NormalizeLocalPath(directory); + if (stat(normalized_dir, &st) != 0) { /* Directory does not exist. EEXIST for race condition */ - if (mkdir(directory.c_str(), 0755) != 0 && errno != EEXIST) { + if (mkdir(normalized_dir, 0755) != 0 && errno != EEXIST) { throw IOException("Failed to create directory \"%s\": %s", {{"errno", std::to_string(errno)}}, directory, strerror(errno)); } @@ -628,11 +646,13 @@ int RemoveDirectoryRecursive(const char *path) { } void LocalFileSystem::RemoveDirectory(const string &directory, optional_ptr opener) { - RemoveDirectoryRecursive(directory.c_str()); + auto normalized_dir = NormalizeLocalPath(directory); + RemoveDirectoryRecursive(normalized_dir); } void LocalFileSystem::RemoveFile(const string &filename, optional_ptr opener) { - if (std::remove(filename.c_str()) != 0) { + auto normalized_file = NormalizeLocalPath(filename); + if (std::remove(normalized_file) != 0) { throw IOException("Could not remove file \"%s\": %s", {{"errno", std::to_string(errno)}}, filename, strerror(errno)); } @@ -640,7 +660,8 @@ void LocalFileSystem::RemoveFile(const string &filename, optional_ptr &callback, FileOpener *opener) { - auto dir = opendir(directory.c_str()); + auto normalized_dir = NormalizeLocalPath(directory); + auto dir = opendir(normalized_dir); if (!dir) { return false; } @@ -657,7 +678,7 @@ bool LocalFileSystem::ListFiles(const string &directory, const std::function opener) { + auto normalized_source = NormalizeLocalPath(source); + auto normalized_target = NormalizeLocalPath(target); //! FIXME: rename does not guarantee atomicity or overwriting target file if it exists - if (rename(source.c_str(), target.c_str()) != 0) { + if (rename(normalized_source, normalized_target) != 0) { throw IOException("Could not rename file!", {{"errno", std::to_string(errno)}}); } } @@ -808,6 +831,7 @@ bool LocalFileSystem::IsPrivateFile(const string &path_p, FileOpener *opener) { unique_ptr LocalFileSystem::OpenFile(const string &path_p, FileOpenFlags flags, optional_ptr opener) { auto path = FileSystem::ExpandPath(path_p, opener); + auto unicode_path = NormalizePathAndConvertToUnicode(path); if (flags.Compression() != FileCompressionType::UNCOMPRESSED) { throw NotImplementedException("Unsupported compression type for default file system"); } @@ -841,7 +865,6 @@ unique_ptr LocalFileSystem::OpenFile(const string &path_p, FileOpenF if (flags.DirectIO()) { flags_and_attributes |= FILE_FLAG_NO_BUFFERING; } - auto unicode_path = WindowsUtil::UTF8ToUnicode(path.c_str()); HANDLE hFile = CreateFileW(unicode_path.c_str(), desired_access, share_mode, NULL, creation_disposition, flags_and_attributes, NULL); if (hFile == INVALID_HANDLE_VALUE) { @@ -1016,10 +1039,14 @@ void LocalFileSystem::Truncate(FileHandle &handle, int64_t new_size) { } static DWORD WindowsGetFileAttributes(const string &filename) { - auto unicode_path = WindowsUtil::UTF8ToUnicode(filename.c_str()); + auto unicode_path = NormalizePathAndConvertToUnicode(filename); return GetFileAttributesW(unicode_path.c_str()); } +static DWORD WindowsGetFileAttributes(const std::wstring &filename) { + return GetFileAttributesW(filename.c_str()); +} + bool LocalFileSystem::DirectoryExists(const string &directory, optional_ptr opener) { DWORD attrs = WindowsGetFileAttributes(directory); return (attrs != INVALID_FILE_ATTRIBUTES && (attrs & FILE_ATTRIBUTE_DIRECTORY)); @@ -1029,7 +1056,7 
@@ void LocalFileSystem::CreateDirectory(const string &directory, optional_ptr opener) { - auto unicode_path = WindowsUtil::UTF8ToUnicode(filename.c_str()); + auto unicode_path = NormalizePathAndConvertToUnicode(filename); if (!DeleteFileW(unicode_path.c_str())) { auto error = LocalFileSystem::GetLastErrorAsString(); throw IOException("Failed to delete file \"%s\": %s", filename, error); @@ -1072,8 +1099,7 @@ void LocalFileSystem::RemoveFile(const string &filename, optional_ptr &callback, FileOpener *opener) { string search_dir = JoinPath(directory, "*"); - - auto unicode_path = WindowsUtil::UTF8ToUnicode(search_dir.c_str()); + auto unicode_path = NormalizePathAndConvertToUnicode(search_dir); WIN32_FIND_DATAW ffd; HANDLE hFind = FindFirstFileW(unicode_path.c_str(), &ffd); @@ -1106,8 +1132,9 @@ void LocalFileSystem::FileSync(FileHandle &handle) { } void LocalFileSystem::MoveFile(const string &source, const string &target, optional_ptr opener) { - auto source_unicode = WindowsUtil::UTF8ToUnicode(source.c_str()); - auto target_unicode = WindowsUtil::UTF8ToUnicode(target.c_str()); + auto source_unicode = NormalizePathAndConvertToUnicode(source); + auto target_unicode = NormalizePathAndConvertToUnicode(target); + if (!MoveFileW(source_unicode.c_str(), target_unicode.c_str())) { throw IOException("Could not move file: %s", GetLastErrorAsString()); } @@ -1119,7 +1146,8 @@ FileType LocalFileSystem::GetFileType(FileHandle &handle) { if (strncmp(path.c_str(), PIPE_PREFIX, strlen(PIPE_PREFIX)) == 0) { return FileType::FILE_TYPE_FIFO; } - DWORD attrs = WindowsGetFileAttributes(path.c_str()); + auto normalized_path = NormalizePathAndConvertToUnicode(path); + DWORD attrs = WindowsGetFileAttributes(normalized_path); if (attrs != INVALID_FILE_ATTRIBUTES) { if (attrs & FILE_ATTRIBUTE_DIRECTORY) { return FileType::FILE_TYPE_DIR; @@ -1161,9 +1189,10 @@ static bool HasMultipleCrawl(const vector &splits) { return std::count(splits.begin(), splits.end(), "**") > 1; } static bool IsSymbolicLink(const string &path) { + auto normalized_path = LocalFileSystem::NormalizeLocalPath(path); #ifndef _WIN32 struct stat status; - return (lstat(path.c_str(), &status) != -1 && S_ISLNK(status.st_mode)); + return (lstat(normalized_path, &status) != -1 && S_ISLNK(status.st_mode)); #else auto attributes = WindowsGetFileAttributes(path); if (attributes == INVALID_FILE_ATTRIBUTES) @@ -1230,14 +1259,59 @@ vector LocalFileSystem::FetchFileWithoutGlob(const string &path, FileOpe return result; } +// Helper function to handle file:/ URLs +static idx_t GetFileUrlOffset(const string &path) { + if (!StringUtil::StartsWith(path, "file:/")) { + return 0; + } + + // Url without host: file:/some/path + if (path[6] != '/') { +#ifdef _WIN32 + return 6; +#else + return 5; +#endif + } + + // Url with empty host: file:///some/path + if (path[7] == '/') { +#ifdef _WIN32 + return 8; +#else + return 7; +#endif + } + + // Url with localhost: file://localhost/some/path + if (path.compare(7, 10, "localhost/") == 0) { +#ifdef _WIN32 + return 17; +#else + return 16; +#endif + } + + // unknown file:/ url format + return 0; +} + +const char *LocalFileSystem::NormalizeLocalPath(const string &path) { + return path.c_str() + GetFileUrlOffset(path); +} + vector LocalFileSystem::Glob(const string &path, FileOpener *opener) { if (path.empty()) { return vector(); } // split up the path into separate chunks vector splits; + + bool is_file_url = StringUtil::StartsWith(path, "file:/"); + idx_t file_url_path_offset = GetFileUrlOffset(path); + idx_t last_pos = 0; - for 
(idx_t i = 0; i < path.size(); i++) { + for (idx_t i = file_url_path_offset; i < path.size(); i++) { if (path[i] == '\\' || path[i] == '/') { if (i == last_pos) { // empty: skip this position @@ -1245,6 +1319,7 @@ vector LocalFileSystem::Glob(const string &path, FileOpener *opener) { continue; } if (splits.empty()) { + // splits.push_back(path.substr(file_url_path_offset, i-file_url_path_offset)); splits.push_back(path.substr(0, i)); } else { splits.push_back(path.substr(last_pos, i - last_pos)); @@ -1255,10 +1330,10 @@ vector LocalFileSystem::Glob(const string &path, FileOpener *opener) { splits.push_back(path.substr(last_pos, path.size() - last_pos)); // handle absolute paths bool absolute_path = false; - if (path[0] == '/') { + if (IsPathAbsolute(path)) { // first character is a slash - unix absolute path absolute_path = true; - } else if (StringUtil::Contains(splits[0], ":")) { + } else if (StringUtil::Contains(splits[0], ":")) { // TODO: this is weird? shouldn't IsPathAbsolute handle this? // first split has a colon - windows absolute path absolute_path = true; } else if (splits[0] == "~") { @@ -1298,7 +1373,16 @@ vector LocalFileSystem::Glob(const string &path, FileOpener *opener) { throw IOException("Cannot use multiple \'**\' in one path"); } - for (idx_t i = absolute_path ? 1 : 0; i < splits.size(); i++) { + idx_t start_index; + if (is_file_url) { + start_index = 1; + } else if (absolute_path) { + start_index = 1; + } else { + start_index = 0; + } + + for (idx_t i = start_index ? 1 : 0; i < splits.size(); i++) { bool is_last_chunk = i + 1 == splits.size(); bool has_glob = HasGlob(splits[i]); // if it's the last chunk we need to find files, otherwise we find directories diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 5f0897e3d..41709636d 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4230" +#define DUCKDB_PATCH_VERSION "4-dev4241" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4230" +#define DUCKDB_VERSION "v1.1.4-dev4241" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "00cbb18d63" +#define DUCKDB_SOURCE_ID "f850786dde" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/local_file_system.hpp b/src/duckdb/src/include/duckdb/common/local_file_system.hpp index fe4e02ad8..c2d38152a 100644 --- a/src/duckdb/src/include/duckdb/common/local_file_system.hpp +++ b/src/duckdb/src/include/duckdb/common/local_file_system.hpp @@ -96,6 +96,9 @@ class LocalFileSystem : public FileSystem { //! Checks a file is private (checks for 600 on linux/macos, TODO: currently always returns true on windows) static bool IsPrivateFile(const string &path_p, FileOpener *opener); + // returns a C-string of the path that trims any file:/ prefix + static const char *NormalizeLocalPath(const string &path); + private: //! Set the file pointer of a file handle to a specified location. 
Reads and writes will happen from this location void SetFilePointer(FileHandle &handle, idx_t location); From c2d7dd38d314d6394841ae11c35770364f5bdbad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:44:40 +0000 Subject: [PATCH 18/21] vendor: Update vendored sources to duckdb/duckdb@acea8cde22f802b792adc8cfdeea335103edc5fe (#994) Issue duckdb/duckdb#15597: Temporal Error Messages (duckdb/duckdb#15635) Fix an if statement that is always True (duckdb/duckdb#15630) Co-authored-by: krlmlr --- .../core_functions/scalar/date/epoch.cpp | 2 +- src/duckdb/src/common/enum_util.cpp | 27 ++++++- .../src/common/operator/cast_operators.cpp | 33 ++++++--- src/duckdb/src/common/types/date.cpp | 64 +++++++++------- src/duckdb/src/common/types/timestamp.cpp | 74 +++++++++++-------- .../scanner/string_value_scanner.cpp | 5 +- .../csv_scanner/sniffer/type_detection.cpp | 2 +- .../function/table/version/pragma_version.cpp | 6 +- .../src/include/duckdb/common/enum_util.hpp | 8 ++ .../src/include/duckdb/common/types/date.hpp | 13 +++- .../include/duckdb/common/types/timestamp.hpp | 13 ++-- 11 files changed, 164 insertions(+), 83 deletions(-) diff --git a/src/duckdb/extension/core_functions/scalar/date/epoch.cpp b/src/duckdb/extension/core_functions/scalar/date/epoch.cpp index c30530341..cda3232a4 100644 --- a/src/duckdb/extension/core_functions/scalar/date/epoch.cpp +++ b/src/duckdb/extension/core_functions/scalar/date/epoch.cpp @@ -11,7 +11,7 @@ struct EpochSecOperator { static RESULT_TYPE Operation(INPUT_TYPE sec) { int64_t result; if (!TryCast::Operation(sec * Interval::MICROS_PER_SEC, result)) { - throw ConversionException("Could not convert epoch seconds to TIMESTAMP WITH TIME ZONE"); + throw ConversionException("Epoch seconds out of range for TIMESTAMP WITH TIME ZONE"); } return timestamp_t(result); } diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index 6baa6854f..1fd91919b 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -73,6 +73,7 @@ #include "duckdb/common/types/column/column_data_scan_states.hpp" #include "duckdb/common/types/column/partitioned_column_data.hpp" #include "duckdb/common/types/conflict_manager.hpp" +#include "duckdb/common/types/date.hpp" #include "duckdb/common/types/hyperloglog.hpp" #include "duckdb/common/types/row/partitioned_tuple_data.hpp" #include "duckdb/common/types/row/tuple_data_states.hpp" @@ -1009,6 +1010,25 @@ DataFileType EnumUtil::FromString(const char *value) { return static_cast(StringUtil::StringToEnum(GetDataFileTypeValues(), 4, "DataFileType", value)); } +const StringUtil::EnumStringLiteral *GetDateCastResultValues() { + static constexpr StringUtil::EnumStringLiteral values[] { + { static_cast(DateCastResult::SUCCESS), "SUCCESS" }, + { static_cast(DateCastResult::ERROR_INCORRECT_FORMAT), "ERROR_INCORRECT_FORMAT" }, + { static_cast(DateCastResult::ERROR_RANGE), "ERROR_RANGE" } + }; + return values; +} + +template<> +const char* EnumUtil::ToChars(DateCastResult value) { + return StringUtil::EnumToString(GetDateCastResultValues(), 3, "DateCastResult", static_cast(value)); +} + +template<> +DateCastResult EnumUtil::FromString(const char *value) { + return static_cast(StringUtil::StringToEnum(GetDateCastResultValues(), 3, "DateCastResult", value)); +} + const StringUtil::EnumStringLiteral *GetDatePartSpecifierValues() { static constexpr StringUtil::EnumStringLiteral values[] { { 
static_cast(DatePartSpecifier::YEAR), "YEAR" }, @@ -3880,19 +3900,20 @@ const StringUtil::EnumStringLiteral *GetTimestampCastResultValues() { static constexpr StringUtil::EnumStringLiteral values[] { { static_cast(TimestampCastResult::SUCCESS), "SUCCESS" }, { static_cast(TimestampCastResult::ERROR_INCORRECT_FORMAT), "ERROR_INCORRECT_FORMAT" }, - { static_cast(TimestampCastResult::ERROR_NON_UTC_TIMEZONE), "ERROR_NON_UTC_TIMEZONE" } + { static_cast(TimestampCastResult::ERROR_NON_UTC_TIMEZONE), "ERROR_NON_UTC_TIMEZONE" }, + { static_cast(TimestampCastResult::ERROR_RANGE), "ERROR_RANGE" } }; return values; } template<> const char* EnumUtil::ToChars(TimestampCastResult value) { - return StringUtil::EnumToString(GetTimestampCastResultValues(), 3, "TimestampCastResult", static_cast(value)); + return StringUtil::EnumToString(GetTimestampCastResultValues(), 4, "TimestampCastResult", static_cast(value)); } template<> TimestampCastResult EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetTimestampCastResultValues(), 3, "TimestampCastResult", value)); + return static_cast(StringUtil::StringToEnum(GetTimestampCastResultValues(), 4, "TimestampCastResult", value)); } const StringUtil::EnumStringLiteral *GetTransactionModifierTypeValues() { diff --git a/src/duckdb/src/common/operator/cast_operators.cpp b/src/duckdb/src/common/operator/cast_operators.cpp index fe1ce26af..9742c6087 100644 --- a/src/duckdb/src/common/operator/cast_operators.cpp +++ b/src/duckdb/src/common/operator/cast_operators.cpp @@ -1470,8 +1470,16 @@ bool TryCastToUUID::Operation(string_t input, hugeint_t &result, Vector &result_ //===--------------------------------------------------------------------===// template <> bool TryCastErrorMessage::Operation(string_t input, date_t &result, CastParameters ¶meters) { - if (!TryCast::Operation(input, result, parameters.strict)) { - HandleCastError::AssignError(Date::ConversionError(input), parameters); + idx_t pos; + bool special = false; + switch (Date::TryConvertDate(input.GetData(), input.GetSize(), pos, result, special, parameters.strict)) { + case DateCastResult::SUCCESS: + break; + case DateCastResult::ERROR_INCORRECT_FORMAT: + HandleCastError::AssignError(Date::FormatError(input), parameters); + return false; + case DateCastResult::ERROR_RANGE: + HandleCastError::AssignError(Date::RangeError(input), parameters); return false; } return true; @@ -1481,7 +1489,8 @@ template <> bool TryCast::Operation(string_t input, date_t &result, bool strict) { idx_t pos; bool special = false; - return Date::TryConvertDate(input.GetData(), input.GetSize(), pos, result, special, strict); + return Date::TryConvertDate(input.GetData(), input.GetSize(), pos, result, special, strict) == + DateCastResult::SUCCESS; } template <> @@ -1545,14 +1554,18 @@ dtime_tz_t Cast::Operation(string_t input) { //===--------------------------------------------------------------------===// template <> bool TryCastErrorMessage::Operation(string_t input, timestamp_t &result, CastParameters ¶meters) { - auto cast_result = Timestamp::TryConvertTimestamp(input.GetData(), input.GetSize(), result); - if (cast_result == TimestampCastResult::SUCCESS) { + switch (Timestamp::TryConvertTimestamp(input.GetData(), input.GetSize(), result)) { + case TimestampCastResult::SUCCESS: return true; - } - if (cast_result == TimestampCastResult::ERROR_INCORRECT_FORMAT) { - HandleCastError::AssignError(Timestamp::ConversionError(input), parameters); - } else { + case TimestampCastResult::ERROR_INCORRECT_FORMAT: 
+ HandleCastError::AssignError(Timestamp::FormatError(input), parameters); + break; + case TimestampCastResult::ERROR_NON_UTC_TIMEZONE: HandleCastError::AssignError(Timestamp::UnsupportedTimezoneError(input), parameters); + break; + case TimestampCastResult::ERROR_RANGE: + HandleCastError::AssignError(Timestamp::RangeError(input), parameters); + break; } return false; } @@ -1578,7 +1591,7 @@ timestamp_ns_t Cast::Operation(string_t input) { const auto ts = Timestamp::FromCString(input.GetData(), input.GetSize(), &nanos); timestamp_ns_t result; if (!Timestamp::TryFromTimestampNanos(ts, nanos, result)) { - throw ConversionException(Timestamp::ConversionError(input)); + throw ConversionException(Timestamp::RangeError(input)); } return result; } diff --git a/src/duckdb/src/common/types/date.cpp b/src/duckdb/src/common/types/date.cpp index 7c9f78ca8..429ee0311 100644 --- a/src/duckdb/src/common/types/date.cpp +++ b/src/duckdb/src/common/types/date.cpp @@ -205,11 +205,12 @@ bool Date::TryConvertDateSpecial(const char *buf, idx_t len, idx_t &pos, const c return true; } -bool Date::TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result, bool &special, bool strict) { +DateCastResult Date::TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result, bool &special, + bool strict) { special = false; pos = 0; if (len == 0) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } int32_t day = 0; @@ -224,13 +225,13 @@ bool Date::TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result } if (pos >= len) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } if (buf[pos] == '-') { yearneg = true; pos++; if (pos >= len) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } } if (!StringUtil::CharacterIsDigit(buf[pos])) { @@ -240,62 +241,62 @@ bool Date::TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result } else if (TryConvertDateSpecial(buf, len, pos, EPOCH)) { result = date_t::epoch(); } else { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } // skip trailing spaces - parsing must be strict here while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) { pos++; } special = true; - return pos == len; + return (pos == len) ? 
DateCastResult::SUCCESS : DateCastResult::ERROR_INCORRECT_FORMAT; } // first parse the year idx_t year_length = 0; for (; pos < len && StringUtil::CharacterIsDigit(buf[pos]); pos++) { if (year >= 100000000) { - return false; + return DateCastResult::ERROR_RANGE; } year = (buf[pos] - '0') + year * 10; year_length++; } if (year_length < 2 && strict) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } if (yearneg) { year = -year; } if (pos >= len) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } // fetch the separator sep = buf[pos++]; if (sep != ' ' && sep != '-' && sep != '/' && sep != '\\') { // invalid separator - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } // parse the month if (!Date::ParseDoubleDigit(buf, len, pos, month)) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } if (pos >= len) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } if (buf[pos++] != sep) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } if (pos >= len) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } // now parse the day if (!Date::ParseDoubleDigit(buf, len, pos, day)) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } // check for an optional trailing " (BC)"" @@ -303,7 +304,7 @@ bool Date::TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result StringUtil::CharacterToLower(buf[pos + 2]) == 'b' && StringUtil::CharacterToLower(buf[pos + 3]) == 'c' && buf[pos + 4] == ')') { if (yearneg || year == 0) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } year = -year + 1; pos += 5; @@ -317,34 +318,47 @@ bool Date::TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result } // check position. if end was not reached, non-space chars remaining if (pos < len) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } } else { // in non-strict mode, check for any direct trailing digits if (pos < len && StringUtil::CharacterIsDigit(buf[pos])) { - return false; + return DateCastResult::ERROR_INCORRECT_FORMAT; } } - return Date::TryFromDate(year, month, day, result); + return Date::TryFromDate(year, month, day, result) ? 
DateCastResult::SUCCESS : DateCastResult::ERROR_RANGE; } -string Date::ConversionError(const string &str) { - return StringUtil::Format("date field value out of range: \"%s\", " +string Date::FormatError(const string &str) { + return StringUtil::Format("invalid date field format: \"%s\", " "expected format is (YYYY-MM-DD)", str); } -string Date::ConversionError(string_t str) { - return ConversionError(str.GetString()); +string Date::RangeError(const string &str) { + return StringUtil::Format("date field value out of range: \"%s\"", str); +} + +string Date::RangeError(string_t str) { + return RangeError(str.GetString()); +} + +string Date::FormatError(string_t str) { + return FormatError(str.GetString()); } date_t Date::FromCString(const char *buf, idx_t len, bool strict) { date_t result; idx_t pos; bool special = false; - if (!TryConvertDate(buf, len, pos, result, special, strict)) { - throw ConversionException(ConversionError(string(buf, len))); + switch (TryConvertDate(buf, len, pos, result, special, strict)) { + case DateCastResult::ERROR_INCORRECT_FORMAT: + throw ConversionException(FormatError(string(buf, len))); + case DateCastResult::ERROR_RANGE: + throw ConversionException(RangeError(string(buf, len))); + case DateCastResult::SUCCESS: + break; } return result; } diff --git a/src/duckdb/src/common/types/timestamp.cpp b/src/duckdb/src/common/types/timestamp.cpp index f90c0f77f..d596e7530 100644 --- a/src/duckdb/src/common/types/timestamp.cpp +++ b/src/duckdb/src/common/types/timestamp.cpp @@ -64,25 +64,31 @@ timestamp_t ×tamp_t::operator-=(const int64_t &delta) { return *this; } -bool Timestamp::TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &result, bool &has_offset, string_t &tz, - optional_ptr nanos) { +TimestampCastResult Timestamp::TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &result, bool &has_offset, + string_t &tz, optional_ptr nanos) { idx_t pos; date_t date; dtime_t time; has_offset = false; - if (!Date::TryConvertDate(str, len, pos, date, has_offset)) { - return false; + switch (Date::TryConvertDate(str, len, pos, date, has_offset)) { + case DateCastResult::ERROR_INCORRECT_FORMAT: + return TimestampCastResult::ERROR_INCORRECT_FORMAT; + case DateCastResult::ERROR_RANGE: + return TimestampCastResult::ERROR_RANGE; + default: + break; } if (pos == len) { // no time: only a date or special if (date == date_t::infinity()) { result = timestamp_t::infinity(); - return true; + return TimestampCastResult::SUCCESS; } else if (date == date_t::ninfinity()) { result = timestamp_t::ninfinity(); - return true; + return TimestampCastResult::SUCCESS; } - return Timestamp::TryFromDatetime(date, dtime_t(0), result); + return Timestamp::TryFromDatetime(date, dtime_t(0), result) ? TimestampCastResult::SUCCESS + : TimestampCastResult::ERROR_RANGE; } // try to parse a time field if (str[pos] == ' ' || str[pos] == 'T') { @@ -93,15 +99,15 @@ bool Timestamp::TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &r // operation. Note that we can't pass strict== true here because we // want to process any suffix. if (!Time::TryConvertInterval(str + pos, len - pos, time_pos, time, false, nanos)) { - return false; + return TimestampCastResult::ERROR_INCORRECT_FORMAT; } // We parsed an interval, so make sure it is in range. 
if (time.micros > Interval::MICROS_PER_DAY) { - return false; + return TimestampCastResult::ERROR_RANGE; } pos += time_pos; if (!Timestamp::TryFromDatetime(date, time, result)) { - return false; + return TimestampCastResult::ERROR_RANGE; } if (pos < len) { // skip a "Z" at the end (as per the ISO8601 specs) @@ -112,13 +118,13 @@ bool Timestamp::TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &r } else if (Timestamp::TryParseUTCOffset(str, pos, len, hour_offset, minute_offset)) { const int64_t delta = hour_offset * Interval::MICROS_PER_HOUR + minute_offset * Interval::MICROS_PER_MINUTE; if (!TrySubtractOperator::Operation(result.value, delta, result.value)) { - return false; + return TimestampCastResult::ERROR_RANGE; } has_offset = true; } else { // Parse a time zone: / [A-Za-z0-9/_]+/ if (str[pos++] != ' ') { - return false; + return TimestampCastResult::ERROR_NON_UTC_TIMEZONE; } auto tz_name = str + pos; for (; pos < len && CharacterIsTimeZone(str[pos]); ++pos) { @@ -136,10 +142,10 @@ bool Timestamp::TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &r pos++; } if (pos < len) { - return false; + return TimestampCastResult::ERROR_INCORRECT_FORMAT; } } - return true; + return TimestampCastResult::SUCCESS; } TimestampCastResult Timestamp::TryConvertTimestamp(const char *str, idx_t len, timestamp_t &result, @@ -148,8 +154,8 @@ TimestampCastResult Timestamp::TryConvertTimestamp(const char *str, idx_t len, t bool has_offset = false; // We don't understand TZ without an extension, so fail if one was provided. auto success = TryConvertTimestampTZ(str, len, result, has_offset, tz, nanos); - if (!success) { - return TimestampCastResult::ERROR_INCORRECT_FORMAT; + if (success != TimestampCastResult::SUCCESS) { + return success; } if (tz.GetSize() == 0) { // no timezone provided - success! 
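
A minimal caller-side sketch (illustrative only, not part of the vendored diff): with the fourth enum value, code that consumes TryConvertTimestamp can map each result code to the matching message helper instead of collapsing format and range failures into one error. Every name used here is declared in the patched timestamp.hpp further down.

// Sketch: fanning out over the four TimestampCastResult codes (duckdb types assumed in scope).
string ExplainTimestampFailure(const char *str, idx_t len) {
	timestamp_t ts;
	switch (Timestamp::TryConvertTimestamp(str, len, ts)) {
	case TimestampCastResult::SUCCESS:
		return string(); // parsed fine, nothing to report
	case TimestampCastResult::ERROR_INCORRECT_FORMAT:
		return Timestamp::FormatError(string(str, len)); // malformed input text
	case TimestampCastResult::ERROR_NON_UTC_TIMEZONE:
		return Timestamp::UnsupportedTimezoneError(string(str, len)); // needs the ICU extension
	case TimestampCastResult::ERROR_RANGE:
		return Timestamp::RangeError(string(str, len)); // well-formed, but out of range
	}
	return string();
}
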
@@ -195,8 +201,8 @@ TimestampCastResult Timestamp::TryConvertTimestamp(const char *str, idx_t len, t return TimestampCastResult::SUCCESS; } -string Timestamp::ConversionError(const string &str) { - return StringUtil::Format("timestamp field value out of range: \"%s\", " +string Timestamp::FormatError(const string &str) { + return StringUtil::Format("invalid timestamp field format: \"%s\", " "expected format is (YYYY-MM-DD HH:MM:SS[.US][±HH:MM| ZONE])", str); } @@ -207,25 +213,35 @@ string Timestamp::UnsupportedTimezoneError(const string &str) { str); } -string Timestamp::ConversionError(string_t str) { - return Timestamp::ConversionError(str.GetString()); +string Timestamp::RangeError(const string &str) { + return StringUtil::Format("timestamp field value out of range: \"%s\"", str); +} + +string Timestamp::FormatError(string_t str) { + return Timestamp::FormatError(str.GetString()); } string Timestamp::UnsupportedTimezoneError(string_t str) { return Timestamp::UnsupportedTimezoneError(str.GetString()); } +string Timestamp::RangeError(string_t str) { + return Timestamp::RangeError(str.GetString()); +} + timestamp_t Timestamp::FromCString(const char *str, idx_t len, optional_ptr nanos) { timestamp_t result; - auto cast_result = Timestamp::TryConvertTimestamp(str, len, result, nanos); - if (cast_result == TimestampCastResult::SUCCESS) { - return result; - } - if (cast_result == TimestampCastResult::ERROR_NON_UTC_TIMEZONE) { - throw ConversionException(Timestamp::UnsupportedTimezoneError(string(str, len))); - } else { - throw ConversionException(Timestamp::ConversionError(string(str, len))); + switch (Timestamp::TryConvertTimestamp(str, len, result, nanos)) { + case TimestampCastResult::SUCCESS: + break; + case TimestampCastResult::ERROR_NON_UTC_TIMEZONE: + throw ConversionException(UnsupportedTimezoneError(string(str, len))); + case TimestampCastResult::ERROR_INCORRECT_FORMAT: + throw ConversionException(FormatError(string(str, len))); + case TimestampCastResult::ERROR_RANGE: + throw ConversionException(RangeError(string(str, len))); } + return result; } bool Timestamp::TryParseUTCOffset(const char *str, idx_t &pos, idx_t len, int &hour_offset, int &minute_offset) { @@ -338,7 +354,7 @@ bool Timestamp::TryFromDatetime(date_t date, dtime_tz_t timetz, timestamp_t &res timestamp_t Timestamp::FromDatetime(date_t date, dtime_t time) { timestamp_t result; if (!TryFromDatetime(date, time, result)) { - throw ConversionException("Overflow exception in date/time -> timestamp conversion"); + throw ConversionException("Date and time not in timestamp range"); } return result; } diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index aeb305a55..c1ab79dfe 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -334,8 +334,9 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size } else { idx_t pos; bool special; - success = Date::TryConvertDate( - value_ptr, size, pos, static_cast(vector_ptr[chunk_col_id])[number_of_rows], special, false); + success = Date::TryConvertDate(value_ptr, size, pos, + static_cast(vector_ptr[chunk_col_id])[number_of_rows], special, + false) == DateCastResult::SUCCESS; } break; } diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp 
b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index 49bf105a9..d6b2b1a6b 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -162,7 +162,7 @@ bool CSVSniffer::CanYouCastIt(ClientContext &context, const string_t value, cons idx_t pos; bool special; date_t dummy_value; - return Date::TryConvertDate(value_ptr, value_size, pos, dummy_value, special, true); + return Date::TryConvertDate(value_ptr, value_size, pos, dummy_value, special, true) == DateCastResult::SUCCESS; } case LogicalTypeId::TIMESTAMP: { timestamp_t dummy_value; diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 41709636d..c61c54141 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4241" +#define DUCKDB_PATCH_VERSION "4-dev4248" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4241" +#define DUCKDB_VERSION "v1.1.4-dev4248" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "f850786dde" +#define DUCKDB_SOURCE_ID "acea8cde22" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/enum_util.hpp b/src/duckdb/src/include/duckdb/common/enum_util.hpp index 6215e10fd..852f43436 100644 --- a/src/duckdb/src/include/duckdb/common/enum_util.hpp +++ b/src/duckdb/src/include/duckdb/common/enum_util.hpp @@ -116,6 +116,8 @@ enum class CopyToType : uint8_t; enum class DataFileType : uint8_t; +enum class DateCastResult : uint8_t; + enum class DatePartSpecifier : uint8_t; enum class DebugInitialize : uint8_t; @@ -507,6 +509,9 @@ const char* EnumUtil::ToChars(CopyToType value); template<> const char* EnumUtil::ToChars(DataFileType value); +template<> +const char* EnumUtil::ToChars(DateCastResult value); + template<> const char* EnumUtil::ToChars(DatePartSpecifier value); @@ -1030,6 +1035,9 @@ CopyToType EnumUtil::FromString(const char *value); template<> DataFileType EnumUtil::FromString(const char *value); +template<> +DateCastResult EnumUtil::FromString(const char *value); + template<> DatePartSpecifier EnumUtil::FromString(const char *value); diff --git a/src/duckdb/src/include/duckdb/common/types/date.hpp b/src/duckdb/src/include/duckdb/common/types/date.hpp index f2cb08d50..ea6b52ce7 100644 --- a/src/duckdb/src/include/duckdb/common/types/date.hpp +++ b/src/duckdb/src/include/duckdb/common/types/date.hpp @@ -83,6 +83,8 @@ struct date_t { // NOLINT } // NOLINT }; +enum class DateCastResult : uint8_t { SUCCESS, ERROR_INCORRECT_FORMAT, ERROR_RANGE }; + //! The Date class is a static class that holds helper functions for the Date type. class Date { public: @@ -127,8 +129,8 @@ class Date { DUCKDB_API static bool TryConvertDateSpecial(const char *buf, idx_t len, idx_t &pos, const char *special); //! Try to convert text in a buffer to a date; returns true if parsing was successful //! If the date was a "special" value, the special flag will be set. 
- DUCKDB_API static bool TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result, bool &special, - bool strict = false); + DUCKDB_API static DateCastResult TryConvertDate(const char *buf, idx_t len, idx_t &pos, date_t &result, + bool &special, bool strict = false); //! Create a string "YYYY-MM-DD" from a specified (year, month, day) //! combination @@ -202,8 +204,11 @@ class Date { //! Helper function to parse two digits from a string (e.g. "30" -> 30, "03" -> 3, "3" -> 3) DUCKDB_API static bool ParseDoubleDigit(const char *buf, idx_t len, idx_t &pos, int32_t &result); - DUCKDB_API static string ConversionError(const string &str); - DUCKDB_API static string ConversionError(string_t str); + DUCKDB_API static string FormatError(const string &str); + DUCKDB_API static string FormatError(string_t str); + + DUCKDB_API static string RangeError(const string &str); + DUCKDB_API static string RangeError(string_t str); private: static void ExtractYearOffset(int32_t &n, int32_t &year, int32_t &year_offset); diff --git a/src/duckdb/src/include/duckdb/common/types/timestamp.hpp b/src/duckdb/src/include/duckdb/common/types/timestamp.hpp index eaab803e8..c56168251 100644 --- a/src/duckdb/src/include/duckdb/common/types/timestamp.hpp +++ b/src/duckdb/src/include/duckdb/common/types/timestamp.hpp @@ -112,7 +112,7 @@ struct timestamp_tz_t : public timestamp_t { // NOLINT } }; -enum class TimestampCastResult : uint8_t { SUCCESS, ERROR_INCORRECT_FORMAT, ERROR_NON_UTC_TIMEZONE }; +enum class TimestampCastResult : uint8_t { SUCCESS, ERROR_INCORRECT_FORMAT, ERROR_NON_UTC_TIMEZONE, ERROR_RANGE }; //! The static Timestamp class holds helper functions for the timestamp types. class Timestamp { @@ -128,8 +128,9 @@ class Timestamp { //! Convert a string where the offset can also be a time zone string: / [A_Za-z0-9/_]+/ //! If has_offset is true, then the result is an instant that was offset from UTC //! 
If the tz is not empty, the result is still an instant, but the parts can be extracted and applied to the TZ - DUCKDB_API static bool TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &result, bool &has_offset, - string_t &tz, optional_ptr nanos = nullptr); + DUCKDB_API static TimestampCastResult TryConvertTimestampTZ(const char *str, idx_t len, timestamp_t &result, + bool &has_offset, string_t &tz, + optional_ptr nanos = nullptr); DUCKDB_API static TimestampCastResult TryConvertTimestamp(const char *str, idx_t len, timestamp_t &result, optional_ptr nanos = nullptr); DUCKDB_API static TimestampCastResult TryConvertTimestamp(const char *str, idx_t len, timestamp_ns_t &result); @@ -200,10 +201,12 @@ class Timestamp { DUCKDB_API static bool TryParseUTCOffset(const char *str, idx_t &pos, idx_t len, int &hour_offset, int &minute_offset); - DUCKDB_API static string ConversionError(const string &str); - DUCKDB_API static string ConversionError(string_t str); + DUCKDB_API static string FormatError(const string &str); + DUCKDB_API static string FormatError(string_t str); DUCKDB_API static string UnsupportedTimezoneError(const string &str); DUCKDB_API static string UnsupportedTimezoneError(string_t str); + DUCKDB_API static string RangeError(const string &str); + DUCKDB_API static string RangeError(string_t str); }; } // namespace duckdb From 7b9fa6163282a77fe14025ce5390c3f2f7d65985 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 20:08:09 +0000 Subject: [PATCH 19/21] vendor: Update vendored sources to duckdb/duckdb@2c203c13b52f8d6073f5e03f3226eaadf0973654 (#996) [Tidy] create index info in static function for reusability (duckdb/duckdb#15633) Parallel `union_by_name` for `read_json` (duckdb/duckdb#15593) Co-authored-by: krlmlr --- .../function/table/version/pragma_version.cpp | 6 ++-- .../expression_binder/index_binder.hpp | 2 ++ .../expression_binder/index_binder.cpp | 33 ++++++++++--------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index c61c54141..900d5146e 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4248" +#define DUCKDB_PATCH_VERSION "4-dev4253" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4248" +#define DUCKDB_VERSION "v1.1.4-dev4253" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "acea8cde22" +#define DUCKDB_SOURCE_ID "2c203c13b5" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/planner/expression_binder/index_binder.hpp b/src/duckdb/src/include/duckdb/planner/expression_binder/index_binder.hpp index b15948b34..f8972d13e 100644 --- a/src/duckdb/src/include/duckdb/planner/expression_binder/index_binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/expression_binder/index_binder.hpp @@ -30,6 +30,8 @@ class IndexBinder : public ExpressionBinder { TableCatalogEntry &table_entry, unique_ptr plan, unique_ptr alter_table_info); + static void InitCreateIndexInfo(LogicalGet &get, CreateIndexInfo &info, const string &schema); + protected: BindResult 
BindExpression(unique_ptr &expr_ptr, idx_t depth, bool root_expression = false) override; diff --git a/src/duckdb/src/planner/expression_binder/index_binder.cpp b/src/duckdb/src/planner/expression_binder/index_binder.cpp index 1d9d46bbc..950edb6e6 100644 --- a/src/duckdb/src/planner/expression_binder/index_binder.cpp +++ b/src/duckdb/src/planner/expression_binder/index_binder.cpp @@ -48,6 +48,23 @@ unique_ptr IndexBinder::BindIndex(const UnboundIndex &unbound_index) return index_type->create_instance(input); } +void IndexBinder::InitCreateIndexInfo(LogicalGet &get, CreateIndexInfo &info, const string &schema) { + auto &column_ids = get.GetColumnIds(); + for (auto &column_id : column_ids) { + if (column_id.IsRowIdColumn()) { + throw BinderException("cannot create an index on the rowid"); + } + auto col_id = column_id.GetPrimaryIndex(); + info.column_ids.push_back(col_id); + info.scan_types.push_back(get.returned_types[col_id]); + } + + info.scan_types.emplace_back(LogicalType::ROW_TYPE); + info.names = get.names; + info.schema = schema; + get.AddColumnId(COLUMN_IDENTIFIER_ROW_ID); +} + unique_ptr IndexBinder::BindCreateIndex(ClientContext &context, unique_ptr create_index_info, TableCatalogEntry &table_entry, @@ -70,23 +87,9 @@ unique_ptr IndexBinder::BindCreateIndex(ClientContext &context, } auto &get = plan->Cast(); - auto &column_ids = get.GetColumnIds(); - for (auto &column_id : column_ids) { - if (column_id.IsRowIdColumn()) { - throw BinderException("cannot create an index on the rowid"); - } - auto col_id = column_id.GetPrimaryIndex(); - create_index_info->column_ids.push_back(col_id); - create_index_info->scan_types.push_back(get.returned_types[col_id]); - } - - create_index_info->scan_types.emplace_back(LogicalType::ROW_TYPE); - create_index_info->names = get.names; - create_index_info->schema = table_entry.schema.name; - + InitCreateIndexInfo(get, *create_index_info, table_entry.schema.name); auto &bind_data = get.bind_data->Cast(); bind_data.is_create_index = true; - get.AddColumnId(COLUMN_IDENTIFIER_ROW_ID); auto result = make_uniq(std::move(create_index_info), std::move(expressions), table_entry, std::move(alter_table_info)); From 1189fa7784dcbfa3c4e7318a79242843904ec59c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 20:36:52 +0000 Subject: [PATCH 20/21] vendor: Update vendored sources to duckdb/duckdb@d3444ef1413102e5c8246a066b201c59ac5c4c04 (#997) [Arrow] Fix a bug related to ArrowArray lifetimes in the arrow scan code (duckdb/duckdb#15632) Co-authored-by: krlmlr --- src/duckdb/src/function/cast/struct_cast.cpp | 1 + src/duckdb/src/function/table/arrow_conversion.cpp | 10 +++++++++- .../src/function/table/version/pragma_version.cpp | 6 +++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/duckdb/src/function/cast/struct_cast.cpp b/src/duckdb/src/function/cast/struct_cast.cpp index e5902b752..051581c54 100644 --- a/src/duckdb/src/function/cast/struct_cast.cpp +++ b/src/duckdb/src/function/cast/struct_cast.cpp @@ -128,6 +128,7 @@ static bool StructToStructCast(Vector &source, Vector &result, idx_t count, Cast source.Flatten(count); auto &result_validity = FlatVector::Validity(result); result_validity = FlatVector::Validity(source); + result.Verify(count); return all_converted; } diff --git a/src/duckdb/src/function/table/arrow_conversion.cpp b/src/duckdb/src/function/table/arrow_conversion.cpp index 18daff2b9..e09f81aac 100644 --- 
a/src/duckdb/src/function/table/arrow_conversion.cpp +++ b/src/duckdb/src/function/table/arrow_conversion.cpp @@ -725,6 +725,9 @@ static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, const ArrowArray &a D_ASSERT(vector.GetType() == values_type.GetDuckType()); auto &scan_state = array_state.state; + if (vector.GetBuffer()) { + vector.GetBuffer()->SetAuxiliaryData(make_uniq(array_state.owned_data)); + } D_ASSERT(run_ends_array.length == values_array.length); auto compressed_size = NumericCast(run_ends_array.length); @@ -766,6 +769,9 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca auto &scan_state = array_state.state; D_ASSERT(!array.dictionary); + if (vector.GetBuffer()) { + vector.GetBuffer()->SetAuxiliaryData(make_uniq(array_state.owned_data)); + } switch (vector.GetType().id()) { case LogicalTypeId::SQLNULL: vector.Reference(Value()); @@ -1284,6 +1290,9 @@ static bool CanContainNull(const ArrowArray &array, const ValidityMask *parent_m static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset, const ValidityMask *parent_mask, uint64_t parent_offset) { + if (vector.GetBuffer()) { + vector.GetBuffer()->SetAuxiliaryData(make_uniq(array_state.owned_data)); + } D_ASSERT(arrow_type.HasDictionary()); auto &scan_state = array_state.state; const bool has_nulls = CanContainNull(array, parent_mask); @@ -1384,7 +1393,6 @@ void ArrowTableFunction::ArrowToDuckDB(ArrowScanLocalState &scan_state, const ar if (!array_state.owned_data) { array_state.owned_data = scan_state.chunk; } - output.data[idx].GetBuffer()->SetAuxiliaryData(make_uniq(array_state.owned_data)); auto array_physical_type = GetArrowArrayPhysicalType(arrow_type); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 900d5146e..ad6011e7f 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4253" +#define DUCKDB_PATCH_VERSION "4-dev4255" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4253" +#define DUCKDB_VERSION "v1.1.4-dev4255" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "2c203c13b5" +#define DUCKDB_SOURCE_ID "d3444ef141" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" From d732c1228fc2825c7187acab20a5167734c42d90 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 21:07:01 +0000 Subject: [PATCH 21/21] vendor: Update vendored sources to duckdb/duckdb@d707b4432b74b51f8a176c533b98e24d48f4d165 (#998) Not using Random Device in DuckDB Core (duckdb/duckdb#15540) Co-authored-by: krlmlr --- src/duckdb/src/common/random_engine.cpp | 18 +++++++++++++++++- src/duckdb/src/common/string_util.cpp | 9 +++------ .../function/table/version/pragma_version.cpp | 6 +++--- src/duckdb/third_party/httplib/httplib.hpp | 17 +++-------------- src/duckdb/third_party/skiplist/SkipList.h | 1 - 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/duckdb/src/common/random_engine.cpp b/src/duckdb/src/common/random_engine.cpp index 7741e8f4d..23eb253e3 100644 --- 
a/src/duckdb/src/common/random_engine.cpp +++ b/src/duckdb/src/common/random_engine.cpp @@ -1,8 +1,13 @@ #include "duckdb/common/random_engine.hpp" #include "duckdb/common/numeric_utils.hpp" #include "pcg_random.hpp" -#include +#ifdef __linux__ +#include +#include +#else +#include +#endif namespace duckdb { struct RandomState { @@ -14,7 +19,18 @@ struct RandomState { RandomEngine::RandomEngine(int64_t seed) : random_state(make_uniq()) { if (seed < 0) { +#ifdef __linux__ + idx_t random_seed; + auto result = syscall(SYS_getrandom, &random_seed, sizeof(random_seed), 0); + if (result == -1) { + // Something went wrong with the syscall, we use chrono + const auto now = std::chrono::high_resolution_clock::now(); + random_seed = now.time_since_epoch().count(); + } + random_state->pcg.seed(random_seed); +#else random_state->pcg.seed(pcg_extras::seed_seq_from()); +#endif } else { random_state->pcg.seed(NumericCast(seed)); } diff --git a/src/duckdb/src/common/string_util.cpp b/src/duckdb/src/common/string_util.cpp index f1bd1e038..f96e3d64e 100644 --- a/src/duckdb/src/common/string_util.cpp +++ b/src/duckdb/src/common/string_util.cpp @@ -6,6 +6,7 @@ #include "duckdb/common/to_string.hpp" #include "duckdb/common/helper.hpp" #include "duckdb/common/exception/parser_exception.hpp" +#include "duckdb/common/random_engine.hpp" #include "jaro_winkler.hpp" #include "utf8proc_wrapper.hpp" @@ -16,7 +17,6 @@ #include #include #include -#include #include #include "yyjson.hpp" @@ -26,13 +26,10 @@ using namespace duckdb_yyjson; // NOLINT namespace duckdb { string StringUtil::GenerateRandomName(idx_t length) { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> dis(0, 15); - + RandomEngine engine; std::stringstream ss; for (idx_t i = 0; i < length; i++) { - ss << "0123456789abcdef"[dis(gen)]; + ss << "0123456789abcdef"[engine.NextRandomInteger(0, 15)]; } return ss.str(); } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index ad6011e7f..a98a327c6 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev4255" +#define DUCKDB_PATCH_VERSION "4-dev4271" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev4255" +#define DUCKDB_VERSION "v1.1.4-dev4271" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "d3444ef141" +#define DUCKDB_SOURCE_ID "d707b4432b" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/third_party/httplib/httplib.hpp b/src/duckdb/third_party/httplib/httplib.hpp index c7da982b7..6432d5d1a 100644 --- a/src/duckdb/third_party/httplib/httplib.hpp +++ b/src/duckdb/third_party/httplib/httplib.hpp @@ -237,7 +237,6 @@ using socket_t = int; #include #include #include -#include #include #include #include @@ -246,9 +245,8 @@ using socket_t = int; #include #include #include -#include - #include "duckdb/common/re2_regex.hpp" +#include "duckdb/common/random_engine.hpp" #ifdef CPPHTTPLIB_OPENSSL_SUPPORT #ifdef _WIN32 @@ -4646,19 +4644,10 @@ inline std::string make_multipart_data_boundary() { static const char data[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - // std::random_device might actually be deterministic on some - 
// platforms, but due to lack of support in the c++ standard library, - // doing better requires either some ugly hacks or breaking portability. - std::random_device seed_gen; - - // Request 128 bits of entropy for initialization - std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), seed_gen()}; - std::mt19937 engine(seed_sequence); - std::string result = "--cpp-httplib-multipart-data-"; - + duckdb::RandomEngine engine; for (auto i = 0; i < 16; i++) { - result += data[engine() % (sizeof(data) - 1)]; + result += data[engine.NextRandomInteger32(0,sizeof(data) - 1)]; } return result; diff --git a/src/duckdb/third_party/skiplist/SkipList.h b/src/duckdb/third_party/skiplist/SkipList.h index 5cf27c204..5b015acc3 100644 --- a/src/duckdb/third_party/skiplist/SkipList.h +++ b/src/duckdb/third_party/skiplist/SkipList.h @@ -446,7 +446,6 @@ #include #include // Used for HeadNode::_lacksIntegrityNodeReferencesNotInList() #include // Used for class Exception -#include #include "pcg_random.hpp" #ifdef DEBUG
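
The seeding strategy in the random_engine.cpp hunk above, ask the kernel via SYS_getrandom and only fall back to a high-resolution clock if the syscall fails, can be read in isolation. The following is a self-contained sketch of that pattern (Linux-only, illustrative; it mirrors the vendored change but is not duckdb code). It sidesteps std::random_device, which the removed httplib comment notes can be deterministic on some platforms.

#include <chrono>
#include <cstdint>
#include <sys/syscall.h>
#include <unistd.h>

// Returns entropy from the kernel when available, a clock-derived value otherwise.
static uint64_t AcquireSeed() {
	uint64_t seed = 0;
	if (syscall(SYS_getrandom, &seed, sizeof(seed), 0) == -1) {
		// getrandom unavailable or interrupted: fall back to the clock,
		// just as the vendored RandomEngine constructor does
		seed = static_cast<uint64_t>(
		    std::chrono::high_resolution_clock::now().time_since_epoch().count());
	}
	return seed;
}
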