From 992b56937a6323b4f554fc078a52ca61b03d5127 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 11 Feb 2026 21:35:14 -0500 Subject: [PATCH 1/2] perf: Optimize concat() UDF This commit implements three optimizations: * In `StringViewArrayBuilder`, we re-allocated `block` after every call to `append_offset`. It is cheaper to instead clear and re-use `block`. * In `StringViewArrayBuilder::write()`, we re-validated that a string array consists of valid UTF8 characters. This was unnecessary work and can be skipped. * In the concat() UDF implementation, we miscalculated the initial size of the StringViewArrayBuilder buffer. This didn't lead to incorrect behavior but it resulted in unnecessarily needing to reallocate the buffer. --- datafusion/functions/src/string/concat.rs | 2 +- datafusion/functions/src/string/concat_ws.rs | 6 +---- datafusion/functions/src/strings.rs | 23 ++++++-------------- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 9e565342bafbc..73cfe2870c0b7 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -207,7 +207,7 @@ impl ScalarUDFImpl for ConcatFunc { DataType::Utf8View => { let string_array = as_string_view_array(array)?; - data_size += string_array.len(); + data_size += string_array.total_buffer_bytes_used(); let column = if array.is_nullable() { ColumnarValueRef::NullableStringViewArray(string_array) } else { diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index b08799f434aa6..6e773dd5b360d 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -221,11 +221,7 @@ impl ScalarUDFImpl for ConcatWsFunc { DataType::Utf8View => { let string_array = as_string_view_array(array)?; - data_size += string_array - .data_buffers() - .iter() - .map(|buf| buf.len()) - .sum::<usize>(); + data_size 
+= string_array.total_buffer_bytes_used(); let column = if array.is_nullable() { ColumnarValueRef::NullableStringViewArray(string_array) } else { diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index a7be3ef792994..cfddf57b094b5 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -152,43 +152,34 @@ impl StringViewArrayBuilder { } ColumnarValueRef::NullableArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.block.push_str( - std::str::from_utf8(array.value(i).as_bytes()).unwrap(), - ); + self.block.push_str(array.value(i)); } } ColumnarValueRef::NullableLargeStringArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.block.push_str( - std::str::from_utf8(array.value(i).as_bytes()).unwrap(), - ); + self.block.push_str(array.value(i)); } } ColumnarValueRef::NullableStringViewArray(array) => { if !CHECK_VALID || array.is_valid(i) { - self.block.push_str( - std::str::from_utf8(array.value(i).as_bytes()).unwrap(), - ); + self.block.push_str(array.value(i)); } } ColumnarValueRef::NonNullableArray(array) => { - self.block - .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block.push_str(array.value(i)); } ColumnarValueRef::NonNullableLargeStringArray(array) => { - self.block - .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block.push_str(array.value(i)); } ColumnarValueRef::NonNullableStringViewArray(array) => { - self.block - .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap()); + self.block.push_str(array.value(i)); } } } pub fn append_offset(&mut self) { self.builder.append_value(&self.block); - self.block = String::new(); + self.block.clear(); } pub fn finish(mut self) -> StringViewArray { From 88b70bf8ec171b3d31a13d819a94a550d7f8558d Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 18 Feb 2026 09:53:38 -0500 Subject: [PATCH 2/2] Add comment for total_buffer_bytes_used() undercount --- 
datafusion/functions/src/string/concat.rs | 2 ++ datafusion/functions/src/string/concat_ws.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 73cfe2870c0b7..6e565f5690790 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -207,6 +207,8 @@ impl ScalarUDFImpl for ConcatFunc { DataType::Utf8View => { let string_array = as_string_view_array(array)?; + // This is an estimate; in particular, it will + // undercount arrays of short strings (<= 12 bytes). data_size += string_array.total_buffer_bytes_used(); let column = if array.is_nullable() { ColumnarValueRef::NullableStringViewArray(string_array) diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index 6e773dd5b360d..79a5065dc1a87 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -221,6 +221,8 @@ impl ScalarUDFImpl for ConcatWsFunc { DataType::Utf8View => { let string_array = as_string_view_array(array)?; + // This is an estimate; in particular, it will + // undercount arrays of short strings (<= 12 bytes). data_size += string_array.total_buffer_bytes_used(); let column = if array.is_nullable() { ColumnarValueRef::NullableStringViewArray(string_array)