From 266550b6fc9b8e28b79ad83733b39d957283c8f6 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 15 Feb 2026 13:19:26 -0500 Subject: [PATCH 01/22] Revise benchmark for array_has() The previous implementation tested the cost of building an array_has() `Expr` (!), not actually evaluating the array_has() operation itself. Refactor things along the way. --- .../functions-nested/benches/array_has.rs | 676 +++++++++++------- 1 file changed, 426 insertions(+), 250 deletions(-) diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs index d96f26d410dd0..302ef9168650c 100644 --- a/datafusion/functions-nested/benches/array_has.rs +++ b/datafusion/functions-nested/benches/array_has.rs @@ -15,19 +15,31 @@ // specific language governing permissions and limitations // under the License. +use arrow::array::{ArrayRef, Int64Array, ListArray, StringArray}; +use arrow::buffer::OffsetBuffer; +use arrow::datatypes::{DataType, Field}; use criterion::{ criterion_group, criterion_main, {BenchmarkId, Criterion}, }; -use datafusion_expr::lit; -use datafusion_functions_nested::expr_fn::{ - array_has, array_has_all, array_has_any, make_array, -}; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::array_has::{ArrayHas, ArrayHasAll, ArrayHasAny}; +use rand::Rng; +use rand::SeedableRng; +use rand::rngs::StdRng; use std::hint::black_box; +use std::sync::Arc; + +const NUM_ROWS: usize = 10000; +const SEED: u64 = 42; +const NULL_DENSITY: f64 = 0.1; +const NEEDLE_SIZE: usize = 3; // If not explicitly stated, `array` and `array_size` refer to the haystack array. 
fn criterion_benchmark(c: &mut Criterion) { // Test different array sizes - let array_sizes = vec![1, 10, 100, 1000, 10000]; + let array_sizes = vec![10, 100, 500]; for &size in &array_sizes { bench_array_has(c, size); @@ -39,50 +51,65 @@ fn criterion_benchmark(c: &mut Criterion) { bench_array_has_strings(c); bench_array_has_all_strings(c); bench_array_has_any_strings(c); - - // Edge cases - bench_array_has_edge_cases(c); } fn bench_array_has(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_i64"); - - // Benchmark: element found at beginning - group.bench_with_input( - BenchmarkId::new("found_at_start", array_size), - &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle = lit(0_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }, - ); - - // Benchmark: element found at end + let list_array = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + let arg_fields: Vec> = vec![ + Field::new("arr", list_array.data_type().clone(), false).into(), + Field::new("el", DataType::Int64, false).into(), + ]; + + // Benchmark: element found + let args_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ]; group.bench_with_input( - BenchmarkId::new("found_at_end", array_size), + BenchmarkId::new("found", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle = lit((size - 1) as i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + 
number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); // Benchmark: element not found + let args_not_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Int64(Some(-999))), + ]; group.bench_with_input( BenchmarkId::new("not_found", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle = lit(-1_i64); // Not in array - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_not_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); @@ -91,45 +118,65 @@ fn bench_array_has(c: &mut Criterion, array_size: usize) { fn bench_array_has_all(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_all"); + let haystack = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; // Benchmark: all elements found (small needle) + let needle_found = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_found = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_found), + ]; group.bench_with_input( BenchmarkId::new("all_found_small_needle", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = 
make_array(vec![lit(0_i64), lit(1_i64), lit(2_i64)]); - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); - // Benchmark: all elements found (medium needle - 10% of haystack) + // Benchmark: not all found (needle contains elements outside haystack range) + let needle_missing = + create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64); + let args_missing = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_missing), + ]; group.bench_with_input( - BenchmarkId::new("all_found_medium_needle", array_size), + BenchmarkId::new("not_all_found", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_size = (size / 10).max(1); - let needle = (0..needle_size).map(|i| lit(i as i64)).collect::>(); - let needle_array = make_array(needle); - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) - }, - ); - - // Benchmark: not all found (early exit) - group.bench_with_input( - BenchmarkId::new("early_exit", array_size), - &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(0_i64), lit(-1_i64)]); // -1 not in array - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_missing.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: 
config_options.clone(), + }) + .unwrap(), + ) + }) }, ); @@ -138,43 +185,65 @@ fn bench_array_has_all(c: &mut Criterion, array_size: usize) { fn bench_array_has_any(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_any"); - - // Benchmark: first element matches (best case) + let haystack = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; + + // Benchmark: some elements match + let needle_match = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_match), + ]; group.bench_with_input( - BenchmarkId::new("first_match", array_size), + BenchmarkId::new("some_match", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(0_i64), lit(-1_i64), lit(-2_i64)]); - - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) - }, - ); - - // Benchmark: last element matches (worst case) - group.bench_with_input( - BenchmarkId::new("last_match", array_size), - &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(0_i64)]); - - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: 
return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); // Benchmark: no match + let needle_no_match = + create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64); + let args_no_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_no_match), + ]; group.bench_with_input( BenchmarkId::new("no_match", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(-3_i64)]); - - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_no_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); @@ -183,29 +252,56 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { fn bench_array_has_strings(c: &mut Criterion) { let mut group = c.benchmark_group("array_has_strings"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); - // Benchmark with string arrays (common use case for tickers, tags, etc.) 
- let sizes = vec![10, 100, 1000]; + let sizes = vec![10, 100, 500]; for &size in &sizes { - group.bench_with_input(BenchmarkId::new("found", size), &size, |b, &size| { - let array = (0..size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(array); - let needle = lit("TICKER0005"); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + let list_array = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let arg_fields: Vec> = vec![ + Field::new("arr", list_array.data_type().clone(), false).into(), + Field::new("el", DataType::Utf8, false).into(), + ]; + + let args_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("value_1".to_string()))), + ]; + group.bench_with_input(BenchmarkId::new("found", size), &size, |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }); - group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, &size| { - let array = (0..size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(array); - let needle = lit("NOTFOUND"); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + let args_not_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("NOTFOUND".to_string()))), + ]; + group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_not_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }); } @@ -214,52 
+310,61 @@ fn bench_array_has_strings(c: &mut Criterion) { fn bench_array_has_all_strings(c: &mut Criterion) { let mut group = c.benchmark_group("array_has_all_strings"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); - // Realistic scenario: checking if a portfolio contains certain tickers - let portfolio_size = 100; - let check_sizes = vec![1, 3, 5, 10]; - - for &check_size in &check_sizes { - group.bench_with_input( - BenchmarkId::new("all_found", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let checking = (0..check_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_all(list_array.clone(), needle_array.clone())) - }) - }, - ); - - group.bench_with_input( - BenchmarkId::new("some_missing", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let mut checking = (0..check_size - 1) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - checking.push(lit("NOTFOUND".to_string())); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_all(list_array.clone(), needle_array.clone())) - }) - }, - ); + let sizes = vec![10, 100, 500]; + + for &size in &sizes { + let haystack = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; + + let needle_found = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_found = vec![ + ColumnarValue::Array(haystack.clone()), + 
ColumnarValue::Array(needle_found), + ]; + group.bench_with_input(BenchmarkId::new("all_found", size), &size, |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); + + let needle_missing = + create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_"); + let args_missing = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_missing), + ]; + group.bench_with_input(BenchmarkId::new("not_all_found", size), &size, |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_missing.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); } group.finish(); @@ -267,109 +372,180 @@ fn bench_array_has_all_strings(c: &mut Criterion) { fn bench_array_has_any_strings(c: &mut Criterion) { let mut group = c.benchmark_group("array_has_any_strings"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); - let portfolio_size = 100; - let check_sizes = vec![1, 3, 5, 10]; - - for &check_size in &check_sizes { - group.bench_with_input( - BenchmarkId::new("first_matches", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let mut checking = vec![lit("TICKER0000".to_string())]; - checking.extend((1..check_size).map(|_| lit("NOTFOUND".to_string()))); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_any(list_array.clone(), needle_array.clone())) - }) - }, - ); - 
- group.bench_with_input( - BenchmarkId::new("none_match", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let checking = (0..check_size) - .map(|i| lit(format!("NOTFOUND{i}"))) - .collect::>(); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_any(list_array.clone(), needle_array.clone())) - }) - }, - ); + let sizes = vec![10, 100, 500]; + + for &size in &sizes { + let haystack = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; + + let needle_match = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_match), + ]; + group.bench_with_input(BenchmarkId::new("some_match", size), &size, |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); + + let needle_no_match = + create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_"); + let args_no_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_no_match), + ]; + group.bench_with_input(BenchmarkId::new("no_match", size), &size, |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_no_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); } 
group.finish(); } -fn bench_array_has_edge_cases(c: &mut Criterion) { - let mut group = c.benchmark_group("array_has_edge_cases"); - - // Empty array - group.bench_function("empty_array", |b| { - let list_array = make_array(vec![]); - let needle = lit(1_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // Single element array - found - group.bench_function("single_element_found", |b| { - let list_array = make_array(vec![lit(1_i64)]); - let needle = lit(1_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // Single element array - not found - group.bench_function("single_element_not_found", |b| { - let list_array = make_array(vec![lit(1_i64)]); - let needle = lit(2_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // Array with duplicates - group.bench_function("array_with_duplicates", |b| { - let array = vec![lit(1_i64); 1000]; - let list_array = make_array(array); - let needle = lit(1_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // array_has_all: empty needle - group.bench_function("array_has_all_empty_needle", |b| { - let array = (0..1000).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![]); - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) - }); +fn create_int64_list_array( + num_rows: usize, + array_size: usize, + null_density: f64, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..num_rows * array_size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + Some(rng.random_range(0..array_size as i64)) + } + }) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + 
.unwrap(), + ) +} - // array_has_any: empty needle - group.bench_function("array_has_any_empty_needle", |b| { - let array = (0..1000).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![]); +/// Like `create_int64_list_array` but values are offset so they won't +/// appear in a standard list array (useful for "not found" benchmarks). +fn create_int64_list_array_with_offset( + num_rows: usize, + array_size: usize, + offset: i64, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED + 1); + let values = (0..num_rows * array_size) + .map(|_| Some(rng.random_range(0..array_size as i64) + offset)) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) - }); +fn create_string_list_array( + num_rows: usize, + array_size: usize, + null_density: f64, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..num_rows * array_size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + let idx = rng.random_range(0..array_size); + Some(format!("value_{idx}")) + } + }) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} - group.finish(); +/// Like `create_string_list_array` but values use a different prefix so +/// they won't appear in a standard string list array. 
+fn create_string_list_array_with_prefix( + num_rows: usize, + array_size: usize, + prefix: &str, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED + 1); + let values = (0..num_rows * array_size) + .map(|_| { + let idx = rng.random_range(0..array_size); + Some(format!("{prefix}{idx}")) + }) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) } criterion_group!(benches, criterion_benchmark); From 7b144b1cafe5b719bf9ec9c97df2badbf1210391 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 16 Feb 2026 13:36:15 -0500 Subject: [PATCH 02/22] Add benchmark for array_has_any scalar fastpath --- .../functions-nested/benches/array_has.rs | 262 ++++++++++++++++-- 1 file changed, 242 insertions(+), 20 deletions(-) diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs index 302ef9168650c..c21513babefa9 100644 --- a/datafusion/functions-nested/benches/array_has.rs +++ b/datafusion/functions-nested/benches/array_has.rs @@ -51,6 +51,9 @@ fn criterion_benchmark(c: &mut Criterion) { bench_array_has_strings(c); bench_array_has_all_strings(c); bench_array_has_any_strings(c); + + // Benchmark for array_has_any with one scalar arg + bench_array_has_any_scalar(c); } fn bench_array_has(c: &mut Criterion, array_size: usize) { @@ -185,20 +188,20 @@ fn bench_array_has_all(c: &mut Criterion, array_size: usize) { fn bench_array_has_any(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_any"); - let haystack = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); - let list_type = haystack.data_type().clone(); + let first_arr = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type = first_arr.data_type().clone(); let config_options = 
Arc::new(ConfigOptions::default()); let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); let arg_fields: Vec> = vec![ - Field::new("haystack", list_type.clone(), false).into(), - Field::new("needle", list_type.clone(), false).into(), + Field::new("first", list_type.clone(), false).into(), + Field::new("second", list_type.clone(), false).into(), ]; // Benchmark: some elements match - let needle_match = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let second_match = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); let args_match = vec![ - ColumnarValue::Array(haystack.clone()), - ColumnarValue::Array(needle_match), + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Array(second_match), ]; group.bench_with_input( BenchmarkId::new("some_match", array_size), @@ -221,11 +224,11 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { ); // Benchmark: no match - let needle_no_match = + let second_no_match = create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64); let args_no_match = vec![ - ColumnarValue::Array(haystack.clone()), - ColumnarValue::Array(needle_no_match), + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Array(second_no_match), ]; group.bench_with_input( BenchmarkId::new("no_match", array_size), @@ -247,6 +250,58 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { }, ); + // Benchmark: scalar second arg, some match + let scalar_second_match = create_int64_scalar_list(NEEDLE_SIZE, 0); + let args_scalar_match = vec![ + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Scalar(scalar_second_match), + ]; + group.bench_with_input( + BenchmarkId::new("scalar_some_match", array_size), + &array_size, + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_scalar_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + 
config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + + // Benchmark: scalar second arg, no match + let scalar_second_no_match = create_int64_scalar_list(NEEDLE_SIZE, array_size as i64); + let args_scalar_no_match = vec![ + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Scalar(scalar_second_no_match), + ]; + group.bench_with_input( + BenchmarkId::new("scalar_no_match", array_size), + &array_size, + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_scalar_no_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + group.finish(); } @@ -378,17 +433,17 @@ fn bench_array_has_any_strings(c: &mut Criterion) { let sizes = vec![10, 100, 500]; for &size in &sizes { - let haystack = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); - let list_type = haystack.data_type().clone(); + let first_arr = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let list_type = first_arr.data_type().clone(); let arg_fields: Vec> = vec![ - Field::new("haystack", list_type.clone(), false).into(), - Field::new("needle", list_type.clone(), false).into(), + Field::new("first", list_type.clone(), false).into(), + Field::new("second", list_type.clone(), false).into(), ]; - let needle_match = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let second_match = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); let args_match = vec![ - ColumnarValue::Array(haystack.clone()), - ColumnarValue::Array(needle_match), + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Array(second_match), ]; group.bench_with_input(BenchmarkId::new("some_match", size), &size, |b, _| { let udf = ArrayHasAny::new(); @@ -406,11 +461,11 @@ fn bench_array_has_any_strings(c: &mut Criterion) { }) }); - let needle_no_match = + let second_no_match = 
create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_"); let args_no_match = vec![ - ColumnarValue::Array(haystack.clone()), - ColumnarValue::Array(needle_no_match), + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Array(second_no_match), ]; group.bench_with_input(BenchmarkId::new("no_match", size), &size, |b, _| { let udf = ArrayHasAny::new(); @@ -427,6 +482,141 @@ fn bench_array_has_any_strings(c: &mut Criterion) { ) }) }); + + // Benchmark: scalar second arg, some match + let scalar_second_match = create_string_scalar_list(NEEDLE_SIZE, "value_"); + let args_scalar_match = vec![ + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Scalar(scalar_second_match), + ]; + group.bench_with_input( + BenchmarkId::new("scalar_some_match", size), + &size, + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_scalar_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + + // Benchmark: scalar second arg, no match + let scalar_second_no_match = create_string_scalar_list(NEEDLE_SIZE, "missing_"); + let args_scalar_no_match = vec![ + ColumnarValue::Array(first_arr.clone()), + ColumnarValue::Scalar(scalar_second_no_match), + ]; + group.bench_with_input( + BenchmarkId::new("scalar_no_match", size), + &size, + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_scalar_no_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +/// Benchmarks array_has_any with one scalar arg. Varies the scalar argument +/// size while keeping the columnar array small (3 elements per row). 
+fn bench_array_has_any_scalar(c: &mut Criterion) { + let mut group = c.benchmark_group("array_has_any_scalar"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + + let array_size = 3; + let scalar_sizes = vec![1, 10, 100, 1000]; + + // i64 benchmarks + let first_arr_i64 = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type_i64 = first_arr_i64.data_type().clone(); + let arg_fields_i64: Vec> = vec![ + Field::new("first", list_type_i64.clone(), false).into(), + Field::new("second", list_type_i64.clone(), false).into(), + ]; + + for &scalar_size in &scalar_sizes { + let scalar_arg = create_int64_scalar_list(scalar_size, 0); + let args = vec![ + ColumnarValue::Array(first_arr_i64.clone()), + ColumnarValue::Scalar(scalar_arg), + ]; + group.bench_with_input( + BenchmarkId::new("i64_no_match", scalar_size), + &scalar_size, + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields_i64.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + } + + // String benchmarks + let first_arr_str = create_string_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type_str = first_arr_str.data_type().clone(); + let arg_fields_str: Vec> = vec![ + Field::new("first", list_type_str.clone(), false).into(), + Field::new("second", list_type_str.clone(), false).into(), + ]; + + for &scalar_size in &scalar_sizes { + let scalar_arg = create_string_scalar_list(scalar_size, "missing_"); + let args = vec![ + ColumnarValue::Array(first_arr_str.clone()), + ColumnarValue::Scalar(scalar_arg), + ]; + group.bench_with_input( + BenchmarkId::new("string_no_match", scalar_size), + &scalar_size, + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + 
udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields_str.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); } group.finish(); @@ -548,5 +738,37 @@ fn create_string_list_array_with_prefix( ) } +/// Create a `ScalarValue::List` containing a single list of `size` i64 elements, +/// with values starting at `offset`. +fn create_int64_scalar_list(size: usize, offset: i64) -> ScalarValue { + let values = (0..size as i64) + .map(|i| Some(i + offset)) + .collect::<Int64Array>(); + let list = ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(vec![0, size as i32].into()), + Arc::new(values), + None, + ) + .unwrap(); + ScalarValue::List(Arc::new(list)) +} + +/// Create a `ScalarValue::List` containing a single list of `size` string elements, +/// with values like "{prefix}0", "{prefix}1", etc. +fn create_string_scalar_list(size: usize, prefix: &str) -> ScalarValue { + let values = (0..size) + .map(|i| Some(format!("{prefix}{i}"))) + .collect::<StringArray>(); + let list = ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(vec![0, size as i32].into()), + Arc::new(values), + None, + ) + .unwrap(); + ScalarValue::List(Arc::new(list)) +} + criterion_group!(benches, criterion_benchmark); criterion_main!(benches); From 3260914c67d77debcad374f733a212f5bbb800bd Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 16 Feb 2026 13:49:15 -0500 Subject: [PATCH 03/22] Optimize array_has_any() with one scalar arg --- datafusion/functions-nested/src/array_has.rs | 215 +++++++++++++++++- .../source/user-guide/sql/scalar_functions.md | 8 +- 2 files changed, 209 insertions(+), 14 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index abc0e7406b2c9..e7a882a9d0ad6 100644 --- a/datafusion/functions-nested/src/array_has.rs +++
b/datafusion/functions-nested/src/array_has.rs @@ -38,6 +38,7 @@ use crate::make_array::make_array_udf; use crate::utils::make_scalar_function; use std::any::Any; +use std::collections::HashSet; use std::sync::Arc; // Create static instances of ScalarUDFs for each function @@ -55,7 +56,7 @@ make_udf_expr_and_func!(ArrayHasAll, ); make_udf_expr_and_func!(ArrayHasAny, array_has_any, - haystack_array needle_array, // arg names + first_array second_array, // arg names "returns true if at least one element of the second array appears in the first array; otherwise, it returns false.", // doc array_has_any_udf // internal function name ); @@ -303,10 +304,8 @@ impl<'a> ArrayWrapper<'a> { fn offsets(&self) -> Box<dyn Iterator<Item = usize> + 'a> { match self { ArrayWrapper::FixedSizeList(arr) => { - let offsets = (0..=arr.len()) - .step_by(arr.value_length() as usize) - .collect::<Vec<_>>(); - Box::new(offsets.into_iter()) + let value_length = arr.value_length() as usize; + Box::new((0..=arr.len()).map(move |i| i * value_length)) } ArrayWrapper::List(arr) => { Box::new(arr.offsets().iter().map(|o| (*o) as usize)) @@ -316,6 +315,14 @@ impl<'a> ArrayWrapper<'a> { } } } + + fn nulls(&self) -> Option<&arrow::buffer::NullBuffer> { + match self { + ArrayWrapper::FixedSizeList(arr) => arr.nulls(), + ArrayWrapper::List(arr) => arr.nulls(), + ArrayWrapper::LargeList(arr) => arr.nulls(), + } + } } fn array_has_dispatch_for_array<'a>( @@ -476,6 +483,182 @@ fn array_has_any_inner(args: &[ArrayRef]) -> Result<ArrayRef> { array_has_all_and_any_inner(args, ComparisonType::Any) } +/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar( + columnar_arg: &ColumnarValue, + scalar_arg: &ScalarValue, +) -> Result<ColumnarValue> { + if scalar_arg.is_null() { + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); + } + + // Convert the scalar to a 1-element ListArray, then extract the inner values + let scalar_array = scalar_arg.to_array_of_size(1)?; + let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?; + let scalar_values = scalar_list.values(); + + // If scalar list is empty, result is always false + if scalar_values.is_empty() { + return match columnar_arg { + ColumnarValue::Array(arr) => { + let result = BooleanArray::from(BooleanBuffer::new_unset(arr.len())); + Ok(ColumnarValue::Array(Arc::new(result))) + } + ColumnarValue::Scalar(_) => { + Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false)))) + } + }; + } + + match scalar_values.data_type() { + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { + array_has_any_with_scalar_string(columnar_arg, scalar_values) + } + _ => array_has_any_with_scalar_general(columnar_arg, scalar_values), + } +} + +/// When the scalar argument has more elements than this, the scalar fast path +/// builds a HashSet for O(1) lookups. At or below this threshold, it falls +/// back to a linear scan, since hashing every columnar element is more +/// expensive than a linear scan over a short array. +const SCALAR_SMALL_THRESHOLD: usize = 8; + +/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string( + columnar_arg: &ColumnarValue, + scalar_values: &ArrayRef, +) -> Result<ColumnarValue> { + let scalar_strings = string_array_to_vec(scalar_values.as_ref()); + let has_null_scalar = scalar_strings.iter().any(|s| s.is_none()); + + let (col_arr, is_scalar_output) = match columnar_arg { + ColumnarValue::Array(arr) => (Arc::clone(arr), false), + ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true), + }; + + let col_list: ArrayWrapper = col_arr.as_ref().try_into()?; + let all_col_strings = string_array_to_vec(col_list.values().as_ref()); + let col_offsets: Vec<usize> = col_list.offsets().collect(); + let col_nulls = col_list.nulls(); + + let mut builder = BooleanArray::builder(col_list.len()); + + if scalar_strings.len() > SCALAR_SMALL_THRESHOLD { + // Large scalar: build HashSet for O(1) lookups + let scalar_set: HashSet<String> = scalar_strings + .into_iter() + .flatten() + .map(|s| s.to_string()) + .collect(); + + for i in 0..col_list.len() { + if col_nulls.is_some_and(|v| !v.is_valid(i)) { + builder.append_null(); + continue; + } + let start = col_offsets[i]; + let end = col_offsets[i + 1]; + let found = (start..end).any(|j| match all_col_strings[j] { + Some(s) => scalar_set.contains(s), + None => has_null_scalar, + }); + builder.append_value(found); + } + } else { + // Small scalar: linear scan avoids HashSet hashing overhead + for i in 0..col_list.len() { + if col_nulls.is_some_and(|v| !v.is_valid(i)) { + builder.append_null(); + continue; + } + let start = col_offsets[i]; + let end = col_offsets[i + 1]; + let found = (start..end).any(|j| match all_col_strings[j] { + Some(s) => scalar_strings.contains(&Some(s)), + None => has_null_scalar, + }); + builder.append_value(found); + } + } + + let result: ArrayRef = Arc::new(builder.finish()); + + if is_scalar_output { + Ok(ColumnarValue::Scalar(ScalarValue::try_from_array( + &result, 0, + )?)) + } else { + Ok(ColumnarValue::Array(result)) + } +} + +/// General scalar fast path for `array_has_any`, using
RowConverter for +/// type-erased comparison. +fn array_has_any_with_scalar_general( + columnar_arg: &ColumnarValue, + scalar_values: &ArrayRef, +) -> Result<ColumnarValue> { + let converter = + RowConverter::new(vec![SortField::new(scalar_values.data_type().clone())])?; + let scalar_rows = converter.convert_columns(&[Arc::clone(scalar_values)])?; + + let (col_arr, is_scalar_output) = match columnar_arg { + ColumnarValue::Array(arr) => (Arc::clone(arr), false), + ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true), + }; + + let col_list: ArrayWrapper = col_arr.as_ref().try_into()?; + let col_rows = converter.convert_columns(&[Arc::clone(col_list.values())])?; + let col_offsets: Vec<usize> = col_list.offsets().collect(); + let col_nulls = col_list.nulls(); + + let mut builder = BooleanArray::builder(col_list.len()); + let num_scalar = scalar_rows.num_rows(); + + if num_scalar > SCALAR_SMALL_THRESHOLD { + // Large scalar: build HashSet for O(1) lookups + let scalar_set: HashSet<Box<[u8]>> = (0..num_scalar) + .map(|i| Box::from(scalar_rows.row(i).as_ref())) + .collect(); + + for i in 0..col_list.len() { + if col_nulls.is_some_and(|v| !v.is_valid(i)) { + builder.append_null(); + continue; + } + let start = col_offsets[i]; + let end = col_offsets[i + 1]; + let found = + (start..end).any(|j| scalar_set.contains(col_rows.row(j).as_ref())); + builder.append_value(found); + } + } else { + // Small scalar: linear scan avoids HashSet hashing overhead + for i in 0..col_list.len() { + if col_nulls.is_some_and(|v| !v.is_valid(i)) { + builder.append_null(); + continue; + } + let start = col_offsets[i]; + let end = col_offsets[i + 1]; + let found = (start..end) + .any(|j| (0..num_scalar).any(|k| col_rows.row(j) == scalar_rows.row(k))); + builder.append_value(found); + } + } + + let result: ArrayRef = Arc::new(builder.finish()); + + if is_scalar_output { + Ok(ColumnarValue::Scalar(ScalarValue::try_from_array( + &result, 0, + )?)) + } else { + Ok(ColumnarValue::Array(result)) + } +} + #[user_doc(
doc_section(label = "Array Functions"), description = "Returns true if all elements of sub-array exist in array.", @@ -552,8 +735,8 @@ impl ScalarUDFImpl for ArrayHasAll { #[user_doc( doc_section(label = "Array Functions"), - description = "Returns true if any elements exist in both arrays.", - syntax_example = "array_has_any(array, sub-array)", + description = "Returns true if the arrays have any elements in common.", + syntax_example = "array_has_any(array1, array2)", sql_example = r#"```sql > select array_has_any([1, 2, 3], [3, 4]); +------------------------------------------+ @@ -563,11 +746,11 @@ impl ScalarUDFImpl for ArrayHasAll { +------------------------------------------+ ```"#, argument( - name = "array", + name = "array1", description = "Array expression. Can be a constant, column, or function, and any combination of array operators." ), argument( - name = "sub-array", + name = "array2", description = "Array expression. Can be a constant, column, or function, and any combination of array operators." ) )] @@ -612,7 +795,19 @@ impl ScalarUDFImpl for ArrayHasAny { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result<ColumnarValue> { - make_scalar_function(array_has_any_inner)(&args.args) + let [first_arg, second_arg] = take_function_args(self.name(), &args.args)?; + + // array_has_any is symmetric: if either argument is scalar, build a + // HashSet from it and probe with the rows of the other argument.
+ match (&first_arg, &second_arg) { + (_, ColumnarValue::Scalar(scalar)) => { + array_has_any_with_scalar(first_arg, scalar) + } + (ColumnarValue::Scalar(scalar), _) => { + array_has_any_with_scalar(second_arg, scalar) + } + _ => make_scalar_function(array_has_any_inner)(&args.args), + } } fn aliases(&self) -> &[String] { diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index e09c4cb7cbc32..2de0981c86c30 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -3543,16 +3543,16 @@ array_has_all(array, sub-array) ### `array_has_any` -Returns true if any elements exist in both arrays. +Returns true if the arrays have any elements in common. ```sql -array_has_any(array, sub-array) +array_has_any(array1, array2) ``` #### Arguments -- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. -- **sub-array**: Array expression. Can be a constant, column, or function, and any combination of array operators. +- **array1**: Array expression. Can be a constant, column, or function, and any combination of array operators. +- **array2**: Array expression. Can be a constant, column, or function, and any combination of array operators. 
#### Example From dacf850722c51cf1e651a46d18ea3141ab002dce Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 16 Feb 2026 19:09:12 -0500 Subject: [PATCH 04/22] Tweak to avoid some allocations in string path --- datafusion/functions-nested/src/array_has.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index e7a882a9d0ad6..6e0af2f8a1879 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -546,10 +546,10 @@ fn array_has_any_with_scalar_string( if scalar_strings.len() > SCALAR_SMALL_THRESHOLD { // Large scalar: build HashSet for O(1) lookups - let scalar_set: HashSet<String> = scalar_strings - .into_iter() + let scalar_set: HashSet<&str> = scalar_strings + .iter() + .copied() .flatten() - .map(|s| s.to_string()) .collect(); for i in 0..col_list.len() { From b3bbf3a8d90be7704f24b1f9d6ce0e01dbff6d2c Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 16 Feb 2026 21:03:34 -0500 Subject: [PATCH 05/22] cargo fmt --- datafusion/functions-nested/src/array_has.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 6e0af2f8a1879..1dace233c983d 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -546,11 +546,8 @@ fn array_has_any_with_scalar_string( if scalar_strings.len() > SCALAR_SMALL_THRESHOLD { // Large scalar: build HashSet for O(1) lookups - let scalar_set: HashSet<&str> = scalar_strings - .iter() - .copied() - .flatten() - .collect(); + let scalar_set: HashSet<&str> = + scalar_strings.iter().copied().flatten().collect(); for i in 0..col_list.len() { if col_nulls.is_some_and(|v| !v.is_valid(i)) { From 9c4a653a19a508ccb94508501798ec774fa95fcf Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 19 Feb 2026 07:22:41 -0500
Subject: [PATCH 06/22] Fix i64_no_match benchmark Co-authored-by: Martin Grigorov --- datafusion/functions-nested/benches/array_has.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs index c21513babefa9..61fc5fafb78b7 100644 --- a/datafusion/functions-nested/benches/array_has.rs +++ b/datafusion/functions-nested/benches/array_has.rs @@ -558,7 +558,7 @@ fn bench_array_has_any_scalar(c: &mut Criterion) { ]; for &scalar_size in &scalar_sizes { - let scalar_arg = create_int64_scalar_list(scalar_size, 0); + let scalar_arg = create_int64_scalar_list(scalar_size, array_size as i64); let args = vec![ ColumnarValue::Array(first_arr_i64.clone()), ColumnarValue::Scalar(scalar_arg), From ef941572ab60af0ddb35a887461be6d010603c99 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 07:42:47 -0500 Subject: [PATCH 07/22] Simplify scalar list returning false --- datafusion/functions-nested/src/array_has.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 1dace233c983d..81c753641f7e1 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -499,15 +499,7 @@ fn array_has_any_with_scalar( // If scalar list is empty, result is always false if scalar_values.is_empty() { - return match columnar_arg { - ColumnarValue::Array(arr) => { - let result = BooleanArray::from(BooleanBuffer::new_unset(arr.len())); - Ok(ColumnarValue::Array(Arc::new(result))) - } - ColumnarValue::Scalar(_) => { - Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false)))) - } - }; + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false)))); } match scalar_values.data_type() { From 162f168a9b7a0bbbafd59359913cb5a2771ecf5e Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 07:43:34 -0500 
Subject: [PATCH 08/22] Simplify dispatch for scalar path, per code review Co-authored-by: Jeffrey Vo --- datafusion/functions-nested/src/array_has.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 81c753641f7e1..07ced80171e08 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -789,11 +789,8 @@ impl ScalarUDFImpl for ArrayHasAny { // array_has_any is symmetric: if either argument is scalar, build a // HashSet from it and probe with the rows of the other argument. match (&first_arg, &second_arg) { - (_, ColumnarValue::Scalar(scalar)) => { - array_has_any_with_scalar(first_arg, scalar) - } - (ColumnarValue::Scalar(scalar), _) => { - array_has_any_with_scalar(second_arg, scalar) + (cv, ColumnarValue::Scalar(scalar)) | (ColumnarValue::Scalar(scalar), cv) => { + array_has_any_with_scalar(cv, scalar) } _ => make_scalar_function(array_has_any_inner)(&args.args), } From 1c13b5dcf0a2d473d096e992976385ab755dd1b3 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 07:46:20 -0500 Subject: [PATCH 09/22] Simplify NULL check, per Jeffrey --- datafusion/functions-nested/src/array_has.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 07ced80171e08..7a6c4914348ae 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -542,7 +542,7 @@ fn array_has_any_with_scalar_string( scalar_strings.iter().copied().flatten().collect(); for i in 0..col_list.len() { - if col_nulls.is_some_and(|v| !v.is_valid(i)) { + if col_nulls.is_some_and(|v| v.is_null(i)) { builder.append_null(); continue; } @@ -557,7 +557,7 @@ fn array_has_any_with_scalar_string( } else { // Small scalar: linear scan avoids HashSet hashing overhead for i 
in 0..col_list.len() { - if col_nulls.is_some_and(|v| !v.is_valid(i)) { + if col_nulls.is_some_and(|v| v.is_null(i)) { builder.append_null(); continue; } @@ -612,7 +612,7 @@ fn array_has_any_with_scalar_general( .collect(); for i in 0..col_list.len() { - if col_nulls.is_some_and(|v| !v.is_valid(i)) { + if col_nulls.is_some_and(|v| v.is_null(i)) { builder.append_null(); continue; } @@ -625,7 +625,7 @@ fn array_has_any_with_scalar_general( } else { // Small scalar: linear scan avoids HashSet hashing overhead for i in 0..col_list.len() { - if col_nulls.is_some_and(|v| !v.is_valid(i)) { + if col_nulls.is_some_and(|v| v.is_null(i)) { builder.append_null(); continue; } From c8f0b98b5937d80646c1057b6d9b34378cddd853 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 07:48:49 -0500 Subject: [PATCH 10/22] Improve null check, per code review Co-authored-by: Jeffrey Vo --- datafusion/functions-nested/src/array_has.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 7a6c4914348ae..3376b71331800 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -522,7 +522,7 @@ fn array_has_any_with_scalar_string( scalar_values: &ArrayRef, ) -> Result<ColumnarValue> { let scalar_strings = string_array_to_vec(scalar_values.as_ref()); - let has_null_scalar = scalar_strings.iter().any(|s| s.is_none()); + let has_null_scalar = scalar_values.null_count() > 0; let (col_arr, is_scalar_output) = match columnar_arg { ColumnarValue::Array(arr) => (Arc::clone(arr), false), From 04d3375bf5b9a5e10cb26508e88a8e6b504d1690 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 08:54:32 -0500 Subject: [PATCH 11/22] .
--- datafusion/functions-nested/src/array_has.rs | 124 +++++++++++++------ 1 file changed, 85 insertions(+), 39 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 3376b71331800..73a48a797a855 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -17,7 +17,7 @@ //! [`ScalarUDFImpl`] definitions for array_has, array_has_all and array_has_any functions. -use arrow::array::{Array, ArrayRef, BooleanArray, Datum, Scalar}; +use arrow::array::{Array, ArrayRef, AsArray, BooleanArray, Datum, Scalar, StringArrayType}; use arrow::buffer::BooleanBuffer; use arrow::datatypes::DataType; use arrow::row::{RowConverter, Rows, SortField}; @@ -521,7 +521,6 @@ fn array_has_any_with_scalar_string( columnar_arg: &ColumnarValue, scalar_values: &ArrayRef, ) -> Result<ColumnarValue> { - let scalar_strings = string_array_to_vec(scalar_values.as_ref()); let has_null_scalar = scalar_values.null_count() > 0; let (col_arr, is_scalar_output) = match columnar_arg { @@ -530,48 +529,45 @@ fn array_has_any_with_scalar_string( }; let col_list: ArrayWrapper = col_arr.as_ref().try_into()?; - let all_col_strings = string_array_to_vec(col_list.values().as_ref()); + let col_values = col_list.values(); let col_offsets: Vec<usize> = col_list.offsets().collect(); let col_nulls = col_list.nulls(); - let mut builder = BooleanArray::builder(col_list.len()); - - if scalar_strings.len() > SCALAR_SMALL_THRESHOLD { - // Large scalar: build HashSet for O(1) lookups - let scalar_set: HashSet<&str> = - scalar_strings.iter().copied().flatten().collect(); - - for i in 0..col_list.len() { - if col_nulls.is_some_and(|v| v.is_null(i)) { - builder.append_null(); - continue; - } - let start = col_offsets[i]; - let end = col_offsets[i + 1]; - let found = (start..end).any(|j| match all_col_strings[j] { - Some(s) => scalar_set.contains(s), - None => has_null_scalar, - }); - builder.append_value(found); - } + let
scalar_lookup = if scalar_values.len() > SCALAR_SMALL_THRESHOLD { + ScalarStringLookup::Set( + string_array_to_vec(scalar_values.as_ref()) + .into_iter() + .flatten() + .collect(), + ) } else { - // Small scalar: linear scan avoids HashSet hashing overhead - for i in 0..col_list.len() { - if col_nulls.is_some_and(|v| v.is_null(i)) { - builder.append_null(); - continue; - } - let start = col_offsets[i]; - let end = col_offsets[i + 1]; - let found = (start..end).any(|j| match all_col_strings[j] { - Some(s) => scalar_strings.contains(&Some(s)), - None => has_null_scalar, - }); - builder.append_value(found); - } - } + ScalarStringLookup::List(string_array_to_vec(scalar_values.as_ref())) + }; - let result: ArrayRef = Arc::new(builder.finish()); + let result = match col_values.data_type() { + DataType::Utf8 => array_has_any_string_inner( + col_values.as_string::<i32>(), + &col_offsets, + col_nulls, + has_null_scalar, + &scalar_lookup, + ), + DataType::LargeUtf8 => array_has_any_string_inner( + col_values.as_string::<i64>(), + &col_offsets, + col_nulls, + has_null_scalar, + &scalar_lookup, + ), + DataType::Utf8View => array_has_any_string_inner( + col_values.as_string_view(), + &col_offsets, + col_nulls, + has_null_scalar, + &scalar_lookup, + ), + _ => unreachable!("array_has_any_with_scalar_string called with non-string type"), + }; if is_scalar_output { Ok(ColumnarValue::Scalar(ScalarValue::try_from_array( @@ -582,6 +578,56 @@ fn array_has_any_with_scalar_string( } } +/// Pre-computed lookup structure for the scalar string values. +enum ScalarStringLookup<'a> { + /// Large scalar: HashSet for O(1) lookups. + Set(HashSet<&'a str>), + /// Small scalar: Vec for linear scan (avoids hashing overhead).
+ List(Vec<Option<&'a str>>), +} + +impl ScalarStringLookup<'_> { + fn contains(&self, value: &str) -> bool { + match self { + ScalarStringLookup::Set(set) => set.contains(value), + ScalarStringLookup::List(list) => list.contains(&Some(value)), + } + } +} + +/// Inner implementation of the string scalar fast path, generic over string +/// array type so we can access column elements by index without materializing +/// a `Vec<Option<&str>>` for the column values. +fn array_has_any_string_inner<'a, C: StringArrayType<'a>>( + col_strings: C, + col_offsets: &[usize], + col_nulls: Option<&arrow::buffer::NullBuffer>, + has_null_scalar: bool, + scalar_lookup: &ScalarStringLookup<'_>, +) -> ArrayRef { + let num_rows = col_offsets.len() - 1; + let mut builder = BooleanArray::builder(num_rows); + + for i in 0..num_rows { + if col_nulls.is_some_and(|v| v.is_null(i)) { + builder.append_null(); + continue; + } + let start = col_offsets[i]; + let end = col_offsets[i + 1]; + let found = (start..end).any(|j| { + if col_strings.is_null(j) { + has_null_scalar + } else { + scalar_lookup.contains(col_strings.value(j)) + } + }); + builder.append_value(found); + } + + Arc::new(builder.finish()) +} + /// General scalar fast path for `array_has_any`, using RowConverter for /// type-erased comparison. fn array_has_any_with_scalar_general( From b7074e06a146c490255c8329535e5bb9201384e7 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 09:00:26 -0500 Subject: [PATCH 12/22] . --- datafusion/functions-nested/src/array_has.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 73a48a797a855..e72a46ae4ae54 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -17,7 +17,9 @@ //! [`ScalarUDFImpl`] definitions for array_has, array_has_all and array_has_any functions.
-use arrow::array::{Array, ArrayRef, AsArray, BooleanArray, Datum, Scalar, StringArrayType}; +use arrow::array::{ + Array, ArrayRef, AsArray, BooleanArray, Datum, Scalar, StringArrayType, +}; use arrow::buffer::BooleanBuffer; use arrow::datatypes::DataType; use arrow::row::{RowConverter, Rows, SortField}; From 52f1c32f74f3f2709bab0034915e41d068846d28 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 09:07:18 -0500 Subject: [PATCH 13/22] Tweak --- datafusion/functions-nested/src/array_has.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index e72a46ae4ae54..6bc7686ab694c 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -580,11 +580,11 @@ fn array_has_any_with_scalar_string( } } -/// Pre-computed lookup structure for the scalar string values. +/// Pre-computed lookup structure for the scalar string fastpath. enum ScalarStringLookup<'a> { /// Large scalar: HashSet for O(1) lookups. Set(HashSet<&'a str>), - /// Small scalar: Vec for linear scan (avoids hashing overhead). + /// Small scalar: Vec for linear scan. List(Vec<Option<&'a str>>), } From fb696f27773d0bf6108c587cb686676d9f02138d Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 09:09:37 -0500 Subject: [PATCH 14/22] . --- datafusion/functions-nested/src/array_has.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 6bc7686ab694c..a8743f31d486b 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -598,8 +598,7 @@ impl ScalarStringLookup<'_> { } /// Inner implementation of the string scalar fast path, generic over string -/// array type so we can access column elements by index without materializing -/// a `Vec<Option<&str>>` for the column values.
+/// array type to allow direct element access by index. fn array_has_any_string_inner<'a, C: StringArrayType<'a>>( col_strings: C, col_offsets: &[usize], From 8fb60df61e51633e708f487d9336e74729ab9139 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 09:26:16 -0500 Subject: [PATCH 15/22] Avoid vec alloc when building ScalarStringLookup::Set --- datafusion/functions-nested/src/array_has.rs | 38 ++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index a8743f31d486b..e0de29457e7c1 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -523,8 +523,6 @@ fn array_has_any_with_scalar_string( columnar_arg: &ColumnarValue, scalar_values: &ArrayRef, ) -> Result<ColumnarValue> { - let has_null_scalar = scalar_values.null_count() > 0; - let (col_arr, is_scalar_output) = match columnar_arg { ColumnarValue::Array(arr) => (Arc::clone(arr), false), ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true), @@ -535,16 +533,8 @@ fn array_has_any_with_scalar_string( let col_offsets: Vec<usize> = col_list.offsets().collect(); let col_nulls = col_list.nulls(); - let scalar_lookup = if scalar_values.len() > SCALAR_SMALL_THRESHOLD { - ScalarStringLookup::Set( - string_array_to_vec(scalar_values.as_ref()) - .into_iter() - .flatten() - .collect(), - ) - } else { - ScalarStringLookup::List(string_array_to_vec(scalar_values.as_ref())) - }; + let scalar_lookup = ScalarStringLookup::new(scalar_values); + let has_null_scalar = scalar_values.null_count() > 0; let result = match col_values.data_type() { DataType::Utf8 => array_has_any_string_inner( @@ -588,7 +578,27 @@ enum ScalarStringLookup<'a> { List(Vec<Option<&'a str>>), } -impl ScalarStringLookup<'_> { +impl<'a> ScalarStringLookup<'a> { + fn new(scalar_values: &'a ArrayRef) -> Self { + if scalar_values.len() > SCALAR_SMALL_THRESHOLD { + let set = match scalar_values.data_type() {
+ DataType::Utf8 => { + scalar_values.as_string::<i32>().iter().flatten().collect() + } + DataType::LargeUtf8 => { + scalar_values.as_string::<i64>().iter().flatten().collect() + } + DataType::Utf8View => { + scalar_values.as_string_view().iter().flatten().collect() + } + _ => unreachable!(), + }; + ScalarStringLookup::Set(set) + } else { + ScalarStringLookup::List(string_array_to_vec(scalar_values.as_ref())) + } + } + fn contains(&self, value: &str) -> bool { match self { ScalarStringLookup::Set(set) => set.contains(value), @@ -599,7 +609,7 @@ impl ScalarStringLookup<'_> { /// Inner implementation of the string scalar fast path, generic over string /// array type to allow direct element access by index. -fn array_has_any_string_inner<'a, C: StringArrayType<'a>>( +fn array_has_any_string_inner<'a, C: StringArrayType<'a> + Copy>( col_strings: C, col_offsets: &[usize], col_nulls: Option<&arrow::buffer::NullBuffer>, From c83c70d211347a210c07d2c4f910b73d46158afc Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 09:33:29 -0500 Subject: [PATCH 16/22] Revert avoiding Vec for ScalarStringLookup::Set construction, cheap in practice --- datafusion/functions-nested/src/array_has.rs | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index e0de29457e7c1..0d8ce727c4bf3 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -580,22 +580,11 @@ enum ScalarStringLookup<'a> { impl<'a> ScalarStringLookup<'a> { fn new(scalar_values: &'a ArrayRef) -> Self { - if scalar_values.len() > SCALAR_SMALL_THRESHOLD { - let set = match scalar_values.data_type() { - DataType::Utf8 => { - scalar_values.as_string::<i32>().iter().flatten().collect() - } - DataType::LargeUtf8 => { - scalar_values.as_string::<i64>().iter().flatten().collect() - } - DataType::Utf8View => { -
scalar_values.as_string_view().iter().flatten().collect() - } - _ => unreachable!(), - }; - ScalarStringLookup::Set(set) + let strings = string_array_to_vec(scalar_values.as_ref()); + if strings.len() > SCALAR_SMALL_THRESHOLD { + ScalarStringLookup::Set(strings.into_iter().flatten().collect()) } else { - ScalarStringLookup::List(string_array_to_vec(scalar_values.as_ref())) + ScalarStringLookup::List(strings) } } From a14c7eadb7d739090054e4cb3f89cfcc625d95a3 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 09:57:03 -0500 Subject: [PATCH 17/22] Defend against a possible sliced array --- datafusion/functions-nested/src/array_has.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 0d8ce727c4bf3..c7a7f20b6b9ec 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -497,7 +497,8 @@ fn array_has_any_with_scalar( // Convert the scalar to a 1-element ListArray, then extract the inner values let scalar_array = scalar_arg.to_array_of_size(1)?; let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?; - let scalar_values = scalar_list.values(); + let offsets: Vec<usize> = scalar_list.offsets().collect(); + let scalar_values = scalar_list.values().slice(offsets[0], offsets[1] - offsets[0]); // If scalar list is empty, result is always false if scalar_values.is_empty() { @@ -506,9 +507,9 @@ fn array_has_any_with_scalar( match scalar_values.data_type() { DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { - array_has_any_with_scalar_string(columnar_arg, scalar_values) + array_has_any_with_scalar_string(columnar_arg, &scalar_values) } - _ => array_has_any_with_scalar_general(columnar_arg, scalar_values), + _ => array_has_any_with_scalar_general(columnar_arg, &scalar_values), } } From 8e9cc91ba96ee85f198704c64274e685b98ecb97 Mon Sep 17 00:00:00 2001 From: Neil Conway Date:
Fri, 20 Feb 2026 10:49:30 -0500 Subject: [PATCH 18/22] Don't use NEEDLE_SIZE constant for array_has_any benchmarks --- .../functions-nested/benches/array_has.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs index 61fc5fafb78b7..d28c4fcf654da 100644 --- a/datafusion/functions-nested/benches/array_has.rs +++ b/datafusion/functions-nested/benches/array_has.rs @@ -186,6 +186,8 @@ fn bench_array_has_all(c: &mut Criterion, array_size: usize) { group.finish(); } +const SMALL_ARRAY_SIZE: usize = NEEDLE_SIZE; + fn bench_array_has_any(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_any"); let first_arr = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); @@ -198,7 +200,7 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { ]; // Benchmark: some elements match - let second_match = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let second_match = create_int64_list_array(NUM_ROWS, SMALL_ARRAY_SIZE, 0.0); let args_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Array(second_match), @@ -225,7 +227,7 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { // Benchmark: no match let second_no_match = - create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64); + create_int64_list_array_with_offset(NUM_ROWS, SMALL_ARRAY_SIZE, array_size as i64); let args_no_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Array(second_no_match), @@ -251,7 +253,7 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { ); // Benchmark: scalar second arg, some match - let scalar_second_match = create_int64_scalar_list(NEEDLE_SIZE, 0); + let scalar_second_match = create_int64_scalar_list(SMALL_ARRAY_SIZE, 0); let args_scalar_match = vec![ ColumnarValue::Array(first_arr.clone()), 
ColumnarValue::Scalar(scalar_second_match), @@ -277,7 +279,7 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { ); // Benchmark: scalar second arg, no match - let scalar_second_no_match = create_int64_scalar_list(NEEDLE_SIZE, array_size as i64); + let scalar_second_no_match = create_int64_scalar_list(SMALL_ARRAY_SIZE, array_size as i64); let args_scalar_no_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Scalar(scalar_second_no_match), @@ -440,7 +442,7 @@ fn bench_array_has_any_strings(c: &mut Criterion) { Field::new("second", list_type.clone(), false).into(), ]; - let second_match = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let second_match = create_string_list_array(NUM_ROWS, SMALL_ARRAY_SIZE, 0.0); let args_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Array(second_match), @@ -462,7 +464,7 @@ fn bench_array_has_any_strings(c: &mut Criterion) { }); let second_no_match = - create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_"); + create_string_list_array_with_prefix(NUM_ROWS, SMALL_ARRAY_SIZE, "missing_"); let args_no_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Array(second_no_match), @@ -484,7 +486,7 @@ fn bench_array_has_any_strings(c: &mut Criterion) { }); // Benchmark: scalar second arg, some match - let scalar_second_match = create_string_scalar_list(NEEDLE_SIZE, "value_"); + let scalar_second_match = create_string_scalar_list(SMALL_ARRAY_SIZE, "value_"); let args_scalar_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Scalar(scalar_second_match), @@ -510,7 +512,7 @@ fn bench_array_has_any_strings(c: &mut Criterion) { ); // Benchmark: scalar second arg, no match - let scalar_second_no_match = create_string_scalar_list(NEEDLE_SIZE, "missing_"); + let scalar_second_no_match = create_string_scalar_list(SMALL_ARRAY_SIZE, "missing_"); let args_scalar_no_match = vec![ ColumnarValue::Array(first_arr.clone()), 
ColumnarValue::Scalar(scalar_second_no_match), From 3c98d4d85e1ee7b9c3ff7de0c8c59eea6c3191ac Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 10:52:58 -0500 Subject: [PATCH 19/22] cargo fmt --- datafusion/functions-nested/benches/array_has.rs | 13 +++++++++---- datafusion/functions-nested/src/array_has.rs | 4 +++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs index d28c4fcf654da..f5e66d56c0efe 100644 --- a/datafusion/functions-nested/benches/array_has.rs +++ b/datafusion/functions-nested/benches/array_has.rs @@ -226,8 +226,11 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { ); // Benchmark: no match - let second_no_match = - create_int64_list_array_with_offset(NUM_ROWS, SMALL_ARRAY_SIZE, array_size as i64); + let second_no_match = create_int64_list_array_with_offset( + NUM_ROWS, + SMALL_ARRAY_SIZE, + array_size as i64, + ); let args_no_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Array(second_no_match), @@ -279,7 +282,8 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { ); // Benchmark: scalar second arg, no match - let scalar_second_no_match = create_int64_scalar_list(SMALL_ARRAY_SIZE, array_size as i64); + let scalar_second_no_match = + create_int64_scalar_list(SMALL_ARRAY_SIZE, array_size as i64); let args_scalar_no_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Scalar(scalar_second_no_match), @@ -512,7 +516,8 @@ fn bench_array_has_any_strings(c: &mut Criterion) { ); // Benchmark: scalar second arg, no match - let scalar_second_no_match = create_string_scalar_list(SMALL_ARRAY_SIZE, "missing_"); + let scalar_second_no_match = + create_string_scalar_list(SMALL_ARRAY_SIZE, "missing_"); let args_scalar_no_match = vec![ ColumnarValue::Array(first_arr.clone()), ColumnarValue::Scalar(scalar_second_no_match), diff --git 
a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 425e5380517ab..8411a32202e6c 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -510,7 +510,9 @@ fn array_has_any_with_scalar( let scalar_array = scalar_arg.to_array_of_size(1)?; let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?; let offsets: Vec<usize> = scalar_list.offsets().collect(); - let scalar_values = scalar_list.values().slice(offsets[0], offsets[1] - offsets[0]); + let scalar_values = scalar_list + .values() + .slice(offsets[0], offsets[1] - offsets[0]); // If scalar list is empty, result is always false if scalar_values.is_empty() { From c7abde1338e613824de9c14b3746639d92c55089 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Feb 2026 21:59:57 -0500 Subject: [PATCH 20/22] Tweak comment --- datafusion/functions-nested/src/array_has.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 8411a32202e6c..7173efadf3eeb 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -847,8 +847,7 @@ impl ScalarUDFImpl for ArrayHasAny { ) -> Result<ColumnarValue> { let [first_arg, second_arg] = take_function_args(self.name(), &args.args)?; - // array_has_any is symmetric: if either argument is scalar, build a - // HashSet from it and probe with the rows of the other argument. + // If either argument is scalar, use the fast path.
match (&first_arg, &second_arg) { (cv, ColumnarValue::Scalar(scalar)) | (ColumnarValue::Scalar(scalar), cv) => { array_has_any_with_scalar(cv, scalar) From 242f3c2092d745b5adbe724c481dd4b96d71b2c7 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 23 Feb 2026 09:30:20 -0500 Subject: [PATCH 21/22] Switch to hashbrown from std HashSet --- Cargo.lock | 1 + datafusion/functions-nested/Cargo.toml | 1 + datafusion/functions-nested/src/array_has.rs | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 7fd39099579fa..765e93b2021a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2302,6 +2302,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-macros", "datafusion-physical-expr-common", + "hashbrown 0.16.1", "itertools 0.14.0", "log", "paste", diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index e5e601f30ae84..d885a2ca96dac 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -57,6 +57,7 @@ datafusion-functions-aggregate = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-macros = { workspace = true } datafusion-physical-expr-common = { workspace = true } +hashbrown = { workspace = true } itertools = { workspace = true, features = ["use_std"] } log = { workspace = true } paste = { workspace = true } diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 7173efadf3eeb..df9c0b4f51c40 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -41,7 +41,7 @@ use crate::make_array::make_array_udf; use crate::utils::make_scalar_function; use std::any::Any; -use std::collections::HashSet; +use hashbrown::HashSet; use std::sync::Arc; // Create static instances of ScalarUDFs for each function From 1d3464ec300d10074b059d8ed3dc572b3197f6c1 Mon Sep 17 00:00:00 2001 From: Neil 
Conway Date: Mon, 23 Feb 2026 10:12:41 -0500 Subject: [PATCH 22/22] cargo fmt --- datafusion/functions-nested/src/array_has.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index df9c0b4f51c40..ace69de66f5c3 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -40,8 +40,8 @@ use itertools::Itertools; use crate::make_array::make_array_udf; use crate::utils::make_scalar_function; -use std::any::Any; use hashbrown::HashSet; +use std::any::Any; use std::sync::Arc; // Create static instances of ScalarUDFs for each function