Skip to content

Commit

Permalink
Optimize performance of initcap function (~2x faster) (apache#13691)
Browse files Browse the repository at this point in the history
* Optimize performance of initcap (~2x faster)

Signed-off-by: Tai Le Manh <[email protected]>

* format

---------

Signed-off-by: Tai Le Manh <[email protected]>
  • Loading branch information
tlm365 authored Dec 12, 2024
1 parent aeddbd9 commit 320e4d6
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 13 deletions.
5 changes: 5 additions & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,8 @@ required-features = ["unicode_expressions"]
harness = false
name = "trunc"
required-features = ["math_expressions"]

[[bench]]
harness = false
name = "initcap"
required-features = ["string_expressions"]
93 changes: 93 additions & 0 deletions datafusion/functions/benches/initcap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;

use arrow::array::OffsetSizeTrait;
use arrow::datatypes::DataType;
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::string;
use std::sync::Arc;

/// Builds the single-column argument list passed to `initcap` in the benchmarks.
///
/// * `size` - forwarded to the arrow bench helper as the array length.
/// * `str_len` - forwarded to the arrow bench helper as the string length.
/// * `force_view_types` - when `true` the column is produced by
///   `create_string_view_array_with_len`; otherwise by
///   `create_string_array_with_len::<O>`, where `O` selects the offset type.
///
/// Returns a one-element vector holding the generated array wrapped in
/// `ColumnarValue::Array`.
fn create_args<O: OffsetSizeTrait>(
    size: usize,
    str_len: usize,
    force_view_types: bool,
) -> Vec<ColumnarValue> {
    // NOTE(review): the `0.2` literal is presumably the null density expected
    // by the arrow bench helpers — confirm against `arrow::util::bench_util`.
    let column = if force_view_types {
        ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
            size, 0.2, str_len, false,
        )))
    } else {
        ColumnarValue::Array(Arc::new(create_string_array_with_len::<O>(
            size, 0.2, str_len,
        )))
    };

    vec![column]
}

fn criterion_benchmark(c: &mut Criterion) {
let initcap = string::initcap();
for size in [1024, 4096] {
let args = create_args::<i32>(size, 8, true);
c.bench_function(
format!("initcap string view shorter than 12 [size={}]", size).as_str(),
|b| {
b.iter(|| {
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
args: args.clone(),
number_rows: size,
return_type: &DataType::Utf8View,
}))
})
},
);

let args = create_args::<i32>(size, 16, true);
c.bench_function(
format!("initcap string view longer than 12 [size={}]", size).as_str(),
|b| {
b.iter(|| {
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
args: args.clone(),
number_rows: size,
return_type: &DataType::Utf8View,
}))
})
},
);

let args = create_args::<i32>(size, 16, false);
c.bench_function(format!("initcap string [size={}]", size).as_str(), |b| {
b.iter(|| {
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
args: args.clone(),
number_rows: size,
return_type: &DataType::Utf8,
}))
})
});
}
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
27 changes: 14 additions & 13 deletions datafusion/functions/src/string/initcap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,21 +132,22 @@ fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
Ok(Arc::new(result) as ArrayRef)
}

fn initcap_string(string: Option<&str>) -> Option<String> {
let mut char_vector = Vec::<char>::new();
string.map(|string: &str| {
char_vector.clear();
let mut previous_character_letter_or_number = false;
for c in string.chars() {
if previous_character_letter_or_number {
char_vector.push(c.to_ascii_lowercase());
fn initcap_string(input: Option<&str>) -> Option<String> {
input.map(|s| {
let mut result = String::with_capacity(s.len());
let mut prev_is_alphanumeric = false;

for c in s.chars() {
let transformed = if prev_is_alphanumeric {
c.to_ascii_lowercase()
} else {
char_vector.push(c.to_ascii_uppercase());
}
previous_character_letter_or_number =
c.is_ascii_uppercase() || c.is_ascii_lowercase() || c.is_ascii_digit();
c.to_ascii_uppercase()
};
result.push(transformed);
prev_is_alphanumeric = c.is_ascii_alphanumeric();
}
char_vector.iter().collect::<String>()

result
})
}

Expand Down

0 comments on commit 320e4d6

Please sign in to comment.