Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor signatures for lpad, rpad, left, and right #13420

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions datafusion/functions/src/unicode/left.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ use datafusion_common::cast::{
as_generic_string_array, as_int64_array, as_string_view_array,
};
use datafusion_common::exec_err;
use datafusion_common::types::{logical_int64, logical_string};
use datafusion_common::Result;
use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::{
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
};
Expand All @@ -50,14 +50,9 @@ impl Default for LeftFunc {

impl LeftFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8View, Int64]),
Exact(vec![Utf8, Int64]),
Exact(vec![LargeUtf8, Int64]),
],
signature: Signature::coercible(
vec![logical_string(), logical_int64()],
Volatility::Immutable,
),
}
Expand Down
22 changes: 8 additions & 14 deletions datafusion/functions/src/unicode/lpad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use arrow::array::{
OffsetSizeTrait, StringViewArray,
};
use arrow::datatypes::DataType;
use datafusion_common::types::{logical_int64, logical_string};
use unicode_segmentation::UnicodeSegmentation;
use DataType::{LargeUtf8, Utf8, Utf8View};

Expand All @@ -32,7 +33,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
use datafusion_common::cast::as_int64_array;
use datafusion_common::{exec_err, Result};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::TypeSignature;
use datafusion_expr::{
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
};
Expand All @@ -50,22 +51,15 @@ impl Default for LPadFunc {

impl LPadFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8View, Int64]),
Exact(vec![Utf8View, Int64, Utf8View]),
Exact(vec![Utf8View, Int64, Utf8]),
Exact(vec![Utf8View, Int64, LargeUtf8]),
Exact(vec![Utf8, Int64]),
Exact(vec![Utf8, Int64, Utf8View]),
Exact(vec![Utf8, Int64, Utf8]),
Exact(vec![Utf8, Int64, LargeUtf8]),
Exact(vec![LargeUtf8, Int64]),
Exact(vec![LargeUtf8, Int64, Utf8View]),
Exact(vec![LargeUtf8, Int64, Utf8]),
Exact(vec![LargeUtf8, Int64, LargeUtf8]),
TypeSignature::Coercible(vec![logical_string(), logical_int64()]),
TypeSignature::Coercible(vec![
logical_string(),
logical_int64(),
logical_string(),
]),
],
Volatility::Immutable,
),
Expand Down
11 changes: 3 additions & 8 deletions datafusion/functions/src/unicode/right.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ use datafusion_common::cast::{
as_generic_string_array, as_int64_array, as_string_view_array,
};
use datafusion_common::exec_err;
use datafusion_common::types::{logical_int64, logical_string};
use datafusion_common::Result;
use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::{
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
};
Expand All @@ -50,14 +50,9 @@ impl Default for RightFunc {

impl RightFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8View, Int64]),
Exact(vec![Utf8, Int64]),
Exact(vec![LargeUtf8, Int64]),
],
signature: Signature::coercible(
vec![logical_string(), logical_int64()],
Volatility::Immutable,
),
}
Expand Down
22 changes: 8 additions & 14 deletions datafusion/functions/src/unicode/rpad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ use arrow::array::{
};
use arrow::datatypes::DataType;
use datafusion_common::cast::as_int64_array;
use datafusion_common::types::{logical_int64, logical_string};
use datafusion_common::DataFusionError;
use datafusion_common::{exec_err, Result};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::TypeSignature;
use datafusion_expr::{
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
};
Expand All @@ -49,22 +50,15 @@ impl Default for RPadFunc {

impl RPadFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8View, Int64]),
Exact(vec![Utf8View, Int64, Utf8View]),
Exact(vec![Utf8View, Int64, Utf8]),
Exact(vec![Utf8View, Int64, LargeUtf8]),
Exact(vec![Utf8, Int64]),
Exact(vec![Utf8, Int64, Utf8View]),
Exact(vec![Utf8, Int64, Utf8]),
Exact(vec![Utf8, Int64, LargeUtf8]),
Exact(vec![LargeUtf8, Int64]),
Exact(vec![LargeUtf8, Int64, Utf8View]),
Exact(vec![LargeUtf8, Int64, Utf8]),
Exact(vec![LargeUtf8, Int64, LargeUtf8]),
TypeSignature::Coercible(vec![logical_string(), logical_int64()]),
TypeSignature::Coercible(vec![
logical_string(),
logical_int64(),
logical_string(),
]),
],
Volatility::Immutable,
),
Expand Down
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/scalar.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1864,10 +1864,10 @@ query TT
EXPLAIN SELECT letter, letter = LEFT(letter2, 1) FROM simple_string;
----
logical_plan
01)Projection: simple_string.letter, simple_string.letter = left(simple_string.letter2, Int64(1))
01)Projection: simple_string.letter, simple_string.letter = left(CAST(simple_string.letter2 AS Utf8View), Int64(1))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking maybe we should avoid casting type if they have the same Logical type like utf8 -> utf8view. 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would agree. If the signature allows for a string and receives a Utf8 it should accept it as is unless it needs to be coerced to a common type for some other reason. The less casting the better imho

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

100% -- casts can often require trivial computation during query -- in this particular case casting letter2 to a Utf8View means it will copy at least an additional 128 bytes for each row (each view is 128 bytes)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion. I implement not to cast if their logical types are the same. However, it is failing in some cases where Dictionary is in the signature. In those cases, the logical type is the same, but the native type is Dictionary causing type mismatch. The test can be reproduced

cargo test --test sqllogictests -- jctest

Any suggestions?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might need to implement the kernel for dictionary type.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check the fn invoke function for each function.

For example lpad

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
        match args[0].data_type() {
            Utf8 | Utf8View => make_scalar_function(lpad::<i32>, vec![])(args),
            LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args),
            other => exec_err!("Unsupported data type {other:?} for function lpad"),
        }
    }

It support utf8/utf8view/largeutf8, but not dictionary. You can rewrite it like this

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
        invoke_inner(args, &args[0].data_type())
}

fn invoke_inner(args: &[ColumnarValue], data_type: &DataType) -> Result<ColumnarValue> {
    match data_type {
        DataType::Dictionary(_, v) => {
            invoke_inner(args, v.as_ref())
        }
        Utf8 | Utf8View => make_scalar_function(lpad::<i32>, vec![])(args),
        LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args),
        other => exec_err!("Unsupported data type {other:?} for function lpad"),
    }
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the pointer! I added the support for left function. Can you give a read of whether that is a good design?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @jayzhan211, I have added dictionary support for those four functions and repeat because the logical type casting skipping fails some relevant tests. Please let me know if you have any thoughts on the current implementation.

I also have a question when getting values from Dictionary to get the actual string, it drops NULL values, which causes some tests to fail. Is there any helper method I can use to get NULL preserving values from the Dictionary?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dictionary(_key_type, value_type) => {
possibly @jiashenC ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that iterating values in Dict is quite tricky

        let test = vec!["a", "a", "b", "c"];
        let array : DictionaryArray<Int32Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
        let typed = array.downcast_dict::<StringArray>().unwrap();
        for v in typed {
            println!("iter value in dict: {:?}", v);
        }

02)--TableScan: simple_string projection=[letter, letter2]
physical_plan
01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(letter2@1, 1) as simple_string.letter = left(simple_string.letter2,Int64(1))]
01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(CAST(letter2@1 AS Utf8View), 1) as simple_string.letter = left(simple_string.letter2,Int64(1))]
02)--MemoryExec: partitions=1, partition_sizes=[1]

query TB
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8(" ")) AS c1
01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8View(" ")) AS c1
02)--TableScan: test projection=[column1_utf8view]

query TT
Expand All @@ -672,7 +672,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: lpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1
01)Projection: lpad(test.column1_utf8view, Int64(12), CAST(test.column2_large_utf8 AS Utf8View)) AS c1
02)--TableScan: test projection=[column2_large_utf8, column1_utf8view]

query TT
Expand Down Expand Up @@ -826,7 +826,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: rpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1
01)Projection: rpad(test.column1_utf8view, Int64(12), CAST(test.column2_large_utf8 AS Utf8View)) AS c1
02)--TableScan: test projection=[column2_large_utf8, column1_utf8view]

query TT
Expand Down