From c6500d1741fb092636c89ea7c762cbfad1c3f3e3 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Sun, 1 Dec 2024 14:51:39 +0000 Subject: [PATCH 1/5] chore: datafusion 44 fix for PartitionedFile Signed-off-by: R. Tyler Croy --- crates/core/src/delta_datafusion/cdf/scan_utils.rs | 1 + crates/core/src/delta_datafusion/mod.rs | 3 +++ 2 files changed, 4 insertions(+) diff --git a/crates/core/src/delta_datafusion/cdf/scan_utils.rs b/crates/core/src/delta_datafusion/cdf/scan_utils.rs index 27285179f6..459157f4c8 100644 --- a/crates/core/src/delta_datafusion/cdf/scan_utils.rs +++ b/crates/core/src/delta_datafusion/cdf/scan_utils.rs @@ -84,6 +84,7 @@ pub fn create_partition_values( extensions: None, range: None, statistics: None, + metadata_size_hint: None, }; file_groups.entry(new_part_values).or_default().push(part); diff --git a/crates/core/src/delta_datafusion/mod.rs b/crates/core/src/delta_datafusion/mod.rs index e692dd054b..2a8c269b5d 100644 --- a/crates/core/src/delta_datafusion/mod.rs +++ b/crates/core/src/delta_datafusion/mod.rs @@ -1054,6 +1054,7 @@ fn partitioned_file_from_action( range: None, extensions: None, statistics: None, + metadata_size_hint: None, } } @@ -1821,6 +1822,7 @@ mod tests { use crate::operations::write::SchemaMode; use crate::writer::test_utils::get_delta_schema; use arrow::array::StructArray; + use arrow::datatypes::DataType; use arrow::datatypes::{Field, Schema}; use chrono::{TimeZone, Utc}; use datafusion::assert_batches_sorted_eq; @@ -1948,6 +1950,7 @@ mod tests { range: None, extensions: None, statistics: None, + metadata_size_hint: None, }; assert_eq!(file.partition_values, ref_file.partition_values) } From b43e54f5f61afa8da509a34bd0ae7d4592a99350 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Sun, 1 Dec 2024 15:27:55 +0000 Subject: [PATCH 2/5] chore: datafusion 44 prototype change on make_array Signed-off-by: R. Tyler Croy --- crates/core/src/delta_datafusion/expr.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/crates/core/src/delta_datafusion/expr.rs b/crates/core/src/delta_datafusion/expr.rs index b633cae141..fe960bdb6d 100644 --- a/crates/core/src/delta_datafusion/expr.rs +++ b/crates/core/src/delta_datafusion/expr.rs @@ -99,14 +99,13 @@ impl ScalarUDFImpl for MakeParquetArray { r_type } - fn invoke(&self, args: &[ColumnarValue]) -> Result { + fn invoke_batch(&self, args: &[ColumnarValue], number_rows: usize) -> Result { let mut data_type = DataType::Null; for arg in args { data_type = arg.data_type(); } - #[allow(deprecated)] - match self.actual.invoke(args)? { + match self.actual.invoke_batch(args, number_rows)? { ColumnarValue::Scalar(ScalarValue::List(df_array)) => { let field = Arc::new(Field::new("element", data_type, true)); let result = Ok(ColumnarValue::Scalar(ScalarValue::List(Arc::new( @@ -127,10 +126,6 @@ impl ScalarUDFImpl for MakeParquetArray { } } - fn invoke_no_args(&self, number_rows: usize) -> Result { - self.actual.invoke_batch(&[], number_rows) - } - fn aliases(&self) -> &[String] { &self.aliases } From 6f4e1296afe833e5e673d4c018bfa7a62eee0f33 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 21 Dec 2024 14:05:20 -0500 Subject: [PATCH 3/5] Pin to pre-release Datafusion --- Cargo.toml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index ff564e8656..a28b9d1c9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,3 +75,17 @@ async-trait = { version = "0.1" } futures = { version = "0.3" } tokio = { version = "1" } num_cpus = { version = "1" } + +# temporary datafusion patches +[patch.crates-io] +datafusion = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-execution = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-ffi = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-functions-aggregate = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-proto = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } +datafusion-sql = { git = "https://github.com/apache/datafusion.git", rev = "a50ed3488f77743a192d9e8dd9c99f00df659ef1" } From ad82822fce573c0875a2f0dc1588c4cb388415b1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 21 Dec 2024 14:06:12 -0500 Subject: [PATCH 4/5] chore: Update to latest sqlparser --- crates/core/Cargo.toml | 2 +- crates/sql/src/parser.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 0797b77b9a..965041bdd0 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -96,7 +96,7 @@ tracing = { workspace = true } rand = "0.8" z85 = "3.0.5" maplit = "1" -sqlparser = { version = "0.52.0" } +sqlparser = { version = "0.53.0" } [dev-dependencies] criterion = "0.5" diff --git a/crates/sql/src/parser.rs b/crates/sql/src/parser.rs index 19bf3f00b0..4844c7fb2c 100644 --- a/crates/sql/src/parser.rs +++ b/crates/sql/src/parser.rs @@ -224,9 +224,9 @@ impl<'a> DeltaParser<'a> { #[cfg(test)] mod tests { - use datafusion_sql::sqlparser::ast::Ident; - use super::*; + use datafusion_sql::sqlparser::ast::Ident; + use datafusion_sql::sqlparser::tokenizer::Span; fn expect_parse_ok(sql: &str, expected: Statement) -> Result<(), ParserError> { let statements = DeltaParser::parse_sql(sql)?; @@ -245,6 +245,7 @@ mod tests { table: ObjectName(vec![Ident { value: "data_table".to_string(), quote_style: None, + span: Span::empty(), }]), retention_hours: None, dry_run: false, @@ -255,6 +256,7 @@ mod tests { table: ObjectName(vec![Ident { value: "data_table".to_string(), quote_style: None, + span: Span::empty(), }]), retention_hours: Some(10), dry_run: false, @@ -265,6 +267,7 @@ mod tests { table: ObjectName(vec![Ident { value: "data_table".to_string(), quote_style: None, + span: Span::empty(), }]), retention_hours: Some(10), dry_run: true, @@ -275,6 +278,7 @@ mod tests { table: ObjectName(vec![Ident { value: "data_table".to_string(), quote_style: None, + span: Span::empty(), }]), retention_hours: None, dry_run: true, From d0787ed7943191ad759130b9bd317dc606f24e2b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 22 Dec 2024 10:38:00 -0500 Subject: [PATCH 5/5] chore: Reduce compiler warnings by updating to use non deprecated DataFusion APIs (#3077) - targets https://github.com/delta-io/delta-rs/pull/3073 from @rtyler This PR makes some small changes to reduce the build warnings by using non deprecated APIs --- crates/catalog-unity/src/error.rs | 3 --- crates/core/src/delta_datafusion/mod.rs | 1 - crates/core/src/operations/optimize.rs | 10 ++++------ crates/sql/src/parser.rs | 4 ++-- crates/test/src/datafusion.rs | 10 +--------- 5 files changed, 7 insertions(+), 21 deletions(-) diff --git a/crates/catalog-unity/src/error.rs b/crates/catalog-unity/src/error.rs index 67610c07fe..a13c3dc401 100644 --- a/crates/catalog-unity/src/error.rs +++ b/crates/catalog-unity/src/error.rs @@ -10,7 +10,6 @@ pub enum UnityCatalogError { }, /// A generic error qualified in the message - #[error("{source}")] Retry { /// Error message @@ -19,7 +18,6 @@ pub enum UnityCatalogError { }, #[error("Request error: {source}")] - /// Error from reqwest library RequestError { /// The underlying reqwest_middleware::Error @@ -35,7 +33,6 @@ pub enum UnityCatalogError { }, /// Error caused by invalid access token value - #[error("Invalid Databricks personal access token")] InvalidAccessToken, } diff --git a/crates/core/src/delta_datafusion/mod.rs b/crates/core/src/delta_datafusion/mod.rs index 2a8c269b5d..f326d7fbc7 100644 --- a/crates/core/src/delta_datafusion/mod.rs +++ b/crates/core/src/delta_datafusion/mod.rs @@ -1822,7 +1822,6 @@ mod tests { use crate::operations::write::SchemaMode; use crate::writer::test_utils::get_delta_schema; use arrow::array::StructArray; - use arrow::datatypes::DataType; use arrow::datatypes::{Field, Schema}; use chrono::{TimeZone, Utc}; use datafusion::assert_batches_sorted_eq; diff --git a/crates/core/src/operations/optimize.rs b/crates/core/src/operations/optimize.rs index fe76a3647d..d756dcb157 100644 --- a/crates/core/src/operations/optimize.rs +++ b/crates/core/src/operations/optimize.rs @@ -1215,10 +1215,7 @@ pub(super) mod zorder { use url::Url; use ::datafusion::{ - execution::{ - memory_pool::FairSpillPool, - runtime_env::{RuntimeConfig, RuntimeEnv}, - }, + execution::{memory_pool::FairSpillPool, runtime_env::RuntimeEnvBuilder}, prelude::{SessionConfig, SessionContext}, }; use arrow_schema::DataType; @@ -1245,8 +1242,9 @@ pub(super) mod zorder { let columns = columns.into(); let memory_pool = FairSpillPool::new(max_spill_size); - let config = RuntimeConfig::new().with_memory_pool(Arc::new(memory_pool)); - let runtime = Arc::new(RuntimeEnv::try_new(config)?); + let runtime = RuntimeEnvBuilder::new() + .with_memory_pool(Arc::new(memory_pool)) + .build_arc()?; runtime.register_object_store(&Url::parse("delta-rs://").unwrap(), object_store); let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), runtime); diff --git a/crates/sql/src/parser.rs b/crates/sql/src/parser.rs index 4844c7fb2c..ccb52a3c3a 100644 --- a/crates/sql/src/parser.rs +++ b/crates/sql/src/parser.rs @@ -5,7 +5,7 @@ use datafusion_sql::parser::{DFParser, Statement as DFStatement}; use datafusion_sql::sqlparser::ast::{ObjectName, Value}; use datafusion_sql::sqlparser::dialect::{keywords::Keyword, Dialect, GenericDialect}; use datafusion_sql::sqlparser::parser::{Parser, ParserError}; -use datafusion_sql::sqlparser::tokenizer::{Token, TokenWithLocation, Tokenizer}; +use datafusion_sql::sqlparser::tokenizer::{Token, TokenWithSpan, Tokenizer}; // Use `Parser::expected` instead, if possible macro_rules! parser_err { @@ -129,7 +129,7 @@ impl<'a> DeltaParser<'a> { } /// Report an unexpected token - fn expected(&self, expected: &str, found: TokenWithLocation) -> Result { + fn expected(&self, expected: &str, found: TokenWithSpan) -> Result { parser_err!(format!("Expected {expected}, found: {found}")) } diff --git a/crates/test/src/datafusion.rs b/crates/test/src/datafusion.rs index 5ca73a742e..602c115bd6 100644 --- a/crates/test/src/datafusion.rs +++ b/crates/test/src/datafusion.rs @@ -1,18 +1,10 @@ use deltalake_core::datafusion::execution::context::SessionContext; -use deltalake_core::datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use deltalake_core::datafusion::execution::session_state::SessionStateBuilder; -use deltalake_core::datafusion::prelude::SessionConfig; use deltalake_core::delta_datafusion::DeltaTableFactory; use std::sync::Arc; pub fn context_with_delta_table_factory() -> SessionContext { - let cfg = RuntimeConfig::new(); - let env = RuntimeEnv::try_new(cfg).unwrap(); - let ses = SessionConfig::new(); - let mut state = SessionStateBuilder::new() - .with_config(ses) - .with_runtime_env(Arc::new(env)) - .build(); + let mut state = SessionStateBuilder::new().build(); state .table_factories_mut() .insert("DELTATABLE".to_string(), Arc::new(DeltaTableFactory {}));