From a1a33ce08ac6d0842efeccc0cc7bf9a8d4bb0482 Mon Sep 17 00:00:00 2001
From: "R. Tyler Croy"
Date: Fri, 20 Sep 2024 12:43:22 +0000
Subject: [PATCH] fix: adopt the right array item name which changed in kernel 0.3.1

see delta-incubator/delta-kernel-rs#301
---
 crates/core/src/writer/stats.rs | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/crates/core/src/writer/stats.rs b/crates/core/src/writer/stats.rs
index eee28b8be8..c1f0363083 100644
--- a/crates/core/src/writer/stats.rs
+++ b/crates/core/src/writer/stats.rs
@@ -474,6 +474,10 @@ impl AddAssign for AggregatedStats {
 /// the list and items fields from the path, but also need to handle the
 /// peculiar case where the user named the list field "list" or "item".
 ///
+/// NOTE: As of delta_kernel 0.3.1 the name switched from `item` to `element` to line up with the
+/// parquet spec, see
+/// [here](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists)
+///
 /// For example:
 ///
 /// * ["some_nested_list", "list", "item", "list", "item"] -> "some_nested_list"
@@ -495,9 +499,9 @@ fn get_list_field_name(column_descr: &Arc) -> Option {
     while let Some(part) = column_path_parts.pop() {
         match (part.as_str(), lists_seen, items_seen) {
             ("list", seen, _) if seen == max_rep_levels => return Some("list".to_string()),
-            ("item", _, seen) if seen == max_rep_levels => return Some("item".to_string()),
+            ("element", _, seen) if seen == max_rep_levels => return Some("element".to_string()),
             ("list", _, _) => lists_seen += 1,
-            ("item", _, _) => items_seen += 1,
+            ("element", _, _) => items_seen += 1,
             (other, _, _) => return Some(other.to_string()),
         }
     }
@@ -789,9 +793,21 @@ mod tests {
         let mut null_count_keys = vec!["some_list", "some_nested_list"];
         null_count_keys.extend_from_slice(min_max_keys.as_slice());
 
-        assert_eq!(min_max_keys.len(), stats.min_values.len());
-        assert_eq!(min_max_keys.len(), stats.max_values.len());
-        assert_eq!(null_count_keys.len(), stats.null_count.len());
+        assert_eq!(
+            min_max_keys.len(),
+            stats.min_values.len(),
+            "min values don't match"
+        );
+        assert_eq!(
+            min_max_keys.len(),
+            stats.max_values.len(),
+            "max values don't match"
+        );
+        assert_eq!(
+            null_count_keys.len(),
+            stats.null_count.len(),
+            "null counts don't match"
+        );
 
         // assert on min values
         for (k, v) in stats.min_values.iter() {
@@ -820,7 +836,7 @@ mod tests {
                 ("uuid", ColumnValueStat::Value(v)) => {
                     assert_eq!("176c770d-92af-4a21-bf76-5d8c5261d659", v.as_str().unwrap())
                 }
-                _ => panic!("Key should not be present"),
+                k => panic!("Key {k:?} should not be present in min_values"),
             }
         }
 
@@ -851,7 +867,7 @@ mod tests {
                 ("uuid", ColumnValueStat::Value(v)) => {
                     assert_eq!("a98bea04-d119-4f21-8edc-eb218b5849af", v.as_str().unwrap())
                 }
-                _ => panic!("Key should not be present"),
+                k => panic!("Key {k:?} should not be present in max_values"),
             }
         }
 
@@ -878,7 +894,7 @@ mod tests {
                 ("some_nested_list", ColumnCountStat::Value(v)) => assert_eq!(100, *v),
                 ("date", ColumnCountStat::Value(v)) => assert_eq!(0, *v),
                 ("uuid", ColumnCountStat::Value(v)) => assert_eq!(0, *v),
-                _ => panic!("Key should not be present"),
+                k => panic!("Key {k:?} should not be present in null_count"),
             }
         }
     }