Skip to content

Commit

Permalink
fix: prevent empty stats struct during parquet write (#2125)
Browse files Browse the repository at this point in the history
# Description
When building the Arrow schema for Delta checkpoints, max/min stats are not
collected for List, Map, and Binary fields. If you have a Struct column whose
only fields are List, Map, or Binary fields, then the Arrow schema gets an
empty Struct. The Parquet writer then fails with this error:

```
ParquetParseError { source: ArrowError("Parquet does not support writing empty structs") }
```
  • Loading branch information
alexwilcoxson-rel authored Jan 28, 2024
1 parent 7fbc02b commit 0f6790f
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
12 changes: 7 additions & 5 deletions crates/deltalake-core/src/kernel/arrow/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -564,11 +564,13 @@ fn max_min_schema_for_fields(dest: &mut Vec<ArrowField>, f: &ArrowField) {
max_min_schema_for_fields(&mut child_dest, f);
}

dest.push(ArrowField::new(
f.name(),
ArrowDataType::Struct(child_dest.into()),
true,
));
if !child_dest.is_empty() {
dest.push(ArrowField::new(
f.name(),
ArrowDataType::Struct(child_dest.into()),
true,
));
}
}
// don't compute min or max for list, map or binary types
ArrowDataType::List(_) | ArrowDataType::Map(_, _) | ArrowDataType::Binary => { /* noop */ }
Expand Down
40 changes: 39 additions & 1 deletion crates/deltalake-core/src/protocol/checkpoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,8 @@ fn apply_stats_conversion(
mod tests {
use std::sync::Arc;

use arrow_array::{ArrayRef, RecordBatch};
use arrow_array::builder::{Int32Builder, ListBuilder, StructBuilder};
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_schema::Schema as ArrowSchema;
use chrono::Duration;
use lazy_static::lazy_static;
Expand Down Expand Up @@ -903,6 +904,43 @@ mod tests {
);
}

/// Regression test: checkpointing a table that has a struct column whose only
/// child is a list must succeed.
#[tokio::test]
async fn test_struct_with_single_list_field() {
    // Schema of the struct's sole child: a nullable list of nullable Int32 items.
    let list_field = arrow_schema::Field::new(
        "list_in_struct",
        arrow_schema::DataType::List(Arc::new(arrow_schema::Field::new(
            "item",
            arrow_schema::DataType::Int32,
            true,
        ))),
        true,
    );

    // One list entry holding the single value `1`.
    let mut values = Int32Builder::new();
    values.append_value(1);
    let mut lists = ListBuilder::new(values);
    lists.append(true);

    // One struct row wrapping that list.
    let mut structs = StructBuilder::new(vec![list_field], vec![Box::new(lists)]);
    structs.append(true);
    let struct_column: ArrayRef = Arc::new(structs.finish());

    // A second, plain column is required: without it the entire stats struct
    // would be empty, which also fails the parquet write during checkpoint.
    let plain_column: ArrayRef = Arc::new(Int32Array::from(vec![1]));

    let columns = vec![
        ("other_column", plain_column),
        ("struct_with_list", struct_column),
    ];
    let batch = RecordBatch::try_from_iter(columns).unwrap();
    let table = DeltaOps::new_in_memory().write(vec![batch]).await.unwrap();

    create_checkpoint(&table).await.unwrap();
}

lazy_static! {
static ref SCHEMA: Value = json!({
"type": "struct",
Expand Down

0 comments on commit 0f6790f

Please sign in to comment.