Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: no longer load full table into RAM in write #2265

Closed
wants to merge 24 commits into from
Closed
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions crates/benchmarks/src/bin/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,13 @@ pub async fn convert_tpcds_web_returns(input_path: String, table_path: String) -
.await
.unwrap();

let tbl = table.collect().await.unwrap();
let _schema = tbl[0].schema().clone();

DeltaOps::try_from_uri(table_path)
.await
.unwrap()
.write(table.collect().await.unwrap())
.write(tbl.into())
.with_partition_columns(vec!["wr_returned_date_sk"])
.await
.unwrap();
Expand Down Expand Up @@ -551,7 +554,7 @@ async fn main() {
]));

let batch = RecordBatch::try_new(
schema,
schema.clone(),
vec![
Arc::new(StringArray::from(group_ids)),
Arc::new(StringArray::from(name)),
Expand All @@ -565,7 +568,7 @@ async fn main() {
DeltaOps::try_from_uri(output)
.await
.unwrap()
.write(vec![batch])
.write(batch.into())
.with_save_mode(SaveMode::Append)
.await
.unwrap();
Expand Down
5 changes: 3 additions & 2 deletions crates/core/src/delta_datafusion/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1466,6 +1466,7 @@ impl From<Column> for DeltaColumn {

#[cfg(test)]
mod tests {
use crate::operations::write::WriteData;
use crate::writer::test_utils::get_delta_schema;
use arrow::array::StructArray;
use arrow::datatypes::{DataType, Field, Schema};
Expand Down Expand Up @@ -1756,7 +1757,7 @@ mod tests {
.unwrap();
// write some data
let table = crate::DeltaOps(table)
.write(vec![batch.clone()])
.write(batch.clone().into())
.with_save_mode(crate::protocol::SaveMode::Append)
.await
.unwrap();
Expand Down Expand Up @@ -1820,7 +1821,7 @@ mod tests {
.unwrap();
// write some data
let table = crate::DeltaOps::new_in_memory()
.write(vec![batch.clone()])
.write(batch.clone().into())
.with_save_mode(crate::protocol::SaveMode::Append)
.await
.unwrap();
Expand Down
19 changes: 10 additions & 9 deletions crates/core/src/operations/constraints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ mod tests {
use arrow_schema::{DataType as ArrowDataType, Field, Schema as ArrowSchema};
use datafusion_expr::{col, lit};

use crate::operations::write::WriteData;
use crate::writer::test_utils::{create_bare_table, get_arrow_schema, get_record_batch};
use crate::{DeltaOps, DeltaResult, DeltaTable};

Expand Down Expand Up @@ -271,7 +272,7 @@ mod tests {
async fn add_constraint_with_invalid_data() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;
let table = DeltaOps(write);

Expand All @@ -287,7 +288,7 @@ mod tests {
async fn add_valid_constraint() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;
let table = DeltaOps(write);

Expand All @@ -312,7 +313,7 @@ mod tests {
// Add constraint by providing a datafusion expression.
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;
let table = DeltaOps(write);

Expand Down Expand Up @@ -355,7 +356,7 @@ mod tests {
)
.unwrap();

let table = DeltaOps::new_in_memory().write(vec![batch]).await.unwrap();
let table = DeltaOps::new_in_memory().write(batch.into()).await.unwrap();

let mut table = DeltaOps(table)
.add_constraint()
Expand All @@ -378,7 +379,7 @@ mod tests {
async fn add_conflicting_named_constraint() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;
let table = DeltaOps(write);

Expand All @@ -400,7 +401,7 @@ mod tests {
async fn write_data_that_violates_constraint() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;

let table = DeltaOps(write)
Expand All @@ -414,7 +415,7 @@ mod tests {
Arc::new(StringArray::from(vec!["2021-02-02"])),
];
let batch = RecordBatch::try_new(get_arrow_schema(&None), invalid_values)?;
let err = table.write(vec![batch]).await;
let err = table.write(batch.into()).await;
assert!(err.is_err());
Ok(())
}
Expand All @@ -423,11 +424,11 @@ mod tests {
async fn write_data_that_does_not_violate_constraint() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;
let table = DeltaOps(write);

let err = table.write(vec![batch]).await;
let err = table.write(batch.into()).await;

assert!(err.is_ok());
Ok(())
Expand Down
18 changes: 11 additions & 7 deletions crates/core/src/operations/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ impl std::future::IntoFuture for DeleteBuilder {

#[cfg(test)]
mod tests {
use crate::operations::write::WriteData;
use crate::operations::DeltaOps;
use crate::protocol::*;
use crate::writer::test_utils::datafusion::get_data;
Expand Down Expand Up @@ -401,7 +402,7 @@ mod tests {
.unwrap();
// write some data
let table = DeltaOps(table)
.write(vec![batch.clone()])
.write(batch.into())
.with_save_mode(SaveMode::Append)
.await
.unwrap();
Expand Down Expand Up @@ -463,7 +464,7 @@ mod tests {

// write some data
let table = DeltaOps(table)
.write(vec![batch])
.write(batch.into())
.with_save_mode(SaveMode::Append)
.await
.unwrap();
Expand All @@ -487,7 +488,7 @@ mod tests {

// write some data
let table = DeltaOps(table)
.write(vec![batch])
.write(batch.into())
.with_save_mode(SaveMode::Append)
.await
.unwrap();
Expand Down Expand Up @@ -554,7 +555,7 @@ mod tests {
)
.unwrap();

DeltaOps::new_in_memory().write(vec![batch]).await.unwrap()
DeltaOps::new_in_memory().write(batch.into()).await.unwrap()
}

// Validate behaviour of greater than
Expand Down Expand Up @@ -643,7 +644,7 @@ mod tests {

// write some data
let table = DeltaOps(table)
.write(vec![batch])
.write(batch.into())
.with_save_mode(SaveMode::Append)
.await
.unwrap();
Expand Down Expand Up @@ -701,7 +702,7 @@ mod tests {

// write some data
let table = DeltaOps(table)
.write(vec![batch])
.write(batch.into())
.with_save_mode(SaveMode::Append)
.await
.unwrap();
Expand Down Expand Up @@ -770,7 +771,10 @@ mod tests {
];
let batches = vec![RecordBatch::try_new(schema.clone(), data).unwrap()];

let table = DeltaOps::new_in_memory().write(batches).await.unwrap();
let table = DeltaOps::new_in_memory()
.write(batches.into())
.await
.unwrap();

let (table, _metrics) = DeltaOps(table)
.delete()
Expand Down
7 changes: 4 additions & 3 deletions crates/core/src/operations/drop_constraints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ impl std::future::IntoFuture for DropConstraintBuilder {
#[cfg(feature = "datafusion")]
#[cfg(test)]
mod tests {
use crate::operations::write::WriteData;
use crate::writer::test_utils::{create_bare_table, get_record_batch};
use crate::{DeltaOps, DeltaResult, DeltaTable};

Expand All @@ -146,7 +147,7 @@ mod tests {
async fn drop_valid_constraint() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;
let table = DeltaOps(write);

Expand All @@ -170,7 +171,7 @@ mod tests {
async fn drop_invalid_constraint_not_existing() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;

let table = DeltaOps(write)
Expand All @@ -186,7 +187,7 @@ mod tests {
async fn drop_invalid_constraint_ignore() -> DeltaResult<()> {
let batch = get_record_batch(None, false);
let write = DeltaOps(create_bare_table())
.write(vec![batch.clone()])
.write(batch.clone().into())
.await?;

let version = write.version();
Expand Down
9 changes: 7 additions & 2 deletions crates/core/src/operations/load.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ impl std::future::IntoFuture for LoadBuilder {

#[cfg(test)]
mod tests {
use crate::operations::write::WriteData;
use crate::operations::{collect_sendable_stream, DeltaOps};
use crate::writer::test_utils::{get_record_batch, TestResult};
use crate::DeltaTableBuilder;
Expand Down Expand Up @@ -114,7 +115,9 @@ mod tests {
#[tokio::test]
async fn test_write_load() -> TestResult {
let batch = get_record_batch(None, false);
let table = DeltaOps::new_in_memory().write(vec![batch.clone()]).await?;
let table = DeltaOps::new_in_memory()
.write(batch.clone().into())
.await?;

let (_table, stream) = DeltaOps(table).load().await?;
let data = collect_sendable_stream(stream).await?;
Expand Down Expand Up @@ -145,7 +148,9 @@ mod tests {
#[tokio::test]
async fn test_load_with_columns() -> TestResult {
let batch = get_record_batch(None, false);
let table = DeltaOps::new_in_memory().write(vec![batch.clone()]).await?;
let table = DeltaOps::new_in_memory()
.write(batch.clone().into())
.await?;

let (_table, stream) = DeltaOps(table).load().with_columns(["id", "value"]).await?;
let data = collect_sendable_stream(stream).await?;
Expand Down
3 changes: 2 additions & 1 deletion crates/core/src/operations/merge/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1528,6 +1528,7 @@ mod tests {
use crate::kernel::StructField;
use crate::operations::merge::generalize_filter;
use crate::operations::merge::try_construct_early_filter;
use crate::operations::write::WriteData;
use crate::operations::DeltaOps;
use crate::protocol::*;
use crate::writer::test_utils::datafusion::get_data;
Expand Down Expand Up @@ -1609,7 +1610,7 @@ mod tests {
.unwrap();
// write some data
DeltaOps(table)
.write(vec![batch.clone()])
.write(batch.clone().into())
.with_save_mode(SaveMode::Append)
.await
.unwrap()
Expand Down
7 changes: 3 additions & 4 deletions crates/core/src/operations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ use self::{
};
#[cfg(feature = "datafusion")]
pub use ::datafusion::physical_plan::common::collect as collect_sendable_stream;
#[cfg(feature = "datafusion")]
use arrow::record_batch::RecordBatch;

use optimize::OptimizeBuilder;
use restore::RestoreBuilder;

Expand Down Expand Up @@ -137,8 +136,8 @@ impl DeltaOps {
/// Write data to Delta table
#[cfg(feature = "datafusion")]
#[must_use]
pub fn write(self, batches: impl IntoIterator<Item = RecordBatch>) -> WriteBuilder {
WriteBuilder::new(self.0.log_store, self.0.state).with_input_batches(batches)
pub fn write(self, data: write::WriteData) -> WriteBuilder {
WriteBuilder::new(self.0.log_store, self.0.state).with_data(data)
}

/// Vacuum stale files from delta table
Expand Down
4 changes: 3 additions & 1 deletion crates/core/src/operations/optimize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1233,6 +1233,8 @@ pub(super) mod zorder {

#[cfg(test)]
mod tests {
use crate::operations::write::WriteData;

use super::*;
use ::datafusion::assert_batches_eq;
use arrow_array::{Int32Array, StringArray};
Expand Down Expand Up @@ -1356,7 +1358,7 @@ pub(super) mod zorder {
.unwrap();
// write some data
let table = crate::DeltaOps::new_in_memory()
.write(vec![batch.clone()])
.write(batch.clone().into())
.with_save_mode(crate::protocol::SaveMode::Append)
.await
.unwrap();
Expand Down
3 changes: 2 additions & 1 deletion crates/core/src/operations/update.rs
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ mod tests {
use crate::kernel::PrimitiveType;
use crate::kernel::StructField;
use crate::kernel::StructType;
use crate::operations::write::WriteData;
use crate::operations::DeltaOps;
use crate::writer::test_utils::datafusion::get_data;
use crate::writer::test_utils::datafusion::write_batch;
Expand Down Expand Up @@ -525,7 +526,7 @@ mod tests {
)
.unwrap();

DeltaOps::new_in_memory().write(vec![batch]).await.unwrap()
DeltaOps::new_in_memory().write(batch.into()).await.unwrap()
}

#[tokio::test]
Expand Down
Loading
Loading