
Commit 369e04d

Enable V2 streaming for DV tables
This PR adds V2 connector support for streaming reads on tables with deletion vectors:

1. Fix getSnapshotFiles() to include all files in the initial snapshot
   - Use Scan.getScanFiles() directly instead of StreamingHelper.getDataChangeAdd()
   - Initial snapshot should include all files regardless of the dataChange flag
2. Fix DV file path issue by using Path.toString() instead of toUri().toString()
   - Avoids URL encoding issues that caused FileNotFoundException
3. Add DeltaSourceV2DeletionVectorsSuite for V2 streaming DV tests
   - Tests use loadStreamWithOptions() for V2 connector routing
   - Override executeSql() to use V1 for write operations (DELETE not yet supported in V2)
4. Update DeltaSourceDeletionVectorTests to support both V1 and V2 connectors
   - Add DeltaSourceConnectorTrait for connector abstraction
   - Add executeSql() hook for the V2 write operation workaround
1 parent c9c61a2 commit 369e04d
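
Note on item 4: DeltaSourceConnectorTrait itself is not among the diffs shown below; only its call sites are. A minimal sketch of the abstraction, assuming only the loadStreamWithOptions and useDsv2 names that the updated tests call (everything else here is an assumption, not the actual trait):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.test.SharedSparkSession

trait DeltaSourceConnectorTrait { self: SharedSparkSession =>

  /** Whether streaming reads should go through the V2 (DSv2) connector. */
  protected def useDsv2: Boolean = false

  /** Build a streaming DataFrame for `path`, routed through V1 or V2. */
  protected def loadStreamWithOptions(
      path: String,
      options: Map[String, String]): DataFrame = {
    if (useDsv2) {
      // Placeholder: the real trait presumably flips whatever switch enables the
      // V2 read path (for example a DeltaSQLConf setting); that mechanism is not
      // shown in this commit.
    }
    spark.readStream.format("delta").options(options).load(path)
  }
}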

File tree

5 files changed (+141, -36 lines)
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+/*
+ * Copyright (2021) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.delta.test
+
+import org.apache.spark.sql.delta.{DeltaSourceDeletionVectorTests, DeltaSourceSuiteBase, PersistentDVEnabled}
+import org.apache.spark.sql.delta.sources.DeltaSQLConf
+
+/**
+ * Test suite that runs DeltaSourceDeletionVectorTests using the V2 connector.
+ */
+class DeltaSourceV2DeletionVectorsSuite extends DeltaSourceSuiteBase
+  with DeltaSQLCommandTest
+  with DeltaSourceDeletionVectorTests
+  with PersistentDVEnabled
+  with V2ForceTest {
+
+  override protected def useDsv2: Boolean = true
+
+  // Override executeSql to use V1 connector for write operations (DELETE/INSERT)
+  // V2 connector doesn't support write operations yet
+  override protected def executeSql(sqlText: String): Unit = {
+    withSQLConf(DeltaSQLConf.V2_ENABLE_MODE.key -> "NONE") {
+      sql(sqlText)
+    }
+  }
+
+  private lazy val shouldPassTests = Set(
+    "allow to delete files before starting a streaming query",
+    "allow to delete files before staring a streaming query without checkpoint",
+    "multiple deletion vectors per file with initial snapshot"
+  )
+
+  private lazy val shouldFailTests = Set(
+    // These tests use ignoreDeletes/ignoreChanges options not yet supported in V2
+    "deleting files fails query if ignoreDeletes = false",
+    "allow to delete files after staring a streaming query when ignoreFileDeletion is true",
+    "allow to delete files after staring a streaming query when ignoreDeletes is true",
+    "updating the source table causes failure when ignoreChanges = false - using DELETE",
+    "allow to update the source table when ignoreChanges = true - using DELETE",
+    "deleting files when ignoreChanges = true doesn't fail the query",
+    "updating source table when ignoreDeletes = true fails the query - using DELETE",
+    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE - List()",
+    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE" +
+      " - List((ignoreDeletes,true))",
+    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE" +
+      " - List((ignoreChanges,true))",
+    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE" +
+      " - List((skipChangeCommits,true))",
+    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE - List()",
+    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE" +
+      " - List((ignoreDeletes,true))",
+    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE" +
+      " - List((ignoreChanges,true))",
+    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE" +
+      " - List((skipChangeCommits,true))",
+    "multiple deletion vectors per file - List((ignoreFileDeletion,true))",
+    "multiple deletion vectors per file - List((ignoreChanges,true))"
+  )
+
+  override protected def shouldFail(testName: String): Boolean = {
+    val inPassList = shouldPassTests.contains(testName)
+    val inFailList = shouldFailTests.contains(testName)
+
+    assert(inPassList || inFailList, s"Test '$testName' not in shouldPassTests or shouldFailTests")
+    assert(!(inPassList && inFailList),
+      s"Test '$testName' in both shouldPassTests and shouldFailTests")
+
+    inFailList
+  }
+}
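
The diffs in this commit do not show how the shared trait consumes shouldFail(). One plausible shape, assuming a hypothetical testWithConnector wrapper (a sketch only, not the actual implementation), is:

  // Hypothetical sketch: how a shared trait *could* route test registration
  // through the shouldFail() hook. None of this wrapper appears in the diff.
  protected def testWithConnector(testName: String)(body: => Unit): Unit = {
    test(testName) {
      if (shouldFail(testName)) {
        // Expected to fail on V2 until ignoreDeletes/ignoreChanges are supported there.
        intercept[Throwable] { body }
      } else {
        body
      }
    }
  }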

spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceDeletionVectorsSuite.scala

Lines changed: 23 additions & 27 deletions
@@ -21,7 +21,7 @@ import java.io.File
 import scala.util.control.NonFatal
 
 import org.apache.spark.sql.delta.Relocated.StreamExecution
-import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+import org.apache.spark.sql.delta.test.{DeltaSQLCommandTest, DeltaSQLTestUtils}
 import org.apache.hadoop.fs.Path
 import org.scalatest.concurrent.Eventually
 import org.scalatest.concurrent.PatienceConfiguration.Timeout
@@ -30,28 +30,31 @@ import org.apache.spark.sql.streaming.{StreamTest, Trigger}
 import org.apache.spark.sql.streaming.util.StreamManualClock
 
 trait DeltaSourceDeletionVectorTests extends StreamTest
-  with DeletionVectorsTestUtils {
+  with DeletionVectorsTestUtils
+  with DeltaSourceConnectorTrait {
+  self: DeltaSQLTestUtils =>
 
   import testImplicits._
 
+  /** Execute SQL statement. Override in V2 tests to use V1 connector for write operations. */
+  protected def executeSql(sqlText: String): Unit = sql(sqlText)
+
   test("allow to delete files before starting a streaming query") {
     withTempDir { inputDir =>
       val deltaLog = DeltaLog.forTable(spark, new Path(inputDir.toURI))
       (0 until 5).foreach { i =>
         val v = Seq(i.toString).toDF
         v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
       }
-      sql(s"DELETE FROM delta.`$inputDir`")
+      executeSql(s"DELETE FROM delta.`$inputDir`")
       (5 until 10).foreach { i =>
         val v = Seq(i.toString).toDF
         v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
       }
       deltaLog.checkpoint()
       assert(deltaLog.readLastCheckpointFile().nonEmpty, "this test requires a checkpoint")
 
-      val df = spark.readStream
-        .format("delta")
-        .load(inputDir.getCanonicalPath)
+      val df = loadStreamWithOptions(inputDir.getCanonicalPath, Map.empty)
 
       testStream(df)(
         AssertOnQuery { q =>
@@ -69,16 +72,14 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
         val v = Seq(i.toString).toDF
         v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
       }
-      sql(s"DELETE FROM delta.`$inputDir`")
+      executeSql(s"DELETE FROM delta.`$inputDir`")
       (5 until 7).foreach { i =>
         val v = Seq(i.toString).toDF
         v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
       }
       assert(deltaLog.readLastCheckpointFile().isEmpty, "this test requires no checkpoint")
 
-      val df = spark.readStream
-        .format("delta")
-        .load(inputDir.getCanonicalPath)
+      val df = loadStreamWithOptions(inputDir.getCanonicalPath, Map.empty)
 
       testStream(df)(
         AssertOnQuery { q =>
@@ -115,7 +116,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
      Seq(i, i + 1).toDF().coalesce(1).write.format("delta").mode("append").save(inputDir)
    }
 
-    val df = spark.readStream.format("delta").options(sourceOptions.toMap).load(inputDir)
+    val df = loadStreamWithOptions(inputDir, sourceOptions.toMap)
    val expectDVs = commandShouldProduceDVs.getOrElse(
      sqlCommand.toUpperCase().startsWith("DELETE"))
 
@@ -126,7 +127,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
      },
      CheckAnswer((0 until 10): _*),
      AssertOnQuery { q =>
-        sql(sqlCommand)
+        executeSql(sqlCommand)
        deletionVectorsPresentIfExpected(inputDir, expectDVs)
      })
 
@@ -148,7 +149,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
    }
    val log = DeltaLog.forTable(spark, inputDir)
    val commitVersionBeforeDML = log.update().version
-    val df = spark.readStream.format("delta").options(sourceOptions.toMap).load(inputDir)
+    val df = loadStreamWithOptions(inputDir, sourceOptions.toMap)
    def expectDVsInCommand(shouldProduceDVs: Option[Boolean], command: String): Boolean = {
      shouldProduceDVs.getOrElse(command.toUpperCase().startsWith("DELETE"))
    }
@@ -177,11 +178,11 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
        true
      },
      AssertOnQuery { q =>
-        sql(sqlCommand1)
+        executeSql(sqlCommand1)
        deletionVectorsPresentIfExpected(inputDir, expectDVsInCommand1)
      },
      AssertOnQuery { q =>
-        sql(sqlCommand2)
+        executeSql(sqlCommand2)
        deletionVectorsPresentIfExpected(inputDir, expectDVsInCommand2)
      },
      AssertOnQuery { q =>
@@ -416,21 +417,19 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
      (0 until 10).toDF("value").coalesce(1).write.format("delta").save(path)
 
      // V1: Delete row 0
-      sql(s"DELETE FROM delta.`$path` WHERE value = 0")
+      executeSql(s"DELETE FROM delta.`$path` WHERE value = 0")
 
      // V2: Delete row 1
-      sql(s"DELETE FROM delta.`$path` WHERE value = 1")
+      executeSql(s"DELETE FROM delta.`$path` WHERE value = 1")
 
      // V3: Delete row 2
-      sql(s"DELETE FROM delta.`$path` WHERE value = 2")
+      executeSql(s"DELETE FROM delta.`$path` WHERE value = 2")
 
      // Verify DVs are present
      assert(getFilesWithDeletionVectors(deltaLog).nonEmpty,
        "This test requires deletion vectors to be present")
 
-      val df = spark.readStream
-        .format("delta")
-        .load(path)
+      val df = loadStreamWithOptions(path, Map.empty)
 
      testStream(df)(
        // Process the initial snapshot
@@ -457,10 +456,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
      // V0: 10 rows in a single file
      (0 until 10).toDF("value").coalesce(1).write.format("delta").save(path)
 
-      val df = spark.readStream
-        .format("delta")
-        .options(sourceOptions.toMap)
-        .load(path)
+      val df = loadStreamWithOptions(path, sourceOptions.toMap)
 
      testStream(df)(
        AssertOnQuery { q =>
@@ -470,12 +466,12 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
      CheckAnswer((0 until 10): _*),
      AssertOnQuery { q =>
        // V1: Delete row 0 - creates first DV (version 1)
-        sql(s"DELETE FROM delta.`$path` WHERE value = 0")
+        executeSql(s"DELETE FROM delta.`$path` WHERE value = 0")
        true
      },
      AssertOnQuery { q =>
        // V2: Delete row 1 - updates DV (version 2). DV is cumulative: {0, 1}
-        sql(s"DELETE FROM delta.`$path` WHERE value = 1")
+        executeSql(s"DELETE FROM delta.`$path` WHERE value = 1")
        true
      },
      AssertOnQuery { q =>

spark/v2/src/main/java/io/delta/spark/internal/v2/read/SparkMicroBatchStream.java

Lines changed: 13 additions & 4 deletions
@@ -24,6 +24,7 @@
 import io.delta.kernel.Snapshot;
 import io.delta.kernel.data.ColumnarBatch;
 import io.delta.kernel.data.FilteredColumnarBatch;
+import io.delta.kernel.data.Row;
 import io.delta.kernel.defaults.engine.DefaultEngine;
 import io.delta.kernel.engine.Engine;
 import io.delta.kernel.exceptions.UnsupportedTableFeatureException;
@@ -439,7 +440,9 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) {
     Seq<FilePartition> filePartitions =
         FilePartition$.MODULE$.getFilePartitions(
             spark, JavaConverters.asScalaBuffer(partitionedFiles).toSeq(), maxSplitBytes);
-    return JavaConverters.seqAsJavaList(filePartitions).toArray(new InputPartition[0]);
+    InputPartition[] result =
+        JavaConverters.seqAsJavaList(filePartitions).toArray(new InputPartition[0]);
+    return result;
   }
 
   @Override
@@ -982,9 +985,15 @@ private CloseableIterator<IndexedFile> getSnapshotFiles(long version) {
     try (CloseableIterator<FilteredColumnarBatch> filesIter = scan.getScanFiles(engine)) {
       while (filesIter.hasNext()) {
         FilteredColumnarBatch filteredBatch = filesIter.next();
-        ColumnarBatch batch = filteredBatch.getData();
-        for (int rowId = 0; rowId < batch.getSize(); rowId++) {
-          StreamingHelper.getDataChangeAdd(batch, rowId).ifPresent(addFiles::add);
+        // getScanFiles returns rows with schema {add: struct, tableRoot: string}
+        // Extract AddFile directly from each row
+        try (CloseableIterator<Row> rowIter = filteredBatch.getRows()) {
+          while (rowIter.hasNext()) {
+            Row scanFileRow = rowIter.next();
+            // addFile struct is at index 0 in scan file schema
+            Row addFileRow = scanFileRow.getStruct(0);
+            addFiles.add(new AddFile(addFileRow));
+          }
         }
       }
     } catch (IOException e) {
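
For context on the getSnapshotFiles() change above: the commit message notes that the initial snapshot must include all files regardless of the dataChange flag. A minimal, hedged illustration of why (Scala, assuming a Spark session with Delta enabled; the path is hypothetical): OPTIMIZE re-adds surviving files with dataChange = false, so a dataChange-filtered listing would silently drop them.

import org.apache.spark.sql.functions.col

val path = "/tmp/dv_streaming_demo"  // hypothetical location
spark.range(0, 100).repartition(4).write.format("delta").save(path)
spark.sql(s"OPTIMIZE delta.`$path`")

// Inspect the AddFile actions of the OPTIMIZE commit (version 1 of this table):
spark.read.json(s"$path/_delta_log/00000000000000000001.json")
  .where(col("add").isNotNull)
  .select("add.path", "add.dataChange")
  .show(truncate = false)  // dataChange is false for the compacted file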

spark/v2/src/main/java/io/delta/spark/internal/v2/utils/PartitionUtils.java

Lines changed: 3 additions & 1 deletion
@@ -193,7 +193,9 @@ public static PartitionReaderFactory createDeltaParquetReaderFactory(
     SnapshotImpl snapshotImpl = (SnapshotImpl) snapshot;
     Protocol protocol = snapshotImpl.getProtocol();
     Metadata metadata = snapshotImpl.getMetadata();
-    String tablePath = snapshotImpl.getDataPath().toUri().toString();
+    // Use Path.toString() instead of toUri().toString() to avoid URL encoding issues
+    // This matches V1 connector behavior in PreprocessTableWithDVs.scala
+    String tablePath = snapshotImpl.getDataPath().toString();
 
     // Create DV schema context if table supports deletion vectors
     boolean isTableSupportDv =
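
A quick illustration of the encoding difference the comment refers to (Scala, using org.apache.hadoop.fs.Path; the path with a space is just an example):

import org.apache.hadoop.fs.Path

val p = new Path("/warehouse/my table/part-00000.parquet")
p.toString        // /warehouse/my table/part-00000.parquet
p.toUri.toString  // /warehouse/my%20table/part-00000.parquet
// If the URI-encoded form is later treated as a literal filesystem path, "%20" never
// matches the real directory name, which is how the FileNotFoundException mentioned
// in the commit message can surface.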

spark/v2/src/test/java/io/delta/spark/internal/v2/read/deletionvector/DvSchemaContextTest.java

Lines changed: 18 additions & 4 deletions
@@ -20,13 +20,15 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.spark.sql.delta.DeltaParquetFileFormat;
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 
 public class DvSchemaContextTest {
 
@@ -36,8 +38,12 @@ public class DvSchemaContextTest {
   private static final StructType PARTITION_SCHEMA =
       new StructType().add("date", DataTypes.StringType);
 
+  static Stream<Arguments> schemaWithDvColumnArgs() {
+    return Stream.of(Arguments.of(false, 3, 2), Arguments.of(true, 4, 3));
+  }
+
   @ParameterizedTest(name = "useMetadataRowIndex={0}")
-  @CsvSource({"false, 3, 2", "true, 4, 3"})
+  @MethodSource("schemaWithDvColumnArgs")
   void testSchemaWithDvColumn(
       boolean useMetadataRowIndex, int expectedFieldCount, int expectedDvIndex) {
     DvSchemaContext context =
@@ -57,16 +63,24 @@ void testSchemaWithDvColumn(
         schemaWithDv.fields()[expectedDvIndex].name());
   }
 
+  static Stream<Arguments> inputColumnCountArgs() {
+    return Stream.of(Arguments.of(false, 4), Arguments.of(true, 5));
+  }
+
   @ParameterizedTest(name = "useMetadataRowIndex={0}")
-  @CsvSource({"false, 4", "true, 5"})
+  @MethodSource("inputColumnCountArgs")
   void testInputColumnCount(boolean useMetadataRowIndex, int expectedCount) {
     DvSchemaContext context =
         new DvSchemaContext(DATA_SCHEMA, PARTITION_SCHEMA, useMetadataRowIndex);
     assertEquals(expectedCount, context.getInputColumnCount());
   }
 
+  static Stream<Arguments> outputColumnOrdinalsArgs() {
+    return Stream.of(Arguments.of(false, "0,1,3"), Arguments.of(true, "0,1,4"));
+  }
+
   @ParameterizedTest(name = "useMetadataRowIndex={0}")
-  @CsvSource({"false, '0,1,3'", "true, '0,1,4'"})
+  @MethodSource("outputColumnOrdinalsArgs")
   void testOutputColumnOrdinals(boolean useMetadataRowIndex, String expectedOrdinalsStr) {
     DvSchemaContext context =
         new DvSchemaContext(DATA_SCHEMA, PARTITION_SCHEMA, useMetadataRowIndex);
