From 5075d7d1f0aee3eac8518d74a5e9f6226ce5a0d2 Mon Sep 17 00:00:00 2001
From: Yan Feng
Date: Thu, 21 Nov 2024 11:05:31 +0800
Subject: [PATCH 1/2] Add a test for ORC write with more than one stripe

Signed-off-by: Yan Feng
---
 .../src/main/python/orc_write_test.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py
index ddb69524ac4..d4af4b19636 100644
--- a/integration_tests/src/main/python/orc_write_test.py
+++ b/integration_tests/src/main/python/orc_write_test.py
@@ -91,6 +91,20 @@ def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
         data_path,
         conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
 
+@pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
+@pytest.mark.parametrize('orc_impl', ["native", "hive"])
+@allow_non_gpu(*non_utc_allow)
+def test_write_more_than_one_stripe_round_trip(spark_tmp_path, orc_gens, orc_impl):
+    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
+    data_path = spark_tmp_path + '/ORC_DATA'
+    assert_gpu_and_cpu_writes_are_equal_collect(
+        # Generate a large enough dataframe to produce more than one stripe(typically 64MB)
+        # Preferably use only one partition to avoid splitting the data
+        lambda spark, path: gen_df(spark, gen_list, 12800, num_slices=1).write.orc(path),
+        lambda spark, path: spark.read.orc(path),
+        data_path,
+        conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
+
 @pytest.mark.parametrize('orc_gen', orc_write_odd_empty_strings_gens_sample, ids=idfn)
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl):

From d51c3ad976a0d380999e714f535e958ed6bd7e33 Mon Sep 17 00:00:00 2001
From: Yan Feng
Date: Thu, 21 Nov 2024 17:41:31 +0800
Subject: [PATCH 2/2] Mark the test case as xfail due to known issue #11736

Signed-off-by: Yan Feng
---
 integration_tests/src/main/python/orc_write_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py
index d4af4b19636..c830d585693 100644
--- a/integration_tests/src/main/python/orc_write_test.py
+++ b/integration_tests/src/main/python/orc_write_test.py
@@ -91,14 +91,14 @@ def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
         data_path,
         conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
 
-@pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
+@pytest.mark.parametrize('orc_gen', [pytest.param(boolean_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11736'))], ids=idfn)
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 @allow_non_gpu(*non_utc_allow)
-def test_write_more_than_one_stripe_round_trip(spark_tmp_path, orc_gens, orc_impl):
-    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
+def test_write_more_than_one_stripe_round_trip(spark_tmp_path, orc_gen, orc_impl):
+    gen_list = [('_c0', orc_gen)]
     data_path = spark_tmp_path + '/ORC_DATA'
     assert_gpu_and_cpu_writes_are_equal_collect(
-        # Generate a large enough dataframe to produce more than one stripe(typically 64MB)
+        # Generate a large enough dataframe to produce more than one stripe
         # Preferably use only one partition to avoid splitting the data
         lambda spark, path: gen_df(spark, gen_list, 12800, num_slices=1).write.orc(path),
         lambda spark, path: spark.read.orc(path),
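
The premise of the new test is that 12800 generated rows written through a single partition are large enough to roll over into more than one ORC stripe. A quick way to sanity-check that premise locally is to count the stripes in the written files with the ORC Java reader that Spark already bundles, reached through the py4j gateway. The sketch below is illustrative only and not part of the patch: the helper name count_orc_stripes, the glob pattern, and the assumption that the output directory is on a local filesystem are all placeholders, not names taken from the test suite.

import glob

def count_orc_stripes(spark, orc_dir):
    # Count stripes per ORC part file using org.apache.orc.OrcFile through Spark's JVM gateway.
    # Assumes orc_dir is a local directory containing the files written by the test run.
    jvm = spark._jvm
    hadoop_conf = spark._jsc.hadoopConfiguration()
    counts = {}
    for f in glob.glob(orc_dir + '/part-*.orc'):
        reader = jvm.org.apache.orc.OrcFile.createReader(
            jvm.org.apache.hadoop.fs.Path(f),
            jvm.org.apache.orc.OrcFile.readerOptions(hadoop_conf))
        counts[f] = reader.getStripes().size()
    return counts

# Example usage after a run of the test (the path is a placeholder for the
# ORC_DATA directory the test writes under spark_tmp_path):
#   stripe_counts = count_orc_stripes(spark, '/tmp/ORC_DATA')
#   assert all(n > 1 for n in stripe_counts.values())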