From 5075d7d1f0aee3eac8518d74a5e9f6226ce5a0d2 Mon Sep 17 00:00:00 2001
From: Yan Feng
Date: Thu, 21 Nov 2024 11:05:31 +0800
Subject: [PATCH 1/2] Add a test for ORC write with more than one stripe

Signed-off-by: Yan Feng
---
 .../src/main/python/orc_write_test.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py
index ddb69524ac4..d4af4b19636 100644
--- a/integration_tests/src/main/python/orc_write_test.py
+++ b/integration_tests/src/main/python/orc_write_test.py
@@ -91,6 +91,20 @@ def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
         data_path,
         conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
 
+@pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
+@pytest.mark.parametrize('orc_impl', ["native", "hive"])
+@allow_non_gpu(*non_utc_allow)
+def test_write_more_than_one_stripe_round_trip(spark_tmp_path, orc_gens, orc_impl):
+    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
+    data_path = spark_tmp_path + '/ORC_DATA'
+    assert_gpu_and_cpu_writes_are_equal_collect(
+        # Generate a large enough dataframe to produce more than one stripe(typically 64MB)
+        # Preferably use only one partition to avoid splitting the data
+        lambda spark, path: gen_df(spark, gen_list, 12800, num_slices=1).write.orc(path),
+        lambda spark, path: spark.read.orc(path),
+        data_path,
+        conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
+
 @pytest.mark.parametrize('orc_gen', orc_write_odd_empty_strings_gens_sample, ids=idfn)
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl):

From d51c3ad976a0d380999e714f535e958ed6bd7e33 Mon Sep 17 00:00:00 2001
From: Yan Feng
Date: Thu, 21 Nov 2024 17:41:31 +0800
Subject: [PATCH 2/2] Mark the test case as xfail due to known issue #11736

Signed-off-by: Yan Feng
---
 integration_tests/src/main/python/orc_write_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py
index d4af4b19636..c830d585693 100644
--- a/integration_tests/src/main/python/orc_write_test.py
+++ b/integration_tests/src/main/python/orc_write_test.py
@@ -91,14 +91,14 @@ def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
         data_path,
         conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
 
-@pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
+@pytest.mark.parametrize('orc_gen', [pytest.param(boolean_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11736'))], ids=idfn)
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 @allow_non_gpu(*non_utc_allow)
-def test_write_more_than_one_stripe_round_trip(spark_tmp_path, orc_gens, orc_impl):
-    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
+def test_write_more_than_one_stripe_round_trip(spark_tmp_path, orc_gen, orc_impl):
+    gen_list = [('_c0', orc_gen)]
     data_path = spark_tmp_path + '/ORC_DATA'
     assert_gpu_and_cpu_writes_are_equal_collect(
-        # Generate a large enough dataframe to produce more than one stripe(typically 64MB)
+        # Generate a large enough dataframe to produce more than one stripe
         # Preferably use only one partition to avoid splitting the data
         lambda spark, path: gen_df(spark, gen_list, 12800, num_slices=1).write.orc(path),
         lambda spark, path: spark.read.orc(path),
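
The premise of the new test is that 12800 generated rows written through a single partition are large enough to roll over into more than one ORC stripe. A quick way to sanity-check that premise locally is to count the stripes in the written files with the ORC Java reader that Spark already bundles, reached through the py4j gateway. The sketch below is illustrative only and not part of the patch: the helper name count_orc_stripes, the glob pattern, and the assumption that the output directory is on a local filesystem are all placeholders, not names taken from the test suite.

import glob

def count_orc_stripes(spark, orc_dir):
    # Count stripes per ORC part file using org.apache.orc.OrcFile through Spark's JVM gateway.
    # Assumes orc_dir is a local directory containing the files written by the test run.
    jvm = spark._jvm
    hadoop_conf = spark._jsc.hadoopConfiguration()
    counts = {}
    for f in glob.glob(orc_dir + '/part-*.orc'):
        reader = jvm.org.apache.orc.OrcFile.createReader(
            jvm.org.apache.hadoop.fs.Path(f),
            jvm.org.apache.orc.OrcFile.readerOptions(hadoop_conf))
        counts[f] = reader.getStripes().size()
    return counts

# Example usage after a run of the test (the path is a placeholder for the
# ORC_DATA directory the test writes under spark_tmp_path):
#   stripe_counts = count_orc_stripes(spark, '/tmp/ORC_DATA')
#   assert all(n > 1 for n in stripe_counts.values())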