Skip to content

Commit 0804ba6

Browse files
wgtmacpitrou
andauthored
GH-45045: [C++][Parquet] Add a benchmark for size_statistics_level (#45085)
### Rationale for this change Add a benchmark to know the performance of writing different size stats levels. ### What changes are included in this PR? Add a size_stats_benchmark for parquet. ### Are these changes tested? No ### Are there any user-facing changes? No * GitHub Issue: #45045 Lead-authored-by: Gang Wu <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 4f4eb3f commit 0804ba6

File tree

2 files changed

+166
-0
lines changed

2 files changed

+166
-0
lines changed

cpp/src/parquet/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,4 @@ add_parquet_benchmark(metadata_benchmark)
437437
add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
438438
benchmark_util.cc)
439439
add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
440+
add_parquet_benchmark(arrow/size_stats_benchmark PREFIX "parquet-arrow")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "benchmark/benchmark.h"
19+
20+
#include <cstdint>
21+
#include <numeric>
22+
23+
#include "parquet/arrow/writer.h"
24+
#include "parquet/file_reader.h"
25+
#include "parquet/metadata.h"
26+
#include "parquet/platform.h"
27+
#include "parquet/properties.h"
28+
29+
#include "arrow/array.h"
30+
#include "arrow/io/buffered.h"
31+
#include "arrow/io/memory.h"
32+
#include "arrow/table.h"
33+
#include "arrow/testing/gtest_util.h"
34+
#include "arrow/testing/random.h"
35+
36+
namespace parquet::benchmark {
37+
38+
// Number of values generated for each benchmarked column. This should be
// large enough to produce multiple pages for most primitive types.
constexpr int64_t kBenchmarkSize = int64_t{1} << 20;
// Null probability handed to the random array generators. It is deliberately
// skewed in order to reduce the levels encoding overhead.
constexpr double kNullProbability = 0.95;
42+
43+
// Returns the number of bytes held by `data`: the sizes of all of its non-null
// buffers, plus (recursively) the bytes of every child and of the dictionary.
// A null `data` counts as zero bytes.
int64_t GetTotalBytes(const std::shared_ptr<::arrow::ArrayData>& data) {
  if (!data) {
    return 0;
  }

  int64_t byte_count = 0;
  // Buffer slots may be null; those contribute nothing.
  for (const auto& buffer : data->buffers) {
    if (buffer) {
      byte_count += buffer->size();
    }
  }
  for (const auto& child : data->child_data) {
    byte_count += GetTotalBytes(child);
  }
  // The recursive call handles a missing (null) dictionary by returning 0.
  return byte_count + GetTotalBytes(data->dictionary);
}
58+
59+
// Returns the number of bytes held by `table`, i.e. the sum of
// GetTotalBytes() over the array data of every chunk of every column.
int64_t GetTotalBytes(const std::shared_ptr<::arrow::Table>& table) {
  int64_t table_bytes = 0;
  for (const auto& chunked_column : table->columns()) {
    const auto& chunks = chunked_column->chunks();
    table_bytes += std::accumulate(
        chunks.cbegin(), chunks.cend(), int64_t{0},
        [](int64_t acc, const auto& chunk) { return acc + GetTotalBytes(chunk->data()); });
  }
  return table_bytes;
}
68+
69+
// Returns the total size in bytes of the page index described by `metadata`.
//
// The Parquet page index of a column chunk is made of two structures, the
// column index and the offset index. Both are accounted for here: for every
// column chunk of every row group, the lengths of the two index locations are
// summed. An index that is absent for a column chunk contributes zero bytes.
int64_t GetTotalPageIndexSize(const std::shared_ptr<::parquet::FileMetaData>& metadata) {
  // Stand-in location used when a column chunk has no such index.
  const parquet::IndexLocation no_index{0, 0};

  int64_t total_page_index_size = 0;
  for (int i = 0; i < metadata->num_row_groups(); ++i) {
    const auto row_group = metadata->RowGroup(i);
    for (int j = 0; j < row_group->num_columns(); ++j) {
      const auto column = row_group->ColumnChunk(j);
      total_page_index_size += column->GetColumnIndexLocation().value_or(no_index).length;
      // The offset index is part of the page index as well; leaving it out
      // under-reports the page index size.
      total_page_index_size += column->GetOffsetIndexLocation().value_or(no_index).length;
    }
  }
  return total_page_index_size;
}
81+
82+
// Benchmark body shared by all size statistics benchmarks: repeatedly writes
// `table` as a Parquet file using the size statistics level `stats_level`.
//
// Every iteration writes the whole table to a fresh stream obtained from
// parquet::CreateOutputStream(). As long as the "page_index_size" counter has
// not been set (i.e. on the first iteration), the written file's metadata is
// read back with timing paused, and the "output_size" and "page_index_size"
// counters are published. Finally, the items and bytes processed are reported
// based on kBenchmarkSize and the byte size of `table`.
void WriteColumn(::benchmark::State& state, const std::shared_ptr<::arrow::Table>& table,
                 SizeStatisticsLevel stats_level) {
  // Use the fastest possible encoding and compression settings, to better exhibit
  // the size statistics overhead.
  WriterProperties::Builder builder;
  builder.enable_statistics()
      ->enable_write_page_index()
      ->disable_dictionary()
      ->encoding(Encoding::PLAIN)
      ->set_size_statistics_level(stats_level);
  const auto writer_properties = builder.build();

  for (auto _ : state) {
    auto sink = parquet::CreateOutputStream();
    ARROW_EXPECT_OK(::parquet::arrow::WriteTable(
        *table, ::arrow::default_memory_pool(),
        std::static_pointer_cast<::arrow::io::OutputStream>(sink),
        DEFAULT_MAX_ROW_GROUP_LENGTH, writer_properties));

    // The size counters only need to be computed once per benchmark run.
    const bool counters_recorded = state.counters.count("page_index_size") != 0;
    if (!counters_recorded) {
      // Keep the metadata read-back out of the measured time.
      state.PauseTiming();
      auto file_metadata = parquet::ReadMetaData(
          std::make_shared<::arrow::io::BufferReader>(sink->Finish().ValueOrDie()));
      const auto output_size = sink->Tell().ValueOrDie();
      state.counters["output_size"] = static_cast<double>(output_size);
      const auto page_index_size = GetTotalPageIndexSize(file_metadata);
      state.counters["page_index_size"] = static_cast<double>(page_index_size);
      state.ResumeTiming();
    }
  }

  const auto num_iterations = state.iterations();
  state.SetItemsProcessed(num_iterations * kBenchmarkSize);
  state.SetBytesProcessed(num_iterations * GetTotalBytes(table));
}
115+
116+
template <SizeStatisticsLevel level, typename ArrowType>
117+
void BM_WritePrimitiveColumn(::benchmark::State& state) {
118+
::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
119+
auto type = std::make_shared<ArrowType>();
120+
auto array = generator.ArrayOf(type, kBenchmarkSize, kNullProbability);
121+
auto table = ::arrow::Table::Make(
122+
::arrow::schema({::arrow::field("column", type, kNullProbability > 0)}), {array});
123+
WriteColumn(state, table, level);
124+
}
125+
126+
template <SizeStatisticsLevel level, typename ArrowType>
127+
void BM_WriteListColumn(::benchmark::State& state) {
128+
::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
129+
auto element_type = std::make_shared<ArrowType>();
130+
auto element_array = generator.ArrayOf(element_type, kBenchmarkSize, kNullProbability);
131+
auto list_type = ::arrow::list(element_type);
132+
auto list_array = generator.List(*element_array, kBenchmarkSize / 10, kNullProbability);
133+
auto table = ::arrow::Table::Make(
134+
::arrow::schema({::arrow::field("column", list_type, kNullProbability > 0)}),
135+
{list_array});
136+
WriteColumn(state, table, level);
137+
}
138+
139+
// Benchmark registrations: each (benchmark function, Arrow type) pair is
// registered once per size statistics level. The macro arguments are kept
// spelled out in full and in a fixed order on purpose.

// Primitive Int64 column.
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
                   ::arrow::Int64Type);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
                   ::arrow::Int64Type);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk,
                   ::arrow::Int64Type);

// Primitive String column.
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
                   ::arrow::StringType);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
                   ::arrow::StringType);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk,
                   ::arrow::StringType);

// List column with Int64 elements.
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
                   ::arrow::Int64Type);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
                   ::arrow::Int64Type);

// List column with String elements.
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
                   ::arrow::StringType);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
                   ::arrow::StringType);
164+
165+
} // namespace parquet::benchmark

0 commit comments

Comments
 (0)