Add Parquet reader benchmarks for row selection (#14147)
Re-enable the group of benchmarks that compares row-selection options in the Parquet reader.
Use `read_parquet_metadata` to get the column names and the number of row groups.
Clean up the read-chunk computation in the ORC and Parquet benchmarks.
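
For reference, a minimal sketch of the metadata-only approach the Parquet benchmark switches to. The include path `cudf/io/parquet_metadata.hpp` and the standalone helper names are assumptions here; the `read_parquet_metadata(...).schema().root().children()` and `.num_rowgroups()` calls mirror the diff below:

```cpp
#include <cudf/io/parquet_metadata.hpp>
#include <cudf/io/types.hpp>

#include <string>
#include <vector>

// One metadata read (no data decoding) yields both the top-level column
// names and the number of row groups in the Parquet file.
std::vector<std::string> top_level_col_names(cudf::io::source_info const& source)
{
  auto const metadata = cudf::io::read_parquet_metadata(source);

  std::vector<std::string> names;
  for (auto const& col_meta : metadata.schema().root().children()) {
    names.push_back(col_meta.name());
  }
  return names;
}

int num_row_groups(cudf::io::source_info const& source)
{
  return cudf::io::read_parquet_metadata(source).num_rowgroups();
}
```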

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)

URL: #14147
vuule authored Sep 25, 2023
1 parent ddd2b0d commit 1b925bf
Showing 3 changed files with 53 additions and 42 deletions.
18 changes: 10 additions & 8 deletions cpp/benchmarks/io/cuio_common.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/io/cuio_common.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/logger.hpp>
 
 #include <cstdio>
@@ -141,17 +142,18 @@ std::vector<std::string> select_column_names(std::vector<std::string> const& col
   return col_names_to_read;
 }
 
-std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk)
+std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk_idx)
 {
   CUDF_EXPECTS(num_segments >= num_chunks,
                "Number of chunks cannot be greater than the number of segments in the file");
-  auto start_segment = [num_segments, num_chunks](int chunk) {
-    return num_segments * chunk / num_chunks;
-  };
-  std::vector<cudf::size_type> selected_segments;
-  for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) {
-    selected_segments.push_back(segment);
-  }
+  CUDF_EXPECTS(chunk_idx < num_chunks,
+               "Chunk index must be smaller than the number of chunks in the file");
+
+  auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks);
+  auto const begin_segment     = std::min(chunk_idx * segments_in_chunk, num_segments);
+  auto const end_segment       = std::min(begin_segment + segments_in_chunk, num_segments);
+  std::vector<cudf::size_type> selected_segments(end_segment - begin_segment);
+  std::iota(selected_segments.begin(), selected_segments.end(), begin_segment);
 
   return selected_segments;
 }
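
The rewritten helper replaces the lambda-and-loop with a ceil-divide plus `std::iota`. A self-contained sketch of the same arithmetic, with plain integer ceil-division standing in for `cudf::util::div_rounding_up_unsafe`:

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Standalone sketch of the new segments_in_chunk() arithmetic; plain integer
// ceil-division stands in for cudf::util::div_rounding_up_unsafe.
std::vector<int> segments_in_chunk_sketch(int num_segments, int num_chunks, int chunk_idx)
{
  int const per_chunk = (num_segments + num_chunks - 1) / num_chunks;  // ceil-divide
  int const begin     = std::min(chunk_idx * per_chunk, num_segments);
  int const end       = std::min(begin + per_chunk, num_segments);

  std::vector<int> segments(end - begin);
  std::iota(segments.begin(), segments.end(), begin);  // begin, begin + 1, ..., end - 1
  return segments;
}

int main()
{
  // 10 segments split across 4 chunks -> sizes 3, 3, 3, 1; the std::min
  // clamps make the last chunk absorb the remainder, so no "overflow"
  // segment special case is needed.
  for (int chunk = 0; chunk < 4; ++chunk) {
    for (int segment : segments_in_chunk_sketch(10, 4, chunk)) { std::printf("%d ", segment); }
    std::printf("\n");
  }
  return 0;
}
```
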
12 changes: 5 additions & 7 deletions cpp/benchmarks/io/orc/orc_reader_options.cpp
@@ -19,6 +19,7 @@
 #include <benchmarks/io/cuio_common.hpp>
 #include <benchmarks/io/nvbench_helpers.hpp>
 
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -30,7 +31,7 @@
 constexpr int64_t data_size = 512 << 20;
 // The number of separate read calls to use when reading files in multiple chunks
 // Each call reads roughly equal amounts of data
-constexpr int32_t chunked_read_num_chunks = 8;
+constexpr int32_t chunked_read_num_chunks = 4;
 
 std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
@@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state,
 
   auto const num_stripes =
     cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes();
-  cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
+  auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -99,7 +100,6 @@
     timer.start();
     cudf::size_type rows_read = 0;
     for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
-      auto const is_last_chunk = chunk == (num_chunks - 1);
       switch (RowSelection) {
         case row_selection::ALL: break;
         case row_selection::STRIPES:
@@ -108,7 +108,6 @@
         case row_selection::NROWS:
           read_options.set_skip_rows(chunk * chunk_row_cnt);
           read_options.set_num_rows(chunk_row_cnt);
-          if (is_last_chunk) read_options.set_num_rows(-1);
           break;
         default: CUDF_FAIL("Unsupported row selection method");
       }
@@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list<column_selection::ALL,
                                                column_selection::ALTERNATE,
                                                column_selection::FIRST_HALF,
                                                column_selection::SECOND_HALF>;
-using row_selections =
-  nvbench::enum_type_list<row_selection::ALL, row_selection::STRIPES, row_selection::NROWS>;
-
 NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
                     NVBENCH_TYPE_AXES(col_selections,
                                       nvbench::enum_type_list<row_selection::ALL>,
@@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
     {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"})
   .set_min_samples(4);
 
+using row_selections =
+  nvbench::enum_type_list<row_selection::ALL, row_selection::NROWS, row_selection::STRIPES>;
 NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
                     NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
                                       row_selections,
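
With `chunk_row_cnt` now rounded up instead of down, the `skip_rows`/`num_rows` chunks cover the whole table, and the old `is_last_chunk` escape hatch (`set_num_rows(-1)`) is no longer needed, assuming the reader clamps a request that runs past the last row. A small sketch of the arithmetic with hypothetical numbers:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
  int const num_rows      = 1'000'003;  // hypothetical table size
  int const num_chunks    = 4;
  int const chunk_row_cnt = (num_rows + num_chunks - 1) / num_chunks;  // rounded up: 250'001

  for (int chunk = 0; chunk < num_chunks; ++chunk) {
    int const skip = chunk * chunk_row_cnt;
    // Rows the reader would actually return if it clamps at the end of file
    int const returned = std::min(chunk_row_cnt, num_rows - skip);
    std::printf("chunk %d: skip_rows=%d num_rows=%d -> %d rows\n",
                chunk, skip, chunk_row_cnt, returned);
  }
  return 0;
}
```
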
65 changes: 38 additions & 27 deletions cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -19,28 +19,29 @@
 #include <benchmarks/io/cuio_common.hpp>
 #include <benchmarks/io/nvbench_helpers.hpp>
 
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <nvbench/nvbench.cuh>
 
 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
-constexpr std::size_t data_size      = 512 << 20;
-constexpr std::size_t row_group_size = 128 << 20;
+constexpr std::size_t data_size = 512 << 20;
+// The number of separate read calls to use when reading files in multiple chunks
+// Each call reads roughly equal amounts of data
+constexpr int32_t chunked_read_num_chunks = 4;
 
 std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
-  cudf::io::parquet_reader_options const read_options =
-    cudf::io::parquet_reader_options::builder(source);
-  auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info;
-
-  std::vector<std::string> names;
-  names.reserve(schema.size());
-  std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
-    return c.name;
-  });
-  return names;
+  auto const top_lvl_cols = cudf::io::read_parquet_metadata(source).schema().root().children();
+  std::vector<std::string> col_names;
+  std::transform(top_lvl_cols.cbegin(),
+                 top_lvl_cols.cend(),
+                 std::back_inserter(col_names),
+                 [](auto const& col_meta) { return col_meta.name(); });
+
+  return col_names;
 }
 
 template <column_selection ColSelection,
@@ -55,6 +56,8 @@ void BM_parquet_read_options(nvbench::state& state,
                              nvbench::enum_type<UsesPandasMetadata>,
                              nvbench::enum_type<Timestamp>>)
 {
+  auto const num_chunks = RowSelection == row_selection::ALL ? 1 : chunked_read_num_chunks;
+
   auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES;
   auto constexpr uses_pd_metadata  = UsesPandasMetadata == uses_pandas_metadata::YES;
 
@@ -87,9 +90,8 @@
       .use_pandas_metadata(uses_pd_metadata)
       .timestamp_type(ts_type);
 
-  // TODO: add read_parquet_metadata to properly calculate #row_groups
-  auto constexpr num_row_groups = data_size / row_group_size;
-  auto constexpr num_chunks     = 1;
+  auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups();
+  auto const chunk_row_cnt  = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -100,18 +102,15 @@
     timer.start();
     cudf::size_type rows_read = 0;
     for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
-      auto const is_last_chunk = chunk == (num_chunks - 1);
       switch (RowSelection) {
         case row_selection::ALL: break;
         case row_selection::ROW_GROUPS: {
-          auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk);
-          if (is_last_chunk) {
-            // Need to assume that an additional "overflow" row group is present
-            row_groups_to_read.push_back(num_row_groups);
-          }
-          read_options.set_row_groups({row_groups_to_read});
+          read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)});
         } break;
-        case row_selection::NROWS: [[fallthrough]];
+        case row_selection::NROWS:
+          read_options.set_skip_rows(chunk * chunk_row_cnt);
+          read_options.set_num_rows(chunk_row_cnt);
+          break;
         default: CUDF_FAIL("Unsupported row selection method");
       }
 
@@ -130,14 +129,26 @@ void BM_parquet_read_options(nvbench::state& state,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+using row_selections =
+  nvbench::enum_type_list<row_selection::ALL, row_selection::NROWS, row_selection::ROW_GROUPS>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
+                                      row_selections,
+                                      nvbench::enum_type_list<converts_strings::YES>,
+                                      nvbench::enum_type_list<uses_pandas_metadata::YES>,
+                                      nvbench::enum_type_list<cudf::type_id::EMPTY>))
+  .set_name("parquet_read_row_selection")
+  .set_type_axes_names({"column_selection",
+                        "row_selection",
+                        "str_to_categories",
+                        "uses_pandas_metadata",
+                        "timestamp_type"})
+  .set_min_samples(4);
+
 using col_selections = nvbench::enum_type_list<column_selection::ALL,
                                                column_selection::ALTERNATE,
                                                column_selection::FIRST_HALF,
                                                column_selection::SECOND_HALF>;
 
-// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file
-// and determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
-
 NVBENCH_BENCH_TYPES(BM_parquet_read_options,
                     NVBENCH_TYPE_AXES(col_selections,
                                       nvbench::enum_type_list<row_selection::ALL>,
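
Putting the pieces together, the re-enabled `ROW_GROUPS` path splits the file's row groups into near-equal runs and issues one read per run. A sketch under the assumptions that `source` is a valid `cudf::io::source_info` and that `segments_in_chunk` is the benchmark helper from `cuio_common.cpp` above:

```cpp
#include <benchmarks/io/cuio_common.hpp>  // segments_in_chunk (benchmark helper)
#include <cudf/io/parquet.hpp>
#include <cudf/io/parquet_metadata.hpp>

// Read a Parquet file in num_chunks calls, selecting a run of row groups
// per call instead of using skip_rows/num_rows.
void read_by_row_groups(cudf::io::source_info const& source, int num_chunks)
{
  auto const num_row_groups = cudf::io::read_parquet_metadata(source).num_rowgroups();
  auto read_options         = cudf::io::parquet_reader_options::builder(source).build();

  for (int chunk = 0; chunk < num_chunks; ++chunk) {
    // One nested vector per source; this example reads from a single source.
    read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)});
    auto const result = cudf::io::read_parquet(read_options);
    // ... consume result.tbl ...
  }
}
```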
