diff --git a/tools/RunGen.cpp b/tools/RunGen.cpp
index 611ed918b0fe..604c8e0a9248 100644
--- a/tools/RunGen.cpp
+++ b/tools/RunGen.cpp
@@ -424,6 +424,30 @@ Shape parse_extents(const std::string &extent_list) {
     return result;
 }
 
+// Given a Buffer<>, return its shape in the form of a vector.
+// (Oddly, Buffer<> has no API to do this directly.)
+Shape get_shape(const Buffer<> &b) {
+    Shape s;
+    for (int i = 0; i < b.dimensions(); ++i) {
+        s.push_back(b.raw_buffer()->dim[i]);
+    }
+    return s;
+}
+
+// Given a type and shape, create a new Buffer<> but *don't* allocate storage for it.
+Buffer<> make_with_shape(const halide_type_t &type, const Shape &shape) {
+    return Buffer<>(type, nullptr, (int) shape.size(), &shape[0]);
+}
+
+// Given a type and shape, create a new Buffer<> and allocate storage for it.
+// (Oddly, Buffer<> has an API to do this with vector-of-extent, but not vector-of-halide_dimension_t.)
+Buffer<> allocate_buffer(const halide_type_t &type, const Shape &shape) {
+    Buffer<> b = make_with_shape(type, shape);
+    b.check_overflow();
+    b.allocate();
+    return b;
+}
+
 // BEGIN TODO: hacky algorithm inspired by Safelight
 // (should really use the algorithm from AddImageChecks to come up with something more rigorous.)
 Shape choose_output_extents(int dimensions, const Shape &defaults) {
@@ -438,7 +462,72 @@ Shape choose_output_extents(int dimensions, const Shape &defaults) {
     return s;
 }
 
-Shape fix_bounds_query_shape(const Shape &constrained_shape) {
+void fix_chunky_strides(const Shape &constrained_shape, Shape *new_shape) {
+    // Special-case Chunky: most "chunky" generators tend to constrain stride[0]
+    // and stride[2] to exact values, leaving stride[1] unconstrained;
+    // in practice, we must ensure that stride[1] == stride[0] * extent[0]
+    // and stride[0] = extent[2] to get results that are not garbled.
+    // This is unpleasantly hacky and will likely need additional enhancements.
+    // (Note that there are, theoretically, other stride combinations that might
+    // need fixing; in practice, ~all generators that aren't planar tend
+    // to be classically chunky.)
+    if (new_shape->size() >= 3) {
+        if (constrained_shape[2].stride == 1) {
+            if (constrained_shape[0].stride >= 1) {
+                // If we have stride[0] and stride[2] set to obviously-chunky,
+                // then force extent[2] to match stride[0].
+                (*new_shape)[2].extent = constrained_shape[0].stride;
+            } else {
+                // If we have stride[2] == 1 but stride[0] <= 1,
+                // force stride[0] = extent[2]
+                (*new_shape)[0].stride = (*new_shape)[2].extent;
+            }
+            // Ensure stride[1] is reasonable.
+            (*new_shape)[1].stride = (*new_shape)[0].extent * (*new_shape)[0].stride;
+        }
+    }
+}
+
+// Given a constraint Shape (generally produced by a bounds query), update
+// the input Buffer to meet those constraints, allocating and copying into
+// a new Buffer if necessary.
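+// For example, if a generator constrains a 3-channel input to be interleaved
+// (stride[0] == 3, stride[2] == 1) but the image was loaded in planar layout,
+// the data is copied into a freshly allocated interleaved Buffer.
+// Returns true iff the buffer's layout was changed.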
+bool adapt_input_buffer_layout(const Shape &constrained_shape, Buffer<> *buf) {
+    bool shape_changed = false;
+    Shape new_shape = get_shape(*buf);
+    if (new_shape.size() != constrained_shape.size()) {
+        fail() << "Dimension mismatch";
+    }
+    for (size_t i = 0; i < constrained_shape.size(); ++i) {
+        // A nonzero min in the constraint caps the min we may use.
+        if (constrained_shape[i].min != 0 && new_shape[i].min > constrained_shape[i].min) {
+            new_shape[i].min = constrained_shape[i].min;
+            shape_changed = true;
+        }
+        // A nonzero extent in the constraint caps the extent we may use.
+        if (constrained_shape[i].extent != 0 && new_shape[i].extent > constrained_shape[i].extent) {
+            new_shape[i].extent = constrained_shape[i].extent;
+            shape_changed = true;
+        }
+        // A stride of 0 means "no constraint"; otherwise the stride must match exactly.
+        if (constrained_shape[i].stride != 0 && new_shape[i].stride != constrained_shape[i].stride) {
+            new_shape[i].stride = constrained_shape[i].stride;
+            shape_changed = true;
+        }
+    }
+    if (shape_changed) {
+        fix_chunky_strides(constrained_shape, &new_shape);
+        Buffer<> new_buf = allocate_buffer(buf->type(), new_shape);
+        new_buf.copy_from(*buf);
+        *buf = new_buf;
+    }
+    return shape_changed;
+}
+
+// Given a constraint Shape (generally produced by a bounds query), create a new
+// Shape that can legally be used to create and allocate a new Buffer:
+// ensure that extents/strides aren't zero, do some reality checking
+// on planar vs interleaved, and generally try to guess at a reasonable result.
+Shape make_legal_output_buffer_shape(const Shape &constrained_shape) {
     Shape new_shape = constrained_shape;
 
     // Make sure that the extents and strides for these are nonzero.
@@ -462,38 +551,16 @@ Shape fix_bounds_query_shape(const Shape &constrained_shape) {
         }
     }
 
-    // Special-case Chunky: most "chunky" generators tend to constrain stride[0]
-    // and stride[2] to exact values, leaving stride[1] unconstrained;
-    // in practice, we must ensure that stride[1] == stride[0] * extent[0]
-    // and stride[0] = extent[2] to get results that are not garbled.
-    // This is unpleasantly hacky and will likely need aditional enhancements.
-    // (Note that there are, theoretically, other stride combinations that might
-    // need fixing; in practice, ~all generators that aren't planar tend
-    // to be classically chunky.)
-    if (new_shape.size() >= 3) {
-        if (constrained_shape[2].stride == 1) {
-            if (constrained_shape[0].stride >= 1) {
-                // If we have stride[0] and stride[2] std::set to obviously-chunky,
-                // then force extent[2] to match stride[0].
-                new_shape[2].extent = constrained_shape[0].stride;
-            } else {
-                // If we have stride[2] == 1 but stride[0] <= 1,
-                // force stride[0] = extent[2]
-                new_shape[0].stride = new_shape[2].extent;
-            }
-            // Ensure stride[1] is reasonable.
-            new_shape[1].stride = new_shape[0].extent * new_shape[0].stride;
-        }
-    }
+    fix_chunky_strides(constrained_shape, &new_shape);
 
     // If anything else is zero, just set strides to planar and hope for the best.
-    bool zero_strides = false;
+    bool any_strides_zero = false;
     for (size_t i = 0; i < new_shape.size(); ++i) {
         if (!new_shape[i].stride) {
-            zero_strides = true;
+            any_strides_zero = true;
         }
     }
-    if (zero_strides) {
+    if (any_strides_zero) {
         // Planar
         new_shape[0].stride = 1;
         for (size_t i = 1; i < new_shape.size(); ++i) {
@@ -504,25 +571,6 @@ Shape fix_bounds_query_shape(const Shape &constrained_shape) {
 }
 // END TODO: hacky algorithm inspired by Safelight
 
-// Given a Buffer<>, return its shape in the form of a vector.
-// (Oddly, Buffer<> has no API to do this directly.)
-Shape get_shape(const Buffer<> &b) {
-    Shape s;
-    for (int i = 0; i < b.dimensions(); ++i) {
-        s.push_back(b.raw_buffer()->dim[i]);
-    }
-    return s;
-}
-
-// Given a type and shape, create a new Buffer<> and allocate storage for it.
-// (Oddly, Buffer<> has an API to do this with vector-of-extent, but not vector-of-halide_dimension_t.)
-Buffer<> allocate_buffer(const halide_type_t &type, const Shape &shape) {
-    Buffer<> b(type, nullptr, (int) shape.size(), &shape[0]);
-    b.check_overflow();
-    b.allocate();
-    return b;
-}
-
 // Return true iff all of the dimensions in the range [first, last] have an extent of <= 1.
 bool dims_in_range_are_trivial(const Buffer<> &b, int first, int last) {
     for (int d = first; d <= last; ++d) {
@@ -606,6 +654,79 @@ Buffer<> load_input(const std::string &pathname,
     return Buffer<>();
 }
 
+struct ArgData {
+    size_t index{0};
+    const halide_filter_argument_t *metadata{nullptr};
+    std::string raw_string;
+    halide_scalar_value_t scalar_value;
+    Buffer<> buffer_value;
+};
+
+// Run a bounds-query call with the given args, and return the shapes
+// to which we are constrained.
+std::vector<Shape> run_bounds_query(const std::map<std::string, ArgData> &args,
+                                    const Shape &default_output_shape) {
+    std::vector<void *> filter_argv(args.size(), nullptr);
+    // These vectors are larger than needed, but this simplifies the logic downstream.
+    std::vector<Buffer<>> bounds_query_buffers(args.size());
+    std::vector<Shape> constrained_shapes(args.size());
+    for (auto &arg_pair : args) {
+        auto &arg = arg_pair.second;
+        switch (arg.metadata->kind) {
+        case halide_argument_kind_input_scalar:
+            filter_argv[arg.index] = const_cast<halide_scalar_value_t *>(&arg.scalar_value);
+            break;
+        case halide_argument_kind_input_buffer:
+        case halide_argument_kind_output_buffer: {
+            Shape shape = (arg.metadata->kind == halide_argument_kind_input_buffer) ?
+                get_shape(arg.buffer_value) :
+                choose_output_extents(arg.metadata->dimensions, default_output_shape);
+            bounds_query_buffers[arg.index] = make_with_shape(arg.metadata->type, shape);
+            filter_argv[arg.index] = bounds_query_buffers[arg.index].raw_buffer();
+            break;
+        }
+        }
+    }
+
+    info() << "Running bounds query...";
+    // Ignore result since our halide_error() should catch everything.
+    (void) halide_rungen_redirect_argv(&filter_argv[0]);
+
+    for (auto &arg_pair : args) {
+        auto &arg = arg_pair.second;
+        switch (arg.metadata->kind) {
+        case halide_argument_kind_input_scalar:
+            break;
+        case halide_argument_kind_input_buffer:
+        case halide_argument_kind_output_buffer:
+            constrained_shapes[arg.index] = get_shape(bounds_query_buffers[arg.index]);
+            break;
+        }
+    }
+    return constrained_shapes;
+}
+
+uint64_t calc_pixels_out(const std::map<std::string, ArgData> &args) {
+    uint64_t pixels_out = 0;
+    for (auto &arg_pair : args) {
+        auto &arg = arg_pair.second;
+        switch (arg.metadata->kind) {
+        case halide_argument_kind_output_buffer: {
+            // TODO: this assumes that most output is "pixel-ish", and counting the size of the first
+            // two dimensions approximates the "pixel size". This is not, in general, a valid assumption,
+            // but is a useful metric for benchmarking.
+            Shape shape = get_shape(arg.buffer_value);
+            if (shape.size() >= 2) {
+                pixels_out += shape[0].extent * shape[1].extent;
+            } else {
+                pixels_out += shape[0].extent;
+            }
+            break;
+        }
+        }
+    }
+    return pixels_out;
+}
+
 void usage(const char *argv0) {
     const std::string usage = R"USAGE(
 Usage: $NAME$ argument=value [argument=value... ] [flags]
@@ -680,6 +801,10 @@ Usage: $NAME$ argument=value [argument=value... ] [flags]
         Override the default number of benchmarking iterations; ignored if
         --benchmark is not also specified.
 
+    --benchmark_warmup=NUM:
+        Number of iterations to run before timing, to warm up caches; ignored if
+        --benchmark is not also specified.
+
     --track_memory:
         Override Halide memory allocator to track high-water mark of memory
         allocation during run; note that this may slow down execution, so
@@ -753,14 +878,6 @@ int main(int argc, char **argv) {
 
     const halide_filter_metadata_t *md = halide_rungen_redirect_metadata();
 
-    struct ArgData {
-        size_t index{0};
-        const halide_filter_argument_t *metadata{nullptr};
-        std::string raw_string;
-        halide_scalar_value_t scalar_value;
-        Buffer<> buffer_value;
-    };
-
     std::map<std::string, ArgData> args;
     std::set<std::string> found;
     for (size_t i = 0; i < (size_t) md->num_arguments; ++i) {
@@ -778,13 +895,13 @@ int main(int argc, char **argv) {
     }
 
     Shape default_output_shape;
-    std::vector<void *> filter_argv(md->num_arguments, nullptr);
     std::vector<std::string> unknown_args;
     bool benchmark = false;
     bool track_memory = false;
    bool describe = false;
     int benchmark_samples = 3;
     int benchmark_iterations = 10;
+    int benchmark_warmup = 1;
     for (int i = 1; i < argc; ++i) {
         if (argv[i][0] == '-') {
             const char *p = argv[i] + 1; // skip -
@@ -840,6 +957,10 @@ int main(int argc, char **argv) {
                 if (!parse_scalar(flag_value, &benchmark_iterations)) {
                     fail() << "Invalid value for flag: " << flag_name;
                 }
+            } else if (flag_name == "benchmark_warmup") {
+                if (!parse_scalar(flag_value, &benchmark_warmup)) {
+                    fail() << "Invalid value for flag: " << flag_name;
+                }
             } else if (flag_name == "output_extents") {
                 default_output_shape = parse_extents(flag_value);
             } else {
@@ -918,7 +1039,6 @@ int main(int argc, char **argv) {
                            << arg.metadata->type << ": " << arg.raw_string;
                 }
-                filter_argv[arg.index] = &arg.scalar_value;
                 break;
             }
             case halide_argument_kind_input_buffer: {
@@ -930,7 +1050,6 @@ int main(int argc, char **argv) {
                 if (default_output_shape.empty()) {
                     default_output_shape = get_shape(arg.buffer_value);
                 }
-                filter_argv[arg.index] = arg.buffer_value.raw_buffer();
                 break;
             }
             case halide_argument_kind_output_buffer:
@@ -939,51 +1058,35 @@ int main(int argc, char **argv) {
         }
     }
 
-    // Run a bounds query, so we can allocate output buffers appropriately.
-    {
-        for (auto &arg_pair : args) {
-            auto &arg = arg_pair.second;
-            switch (arg.metadata->kind) {
-                case halide_argument_kind_output_buffer:
-                    auto bounds_query_shape = choose_output_extents(arg.metadata->dimensions, default_output_shape);
-                    arg.buffer_value = Buffer<>(arg.metadata->type, nullptr, (int) bounds_query_shape.size(), &bounds_query_shape[0]);
-                    filter_argv[arg.index] = arg.buffer_value.raw_buffer();
-                    break;
-            }
-        }
-
-        info() << "Running bounds query...";
-        int result = halide_rungen_redirect_argv(&filter_argv[0]);
-        if (result != 0) {
-            fail() << "Bounds query failed with result code: " << result;
-        }
-    }
+    // Run a bounds query: we need to figure out how to allocate the output buffers,
+    // and the input buffers might need reshaping to satisfy constraints (e.g. a chunky/interleaved layout).
+    std::vector<Shape> constrained_shapes = run_bounds_query(args, default_output_shape);
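+    // Note: constrained_shapes is indexed by argument position (arg.index);
+    // entries for scalar arguments are left default-constructed and are ignored below.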
-    // Allocate the output buffers we'll need.
-    double pixels_out = 0.f;
     for (auto &arg_pair : args) {
         auto &arg_name = arg_pair.first;
         auto &arg = arg_pair.second;
+        const Shape &constrained_shape = constrained_shapes[arg.index];
         switch (arg.metadata->kind) {
-            case halide_argument_kind_output_buffer:
-                auto constrained_shape = get_shape(arg.buffer_value);
-                info() << "Output " << arg_name << ": BoundsQuery result is " << constrained_shape;
-                Shape shape = fix_bounds_query_shape(constrained_shape);
-                arg.buffer_value = allocate_buffer(arg.metadata->type, shape);
-                info() << "Output " << arg_name << ": Shape is " << get_shape(arg.buffer_value);
-                filter_argv[arg.index] = arg.buffer_value.raw_buffer();
-                // TODO: this assumes that most output is "pixel-ish", and counting the size of the first
-                // two dimensions approximates the "pixel size". This is not, in general, a valid assumption,
-                // but is a useful metric for benchmarking.
-                if (shape.size() >= 2) {
-                    pixels_out += shape[0].extent * shape[1].extent;
-                } else {
-                    pixels_out += shape[0].extent;
-                }
-                break;
+        case halide_argument_kind_input_buffer: {
+            info() << "Input " << arg_name << ": Shape is " << get_shape(arg.buffer_value);
+            bool updated = adapt_input_buffer_layout(constrained_shape, &arg.buffer_value);
+            info() << "Input " << arg_name << ": BoundsQuery result is " << constrained_shape;
+            if (updated) {
+                info() << "Input " << arg_name << ": Updated Shape is " << get_shape(arg.buffer_value);
+            }
+            break;
+        }
+        case halide_argument_kind_output_buffer: {
+            arg.buffer_value = allocate_buffer(arg.metadata->type, make_legal_output_buffer_shape(constrained_shape));
+            info() << "Output " << arg_name << ": BoundsQuery result is " << constrained_shape;
+            info() << "Output " << arg_name << ": Shape is " << get_shape(arg.buffer_value);
+            break;
+        }
         }
     }
-    double megapixels = pixels_out / (1024.0 * 1024.0);
+
+    uint64_t pixels_out = calc_pixels_out(args);
+    double megapixels = (double) pixels_out / (1024.0 * 1024.0);
 
     // If we're tracking memory, install the memory tracker *after* doing a bounds query.
     HalideMemoryTracker tracker;
@@ -991,12 +1094,25 @@ int main(int argc, char **argv) {
         tracker.install();
     }
 
-    if (benchmark) {
-        info() << "Benchmarking filter...";
+    {
+        std::vector<void *> filter_argv(args.size(), nullptr);
+        for (auto &arg_pair : args) {
+            auto &arg = arg_pair.second;
+            switch (arg.metadata->kind) {
+            case halide_argument_kind_input_scalar:
+                filter_argv[arg.index] = &arg.scalar_value;
+                break;
+            case halide_argument_kind_input_buffer:
+            case halide_argument_kind_output_buffer:
+                filter_argv[arg.index] = arg.buffer_value.raw_buffer();
+                break;
+            }
+        }
 
-        // Run once to warm up cache. Ignore result since our halide_error() should catch everything.
-        (void) halide_rungen_redirect_argv(&filter_argv[0]);
+        if (benchmark) {
+            info() << "Benchmarking filter...";
+
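+            // Warm up the cache before timing; the number of warmup iterations
+            // is controlled by --benchmark_warmup (default 1).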
+            for (int i = 0; i < benchmark_warmup; ++i) {
+                // Ignore result since our halide_error() should catch everything.
+                (void) halide_rungen_redirect_argv(&filter_argv[0]);
+            }
 
-        double time_in_seconds = Halide::Tools::benchmark(benchmark_samples, benchmark_iterations, [&filter_argv, &args]() {
-            (void) halide_rungen_redirect_argv(&filter_argv[0]);
-            // Ensure that all outputs are finished, otherwise we may just be
-            // measuring how long it takes to do a kernel launch for GPU code.
-            for (auto &arg_pair : args) {
-                auto &arg = arg_pair.second;
-                if (arg.metadata->kind == halide_argument_kind_output_buffer) {
-                    Buffer<> &b = arg.buffer_value;
-                    b.device_sync();
-                }
-            }
-        });
+            double time_in_seconds = Halide::Tools::benchmark(benchmark_samples, benchmark_iterations, [&filter_argv, &args]() {
+                // Ignore result since our halide_error() should catch everything.
+                (void) halide_rungen_redirect_argv(&filter_argv[0]);
+                // Ensure that all outputs are finished, otherwise we may just be
+                // measuring how long it takes to do a kernel launch for GPU code.
+                for (auto &arg_pair : args) {
+                    auto &arg = arg_pair.second;
+                    if (arg.metadata->kind == halide_argument_kind_output_buffer) {
+                        Buffer<> &b = arg.buffer_value;
+                        b.device_sync();
+                    }
+                }
+            });
 
-        std::cout << "Benchmark for " << md->name << " produces best case of " << time_in_seconds << " sec/iter, over "
-                  << benchmark_samples << " blocks of " << benchmark_iterations << " iterations.\n";
-        std::cout << "Best output throughput is " << (megapixels / time_in_seconds) << " mpix/sec.\n";
-
-    } else {
-        info() << "Running filter...";
-        int result = halide_rungen_redirect_argv(&filter_argv[0]);
-        if (result != 0) {
-            fail() << "Filter failed with result code: " << result;
+            std::cout << "Benchmark for " << md->name << " produces best case of " << time_in_seconds << " sec/iter, over "
+                      << benchmark_samples << " blocks of " << benchmark_iterations << " iterations.\n";
+            std::cout << "Best output throughput is " << (megapixels / time_in_seconds) << " mpix/sec.\n";
+
+        } else {
+            info() << "Running filter...";
+            // Ignore result since our halide_error() should catch everything.
+            (void) halide_rungen_redirect_argv(&filter_argv[0]);
         }
     }