diff --git a/tools/RunGen.cpp b/tools/RunGen.cpp
index 611ed918b0fe..604c8e0a9248 100644
--- a/tools/RunGen.cpp
+++ b/tools/RunGen.cpp
@@ -424,6 +424,30 @@ Shape parse_extents(const std::string &extent_list) {
     return result;
 }
 
+// Given a Buffer<>, return its shape in the form of a vector.
+// (Oddly, Buffer<> has no API to do this directly.)
+Shape get_shape(const Buffer<> &b) {
+    Shape s;
+    for (int i = 0; i < b.dimensions(); ++i) {
+        s.push_back(b.raw_buffer()->dim[i]);
+    }
+    return s;
+}
+
+// Given a type and shape, create a new Buffer<> but *don't* allocate storage for it.
+Buffer<> make_with_shape(const halide_type_t &type, const Shape &shape) {
+    return Buffer<>(type, nullptr, (int) shape.size(), &shape[0]);
+}
+
+// Given a type and shape, create a new Buffer<> and allocate storage for it.
+// (Oddly, Buffer<> has an API to do this with vector-of-extent, but not vector-of-halide_dimension_t.)
+Buffer<> allocate_buffer(const halide_type_t &type, const Shape &shape) {
+    Buffer<> b = make_with_shape(type, shape);
+    b.check_overflow();
+    b.allocate();
+    return b;
+}
+
 // BEGIN TODO: hacky algorithm inspired by Safelight
 // (should really use the algorithm from AddImageChecks to come up with something more rigorous.)
 Shape choose_output_extents(int dimensions, const Shape &defaults) {
@@ -438,7 +462,72 @@ Shape choose_output_extents(int dimensions, const Shape &defaults) {
     return s;
 }
 
-Shape fix_bounds_query_shape(const Shape &constrained_shape) {
+void fix_chunky_strides(const Shape &constrained_shape, Shape *new_shape) {
+    // Special-case Chunky: most "chunky" generators tend to constrain stride[0]
+    // and stride[2] to exact values, leaving stride[1] unconstrained;
+    // in practice, we must ensure that stride[1] == stride[0] * extent[0]
+    // and stride[0] = extent[2] to get results that are not garbled.
+    // This is unpleasantly hacky and will likely need additional enhancements.
+    // (Note that there are, theoretically, other stride combinations that might
+    // need fixing; in practice, ~all generators that aren't planar tend
+    // to be classically chunky.)
+    if (new_shape->size() >= 3) {
+        if (constrained_shape[2].stride == 1) {
+            if (constrained_shape[0].stride >= 1) {
+                // If we have stride[0] and stride[2] set to obviously-chunky,
+                // then force extent[2] to match stride[0].
+                (*new_shape)[2].extent = constrained_shape[0].stride;
+            } else {
+                // If we have stride[2] == 1 but stride[0] <= 1,
+                // force stride[0] = extent[2]
+                (*new_shape)[0].stride = (*new_shape)[2].extent;
+            }
+            // Ensure stride[1] is reasonable.
+            (*new_shape)[1].stride = (*new_shape)[0].extent * (*new_shape)[0].stride;
+        }
+    }
+}
+
+// Given a constraint Shape (generally produced by a bounds query), update
+// the input Buffer to meet those constraints, allocating and copying into
+// a new Buffer if necessary.
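+// For example, if a generator constrains a 3-channel input to be interleaved
+// (stride[0] == 3, stride[2] == 1) but the image was loaded in planar layout,
+// the data is copied into a freshly allocated interleaved Buffer.
+// Returns true iff the buffer's layout was changed.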
+bool adapt_input_buffer_layout(const Shape &constrained_shape, Buffer<> *buf) {
+    bool shape_changed = false;
+    Shape new_shape = get_shape(*buf);
+    if (new_shape.size() != constrained_shape.size()) {
+        fail() << "Dimension mismatch";
+    }
+    for (size_t i = 0; i < constrained_shape.size(); ++i) {
+        // A nonzero min in the constraint caps the min we may use.
+        if (constrained_shape[i].min != 0 && new_shape[i].min > constrained_shape[i].min) {
+            new_shape[i].min = constrained_shape[i].min;
+            shape_changed = true;
+        }
+        // A nonzero extent in the constraint caps the extent we may use.
+        if (constrained_shape[i].extent != 0 && new_shape[i].extent > constrained_shape[i].extent) {
+            new_shape[i].extent = constrained_shape[i].extent;
+            shape_changed = true;
+        }
+        // A stride of 0 means "no constraint"; otherwise the stride must match exactly.
+        if (constrained_shape[i].stride != 0 && new_shape[i].stride != constrained_shape[i].stride) {
+            new_shape[i].stride = constrained_shape[i].stride;
+            shape_changed = true;
+        }
+    }
+    if (shape_changed) {
+        fix_chunky_strides(constrained_shape, &new_shape);
+        Buffer<> new_buf = allocate_buffer(buf->type(), new_shape);
+        new_buf.copy_from(*buf);
+        *buf = new_buf;
+    }
+    return shape_changed;
+}
+
+// Given a constraint Shape (generally produced by a bounds query), create a new
+// Shape that can legally be used to create and allocate a new Buffer:
+// ensure that extents/strides aren't zero, do some reality checking
+// on planar vs interleaved, and generally try to guess at a reasonable result.
+Shape make_legal_output_buffer_shape(const Shape &constrained_shape) {
     Shape new_shape = constrained_shape;
 
     // Make sure that the extents and strides for these are nonzero.
@@ -462,38 +551,16 @@ Shape fix_bounds_query_shape(const Shape &constrained_shape) {
         }
     }
 
-    // Special-case Chunky: most "chunky" generators tend to constrain stride[0]
-    // and stride[2] to exact values, leaving stride[1] unconstrained;
-    // in practice, we must ensure that stride[1] == stride[0] * extent[0]
-    // and stride[0] = extent[2] to get results that are not garbled.
-    // This is unpleasantly hacky and will likely need aditional enhancements.
-    // (Note that there are, theoretically, other stride combinations that might
-    // need fixing; in practice, ~all generators that aren't planar tend
-    // to be classically chunky.)
-    if (new_shape.size() >= 3) {
-        if (constrained_shape[2].stride == 1) {
-            if (constrained_shape[0].stride >= 1) {
-                // If we have stride[0] and stride[2] std::set to obviously-chunky,
-                // then force extent[2] to match stride[0].
-                new_shape[2].extent = constrained_shape[0].stride;
-            } else {
-                // If we have stride[2] == 1 but stride[0] <= 1,
-                // force stride[0] = extent[2]
-                new_shape[0].stride = new_shape[2].extent;
-            }
-            // Ensure stride[1] is reasonable.
-            new_shape[1].stride = new_shape[0].extent * new_shape[0].stride;
-        }
-    }
+    fix_chunky_strides(constrained_shape, &new_shape);
 
     // If anything else is zero, just set strides to planar and hope for the best.
-    bool zero_strides = false;
+    bool any_strides_zero = false;
     for (size_t i = 0; i < new_shape.size(); ++i) {
         if (!new_shape[i].stride) {
-            zero_strides = true;
+            any_strides_zero = true;
         }
     }
-    if (zero_strides) {
+    if (any_strides_zero) {
         // Planar
         new_shape[0].stride = 1;
         for (size_t i = 1; i < new_shape.size(); ++i) {
@@ -504,25 +571,6 @@ Shape fix_bounds_query_shape(const Shape &constrained_shape) {
 }
 // END TODO: hacky algorithm inspired by Safelight
 
-// Given a Buffer<>, return its shape in the form of a vector.
-// (Oddly, Buffer<> has no API to do this directly.)
-Shape get_shape(const Buffer<> &b) {
-    Shape s;
-    for (int i = 0; i < b.dimensions(); ++i) {
-        s.push_back(b.raw_buffer()->dim[i]);
-    }
-    return s;
-}
-
-// Given a type and shape, create a new Buffer<> and allocate storage for it.
-// (Oddly, Buffer<> has an API to do this with vector-of-extent, but not vector-of-halide_dimension_t.)
-Buffer<> allocate_buffer(const halide_type_t &type, const Shape &shape) {
-    Buffer<> b(type, nullptr, (int) shape.size(), &shape[0]);
-    b.check_overflow();
-    b.allocate();
-    return b;
-}
-
 // Return true iff all of the dimensions in the range [first, last] have an extent of <= 1.
 bool dims_in_range_are_trivial(const Buffer<> &b, int first, int last) {
     for (int d = first; d <= last; ++d) {
@@ -606,6 +654,79 @@ Buffer<> load_input(const std::string &pathname,
     return Buffer<>();
 }
 
+struct ArgData {
+    size_t index{0};
+    const halide_filter_argument_t *metadata{nullptr};
+    std::string raw_string;
+    halide_scalar_value_t scalar_value;
+    Buffer<> buffer_value;
+};
+
+// Run a bounds-query call with the given args, and return the shapes
+// to which we are constrained.
+std::vector<Shape> run_bounds_query(const std::map<std::string, ArgData> &args,
+                                    const Shape &default_output_shape) {
+    std::vector<void *> filter_argv(args.size(), nullptr);
+    // These vectors are larger than needed, but this simplifies the logic downstream.
+    std::vector<Buffer<>> bounds_query_buffers(args.size());
+    std::vector<Shape> constrained_shapes(args.size());
+    for (auto &arg_pair : args) {
+        auto &arg = arg_pair.second;
+        switch (arg.metadata->kind) {
+        case halide_argument_kind_input_scalar:
+            filter_argv[arg.index] = const_cast<halide_scalar_value_t *>(&arg.scalar_value);
+            break;
+        case halide_argument_kind_input_buffer:
+        case halide_argument_kind_output_buffer: {
+            Shape shape = (arg.metadata->kind == halide_argument_kind_input_buffer) ?
+                get_shape(arg.buffer_value) :
+                choose_output_extents(arg.metadata->dimensions, default_output_shape);
+            bounds_query_buffers[arg.index] = make_with_shape(arg.metadata->type, shape);
+            filter_argv[arg.index] = bounds_query_buffers[arg.index].raw_buffer();
+            break;
+        }
+        }
+    }
+
+    info() << "Running bounds query...";
+    // Ignore result since our halide_error() should catch everything.
+    (void) halide_rungen_redirect_argv(&filter_argv[0]);
+
+    for (auto &arg_pair : args) {
+        auto &arg = arg_pair.second;
+        switch (arg.metadata->kind) {
+        case halide_argument_kind_input_scalar:
+            break;
+        case halide_argument_kind_input_buffer:
+        case halide_argument_kind_output_buffer:
+            constrained_shapes[arg.index] = get_shape(bounds_query_buffers[arg.index]);
+            break;
+        }
+    }
+    return constrained_shapes;
+}
+
+uint64_t calc_pixels_out(const std::map<std::string, ArgData> &args) {
+    uint64_t pixels_out = 0;
+    for (auto &arg_pair : args) {
+        auto &arg = arg_pair.second;
+        switch (arg.metadata->kind) {
+        case halide_argument_kind_output_buffer: {
+            // TODO: this assumes that most output is "pixel-ish", and counting the size of the first
+            // two dimensions approximates the "pixel size". This is not, in general, a valid assumption,
+            // but is a useful metric for benchmarking.
+            Shape shape = get_shape(arg.buffer_value);
+            if (shape.size() >= 2) {
+                pixels_out += shape[0].extent * shape[1].extent;
+            } else {
+                pixels_out += shape[0].extent;
+            }
+            break;
+        }
+        }
+    }
+    return pixels_out;
+}
+
 void usage(const char *argv0) {
     const std::string usage = R"USAGE(
 Usage: $NAME$ argument=value [argument=value... ] [flags]
@@ -680,6 +801,10 @@ Usage: $NAME$ argument=value [argument=value... ] [flags]
         Override the default number of benchmarking iterations; ignored if
         --benchmark is not also specified.
 
+    --benchmark_warmup=NUM:
+        Number of iterations to run before timing, to warm up caches; ignored if
+        --benchmark is not also specified.
+
     --track_memory:
         Override Halide memory allocator to track high-water mark of memory
         allocation during run; note that this may slow down execution, so
@@ -753,14 +878,6 @@ int main(int argc, char **argv) {
 
     const halide_filter_metadata_t *md = halide_rungen_redirect_metadata();
 
-    struct ArgData {
-        size_t index{0};
-        const halide_filter_argument_t *metadata{nullptr};
-        std::string raw_string;
-        halide_scalar_value_t scalar_value;
-        Buffer<> buffer_value;
-    };
-
     std::map<std::string, ArgData> args;
     std::set<std::string> found;
     for (size_t i = 0; i < (size_t) md->num_arguments; ++i) {
@@ -778,13 +895,13 @@ int main(int argc, char **argv) {
     }
 
     Shape default_output_shape;
-    std::vector<void *> filter_argv(md->num_arguments, nullptr);
     std::vector<std::string> unknown_args;
     bool benchmark = false;
     bool track_memory = false;
    bool describe = false;
     int benchmark_samples = 3;
     int benchmark_iterations = 10;
+    int benchmark_warmup = 1;
     for (int i = 1; i < argc; ++i) {
         if (argv[i][0] == '-') {
             const char *p = argv[i] + 1; // skip -
@@ -840,6 +957,10 @@ int main(int argc, char **argv) {
                 if (!parse_scalar(flag_value, &benchmark_iterations)) {
                     fail() << "Invalid value for flag: " << flag_name;
                 }
+            } else if (flag_name == "benchmark_warmup") {
+                if (!parse_scalar(flag_value, &benchmark_warmup)) {
+                    fail() << "Invalid value for flag: " << flag_name;
+                }
             } else if (flag_name == "output_extents") {
                 default_output_shape = parse_extents(flag_value);
             } else {
@@ -918,7 +1039,6 @@ int main(int argc, char **argv) {
                            << arg.metadata->type << ": " << arg.raw_string;
                 }
-                filter_argv[arg.index] = &arg.scalar_value;
                 break;
             }
             case halide_argument_kind_input_buffer: {
@@ -930,7 +1050,6 @@ int main(int argc, char **argv) {
                 if (default_output_shape.empty()) {
                     default_output_shape = get_shape(arg.buffer_value);
                 }
-                filter_argv[arg.index] = arg.buffer_value.raw_buffer();
                 break;
             }
             case halide_argument_kind_output_buffer:
@@ -939,51 +1058,35 @@ int main(int argc, char **argv) {
         }
     }
 
-    // Run a bounds query, so we can allocate output buffers appropriately.
-    {
-        for (auto &arg_pair : args) {
-            auto &arg = arg_pair.second;
-            switch (arg.metadata->kind) {
-                case halide_argument_kind_output_buffer:
-                    auto bounds_query_shape = choose_output_extents(arg.metadata->dimensions, default_output_shape);
-                    arg.buffer_value = Buffer<>(arg.metadata->type, nullptr, (int) bounds_query_shape.size(), &bounds_query_shape[0]);
-                    filter_argv[arg.index] = arg.buffer_value.raw_buffer();
-                    break;
-            }
-        }
-
-        info() << "Running bounds query...";
-        int result = halide_rungen_redirect_argv(&filter_argv[0]);
-        if (result != 0) {
-            fail() << "Bounds query failed with result code: " << result;
-        }
-    }
+    // Run a bounds query: we need to figure out how to allocate the output buffers,
+    // and the input buffers might need reshaping to satisfy constraints (e.g. a chunky/interleaved layout).
+    std::vector<Shape> constrained_shapes = run_bounds_query(args, default_output_shape);
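+    // Note: constrained_shapes is indexed by argument position (arg.index);
+    // entries for scalar arguments are left default-constructed and are ignored below.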
-    // Allocate the output buffers we'll need.
-    double pixels_out = 0.f;
     for (auto &arg_pair : args) {
         auto &arg_name = arg_pair.first;
         auto &arg = arg_pair.second;
+        const Shape &constrained_shape = constrained_shapes[arg.index];
         switch (arg.metadata->kind) {
-            case halide_argument_kind_output_buffer:
-                auto constrained_shape = get_shape(arg.buffer_value);
-                info() << "Output " << arg_name << ": BoundsQuery result is " << constrained_shape;
-                Shape shape = fix_bounds_query_shape(constrained_shape);
-                arg.buffer_value = allocate_buffer(arg.metadata->type, shape);
-                info() << "Output " << arg_name << ": Shape is " << get_shape(arg.buffer_value);
-                filter_argv[arg.index] = arg.buffer_value.raw_buffer();
-                // TODO: this assumes that most output is "pixel-ish", and counting the size of the first
-                // two dimensions approximates the "pixel size". This is not, in general, a valid assumption,
-                // but is a useful metric for benchmarking.
-                if (shape.size() >= 2) {
-                    pixels_out += shape[0].extent * shape[1].extent;
-                } else {
-                    pixels_out += shape[0].extent;
-                }
-                break;
+        case halide_argument_kind_input_buffer: {
+            info() << "Input " << arg_name << ": Shape is " << get_shape(arg.buffer_value);
+            bool updated = adapt_input_buffer_layout(constrained_shape, &arg.buffer_value);
+            info() << "Input " << arg_name << ": BoundsQuery result is " << constrained_shape;
+            if (updated) {
+                info() << "Input " << arg_name << ": Updated Shape is " << get_shape(arg.buffer_value);
+            }
+            break;
+        }
+        case halide_argument_kind_output_buffer: {
+            arg.buffer_value = allocate_buffer(arg.metadata->type, make_legal_output_buffer_shape(constrained_shape));
+            info() << "Output " << arg_name << ": BoundsQuery result is " << constrained_shape;
+            info() << "Output " << arg_name << ": Shape is " << get_shape(arg.buffer_value);
+            break;
+        }
         }
     }
-    double megapixels = pixels_out / (1024.0 * 1024.0);
+
+    uint64_t pixels_out = calc_pixels_out(args);
+    double megapixels = (double) pixels_out / (1024.0 * 1024.0);
 
     // If we're tracking memory, install the memory tracker *after* doing a bounds query.
     HalideMemoryTracker tracker;
@@ -991,12 +1094,25 @@ int main(int argc, char **argv) {
         tracker.install();
     }
 
-    if (benchmark) {
-        info() << "Benchmarking filter...";
+    {
+        std::vector<void *> filter_argv(args.size(), nullptr);
+        for (auto &arg_pair : args) {
+            auto &arg = arg_pair.second;
+            switch (arg.metadata->kind) {
+            case halide_argument_kind_input_scalar:
+                filter_argv[arg.index] = &arg.scalar_value;
+                break;
+            case halide_argument_kind_input_buffer:
+            case halide_argument_kind_output_buffer:
+                filter_argv[arg.index] = arg.buffer_value.raw_buffer();
+                break;
+            }
+        }
 
-        // Run once to warm up cache. Ignore result since our halide_error() should catch everything.
-        (void) halide_rungen_redirect_argv(&filter_argv[0]);
+        if (benchmark) {
+            info() << "Benchmarking filter...";
+
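+            // Warm up the cache before timing; the number of warmup iterations
+            // is controlled by --benchmark_warmup (default 1).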
+            for (int i = 0; i < benchmark_warmup; ++i) {
+                // Ignore result since our halide_error() should catch everything.
+                (void) halide_rungen_redirect_argv(&filter_argv[0]);
+            }
 
-        double time_in_seconds = Halide::Tools::benchmark(benchmark_samples, benchmark_iterations, [&filter_argv, &args]() {
-            (void) halide_rungen_redirect_argv(&filter_argv[0]);
-            // Ensure that all outputs are finished, otherwise we may just be
-            // measuring how long it takes to do a kernel launch for GPU code.
-            for (auto &arg_pair : args) {
-                auto &arg = arg_pair.second;
-                if (arg.metadata->kind == halide_argument_kind_output_buffer) {
-                    Buffer<> &b = arg.buffer_value;
-                    b.device_sync();
-                }
-            }
-        });
+            double time_in_seconds = Halide::Tools::benchmark(benchmark_samples, benchmark_iterations, [&filter_argv, &args]() {
+                // Ignore result since our halide_error() should catch everything.
+                (void) halide_rungen_redirect_argv(&filter_argv[0]);
+                // Ensure that all outputs are finished, otherwise we may just be
+                // measuring how long it takes to do a kernel launch for GPU code.
+                for (auto &arg_pair : args) {
+                    auto &arg = arg_pair.second;
+                    if (arg.metadata->kind == halide_argument_kind_output_buffer) {
+                        Buffer<> &b = arg.buffer_value;
+                        b.device_sync();
+                    }
+                }
+            });
 
-        std::cout << "Benchmark for " << md->name << " produces best case of " << time_in_seconds << " sec/iter, over "
-                  << benchmark_samples << " blocks of " << benchmark_iterations << " iterations.\n";
-        std::cout << "Best output throughput is " << (megapixels / time_in_seconds) << " mpix/sec.\n";
-
-    } else {
-        info() << "Running filter...";
-        int result = halide_rungen_redirect_argv(&filter_argv[0]);
-        if (result != 0) {
-            fail() << "Filter failed with result code: " << result;
+            std::cout << "Benchmark for " << md->name << " produces best case of " << time_in_seconds << " sec/iter, over "
+                      << benchmark_samples << " blocks of " << benchmark_iterations << " iterations.\n";
+            std::cout << "Best output throughput is " << (megapixels / time_in_seconds) << " mpix/sec.\n";
+
+        } else {
+            info() << "Running filter...";
+            // Ignore result since our halide_error() should catch everything.
+            (void) halide_rungen_redirect_argv(&filter_argv[0]);
         }
     }