From 1a370e7ec3c8a7c2fd00e8d8ed60f01e75dc7614 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 17 Dec 2024 21:18:32 -0800
Subject: [PATCH 1/9] Exclusiveness analysis

---
 csrc/scheduler/resize.cpp             |  36 +--
 csrc/scheduler/tools/resize_utils.cpp | 106 ++++++++
 csrc/scheduler/tools/resize_utils.h   |  79 ++++++
 tests/cpp/test_gpu3.cpp               |   2 -
 tests/cpp/test_resize.cpp             | 356 +++++++++++++++++++++++++-
 5 files changed, 533 insertions(+), 46 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index fc96bd3db67..ce1513f47af 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -71,42 +71,16 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& broadcast_graph = id_model.buildBroadcastGraph();
 
-  // For now, only a single resize op is allowed to exist.
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
-  if (resize_based_tensor_ops.size() != 1) {
+
+  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
+          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
+      !non_exclusive_resizes.empty()) {
     scheduler_debug_utils::canScheduleRejectReason(
-        schedulerType(), "Only a single resize op is allowed.");
+        schedulerType(), "Not exclusively consumed.");
     return false;
   }
 
-  auto resize_out_tv =
-      resize_based_tensor_ops.at(0)->output(0)->as<TensorView>();
-
-  auto all_dep_vals = DependencyCheck::getAllValsBetween(
-      {fusion->inputs().begin(), fusion->inputs().end()}, {resize_out_tv});
-  for (auto tv : ir_utils::filterByType<TensorView>(all_dep_vals)) {
-    if (tv == resize_out_tv) {
-      continue;
-    }
-    if (tv->isFusionOutput()) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          schedulerType(),
-          "Dependency to fusion output not allowed: ",
-          tv->toString());
-      return false;
-    }
-    for (auto consumer_of_tv : ir_utils::consumerTvsOf(tv)) {
-      if (std::find(all_dep_vals.begin(), all_dep_vals.end(), consumer_of_tv) ==
-          all_dep_vals.end()) {
-        scheduler_debug_utils::canScheduleRejectReason(
-            schedulerType(),
-            "Resize inputs must be exclusively consumed by resize: ",
-            consumer_of_tv->toString());
-        return false;
-      }
-    }
-  }
-
   // Slicing of or to a broadcast ID is not allowed yet.
   for (auto tensor_op : resize_based_tensor_ops) {
     TensorView* out_tv = tensor_op->output(0)->as<TensorView>();

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index cc914e5684b..f1206f99676 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -66,5 +66,111 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }
 
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph) {
+  NVF_ERROR(!ordered_resize_tensor_ops.empty());
+  Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
+
+  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+
+  std::unordered_set<Val*> inputs{
+      fusion->inputs().begin(), fusion->inputs().end()};
+
+  auto get_root_to_logical_resizes =
+      [&exact_graph](TensorView* tv) -> ValGroups {
+    auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
+        {tv->getRootDomain().begin(), tv->getRootDomain().end()},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    ValGroups resize_inp_ids;
+    for (auto resize :
+         ir_utils::filterByType<Resize>(out_tv_root_to_logical_exprs)) {
+      resize_inp_ids.pushBack(exact_graph.toGroup(resize->in()));
+    }
+    return resize_inp_ids;
+  };
+
+  // Traverse the ops in a topological order
+  for (Expr* resize_tensor_op : ordered_resize_tensor_ops) {
+    auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
+    auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
+
+    ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
+    NVF_ERROR(!resize_inp_ids.empty());
+
+    auto dep_vals =
+        DependencyCheck::getAllValsBetween(inputs, std::vector<Val*>{inp_tv});
+
+    // For each tensor that inp_tv depends on, check if the resize op
+    // is considered non-exclusive with respect to the tensor. That
+    // is, if propagation of the resize may result in externally
+    // visible changes through the tensor, the resize is considered
+    // non-exclusive.
+    for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
+      bool maybe_non_exclusive = false;
+
+      if (dep_tv->isFusionOutput()) {
+        maybe_non_exclusive = true;
+      }
+
+      if (!maybe_non_exclusive) {
+        // If a dependent tv has a consumer that inp_tv does not
+        // depend on, propagation of resize would escape to outputs,
+        // which needs to be avoided.
+        for (auto consumer_tv : ir_utils::consumerTvsOf(dep_tv)) {
+          // We are interested in if resized IDs are used by other tensors
+          // than out_tv
+          if (consumer_tv != out_tv &&
+              std::find(dep_vals.begin(), dep_vals.end(), consumer_tv) ==
+                  dep_vals.end()) {
+            maybe_non_exclusive = true;
+            break;
+          }
+        }
+      }
+
+      if (!maybe_non_exclusive) {
+        continue;
+      }
+
+      // dep_tv potentially is either a fusion output or it has a
+      // consumer outside of the dependency set to the resized
+      // tensor. Propagating the resize to dep_tv should be
+      // avoided. However, if the dep_tv iter domain that corresponds
+      // to the resized ID is a broadcast or there's no such ID, it
+      // should still be safe to consider the resize op exclusive as
+      // there's no iter domain to resize.
+      const auto inp_tv_logical_groups =
+          exact_graph.toGroups(inp_tv->getLogicalDomain());
+      const auto dep_tv_logical_groups =
+          exact_graph.toGroups(dep_tv->getLogicalDomain());
+      auto vals_between = getValsBetween(
+          {inp_tv_logical_groups.begin(), inp_tv_logical_groups.end()},
+          {dep_tv_logical_groups.begin(), dep_tv_logical_groups.end()},
+          exact_graph);
+
+      for (const ValGroup& resize_inp_id : resize_inp_ids) {
+        if (std::find(
+                vals_between.begin(), vals_between.end(), resize_inp_id) ==
+            vals_between.end()) {
+          // This resize can be ignored as there's no corresponding ID
+          // in the dep tv
+          continue;
+        }
+
+        // This resize input ID is not exclusively used
+        non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      }
+    }
+
+    // Analysis of exclusiveness until in_tv is done. Following
+    // resize-based tensor ops do not need to check the same section
+    // of the fusion and can start from out_tv.
+    inputs.insert(out_tv);
+  }
+
+  return non_exclusive_resizes;
+}
+
 } // namespace scheduler_tools
 } // namespace nvfuser

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index cf03083ad4f..1245f166c65 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -7,9 +7,12 @@
 // clang-format on
 #pragma once
 
+#include <val_graph.h>
+
 namespace nvfuser {
 
 class Expr;
+class TensorView;
 
 namespace scheduler_tools {
 
@@ -19,5 +22,81 @@ namespace scheduler_tools {
 // fusion inputs are skipped as their loop domains don't matter.
 void propagateResizeToInputs(Expr* resize_op);
 
+// Given a topologically ordered list of resize-based tensor ops such
+// as slice and pad, check if they can be propagated to fusion inputs
+// exclusively without causing any visible side effect. For example,
+// if a tensor is sliced and also is used to produce an output without
+// the slicing, the slice is considered non exclusive as the slice
+// input has another visible consumer. Propagating the resize of the
+// slice to the slice input is invalid since the output computed from
+// the slice input depends on the full iteration space.
+//
+// For example, consider the following case:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t1 + 1
+// fusion.addOutput(t2)
+// fusion.addOutput(t3)
+//
+// In this case, propagating the resize op of the slice would alter t1,
+// which would in turn affect t3, which is a fusion output. Since the
+// change would be visible due to the change of t3, this resie op is
+// considered non-exclusive.
+//
+// Consider a slightly different case as shown below:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0[1:10]
+// t2 = t0 + 1
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// Note that the slice is directly done with the fusion input. Since
+// we do not propagate resize ops to fusion inputs, this can be
+// considered exclusive. However, this is also considered
+// non-exclusive since the actual scheduling inserts a cache after t0,
+// which can cause a visible side effect if the resize is propagated.
+//
+// Another non-exclusivess comes from dependent fusion outputs. For
+// example, if a slice input depends on a fusion output, propation
+// would alter the fusion output. Consider a case like:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10] // slice
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// If the resize op for the slice is propagated to t1, only the
+// section of [1:10] would be computed. Since that would change a
+// fusion output, the resize op is considered non-exclusive.
+//
+// When there's a chain of resize-based ops, for example:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t2[2:5]
+// t4 = t1 + 1
+// fusion.addOutput(t3)
+// fusion.addOutput(t4)
+//
+// We do not consider the second slice as non-exclusive as
+// long as the first slice is considered non-exclusive. This will be
+// important when resolving the non-exclusiveness by replication.
+//
+// The function returns a map from tensors that are input to
+// non-exclusive ops to their resize input ID groups. This map will be
+// used to resolve the non-exclusiveness by replication.
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph);
+
 } // namespace scheduler_tools
 } // namespace nvfuser

diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index 66087eab2f5..76d45f6de4c 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -9249,8 +9249,6 @@ TEST_F(NVFuserTest, AllIdsMultipleDependencies) {
   tv1->split(0, 4);
   tv1->split(0, 8);
 
-  fusion.print();
-
   auto all_ids = tv1->domain()->allIDs();
 
   auto split2 = tv1->axis(0)->definition()->as<Split>();

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index fb6c74512f2..1c3ca839baa 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4282,7 +4282,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) {
   }
 }
 
-TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
+TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) {
   auto fusion_ptr = std::make_unique<Fusion>();
   Fusion& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -4368,7 +4368,251 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
-    GTEST_SKIP() << "Scheduling not yet supported";
+    // Make sure all slices are detected as exclusive
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
+
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+    testValidate(
+        executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+    FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+    EXPECT_FALSE(runtime->isSegmented());
+    const auto& heuristic_param =
+        runtime->schedulerHeuristics()->heuristicsList().front();
+    EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+    Fusion* scheduled_fusion =
+        dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
+            ->fusion();
+    checkLoopDomainEquivalence(
+        scheduled_fusion->outputs().at(0)->as<TensorView>());
+  }
+}
+
+// Two horizontal slices, both of which slice the same iter domain.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = sin(tv2);
+
+  auto tv4 = sin(tv1);
+
+  auto tv5 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(2L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv6 = sin(tv5);
+
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv6);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv1 is the input of the first slice, which is not exclusive as
+  // tv1 is also a producer of tv4.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv1),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+
+  // Similarly, tv4 is the input of the second slice, which is not exclusive as
+  // tv1 is also a producer of tv2.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv4),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)}));
+}
+
+// Non-exclusive slice due to a dependency to a fusion output
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({-1});
+  fusion.addInput(tv1);
+
+  auto tv2 = sin(tv0);
+
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv2, broadcast(tv1, {false, true}));
+
+  auto tv4 = slice(
+      tv3,
+      {{fusion.zeroVal(), tv3->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv3->getLogicalDomain().at(1)->extent()}});
+
+  auto tv5 = sin(tv4);
+
+  fusion.addOutput(tv5);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv3 is the input of the slice, which is not exclusive as
+  // tv3 depends on tv2, which is a fusion output
+  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv3),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+}
+
+// Slice input tensor depends on a fusion output, but the slice is
+// still considered exclusive as the fusion output has no
+// corresponding ID for the sliced ID. Note that scheduling is not yet
+// supported due to the existence of the dependency from the slice input
+// ID to the broadcast ID.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs4) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({shape[0]});
+  fusion.addInput(tv1);
+
+  auto tv2 = sin(tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv0, broadcast(tv2, {false, true}));
+
+  auto tv4 = slice(
+      tv3,
+      {{fusion.zeroVal(), tv3->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv3->getLogicalDomain().at(1)->extent()}});
+
+  auto tv5 = sin(tv4);
+
+  fusion.addOutput(tv5);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  EXPECT_TRUE(non_exclusive_resize_info.empty());
+}
+
+// Testing chained slices. Should be considered exclusive
+TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = slice(
+      tv2,
+      {{fusion.zeroVal(), tv2->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(3L), tv2->getLogicalDomain().at(1)->extent()}});
+
+  auto tv4 = sin(tv3);
+
+  fusion.addOutput(tv4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  const bool use_scheduler = GetParam();
+
+  if (!use_scheduler) {
+    scheduler_tools::propagateResizeToInputs(tv2->definition());
+    scheduler_tools::propagateResizeToInputs(tv3->definition());
+    auto ref_tv = tv4;
+
+    // Fusion should have a uniform loop domain
+    checkLoopDomainEquivalence(ref_tv);
+
+    // Schedule the reference
+    ref_tv->flatten();
+    // For TIDx
+    ref_tv->split(0, 128);
+    // For BIDx
+    ref_tv->split(0, 4);
+
+    scheduler_tools::scheduleLoopDomainsLike(
+        fusion.allTvs(), ref_tv->getLoopDomain());
+
+    // Fusion should still have a uniform loop domain
+    checkLoopDomainEquivalence(ref_tv);
+
+    inlineMost();
+
+    // All tensors, except for fusion inputs, should be fully inlined
+    for (auto tv : fusion.allTvs()) {
+      if (tv->isFusionInput()) {
+        continue;
+      }
+      EXPECT_EQ(tv->getComputeAtPosition(), tv->nDims());
+    }
+
+    ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
+    ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
+
+    KernelExecutor ke;
+    ke.compile(&fusion, inputs);
+    auto outputs = ke.run(inputs);
+    testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
+  } else {
+    // The two slices do not conflict
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
@@ -4387,6 +4631,52 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
   }
 }
 
+// Testing chained slices. The first slice is considered
+// non-exclusive, but the following slice should not be.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs6) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = slice(
+      tv2,
+      {{fusion.zeroVal(), tv2->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(3L), tv2->getLogicalDomain().at(1)->extent()}});
+
+  auto tv4 = sin(tv3);
+  fusion.addOutput(tv4);
+
+  auto tv5 = sin(tv1);
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  // The two slices do not conflict
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+  EXPECT_EQ(non_exclusive_resize_info.size(), 1);
+  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv1),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+}
+
 // RoPE-like rotation pattern
 TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   auto fusion_ptr = std::make_unique<Fusion>();
@@ -4451,16 +4741,6 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   // For BIDx
   ref_tv->split(0, 4);
 
-  {
-    IdModel id_model(&fusion, false);
-    id_model.buildExactGraph();
-    std::ofstream ofs("exact_graph.dot", std::ofstream::trunc);
-    auto dot_string =
-        id_model.idGraph(IdMappingMode::EXACT).toGraphvizDotGraph();
-    ofs << dot_string;
-    ofs.close();
-  }
-
   scheduler_tools::scheduleLoopDomainsLike(
       fusion.allTvs(), ref_tv->getLoopDomain(), /*update_mode=*/true);
 
@@ -4485,6 +4765,26 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    // tv1 is not considered exclusive as tv0 is also a producer of
+    // tv3. Same for tv3. While the common input, tv0, is a fusion
+    // input and is not itself scheduled, a cache tensor is inserted
+    // and is indeed scheduled, so the two slices do conflict.
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv1),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv3),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    // These two entries should be all the info map has.
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
     GTEST_SKIP() << "Scheduling not yet supported";
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
@@ -4605,6 +4905,26 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    // tv1 is not considered exclusive as tv0 is also a producer of
+    // tv3. Same for tv3. While the common input, tv0, is a fusion
+    // input and is not itself scheduled, a cache tensor is inserted
+    // and is indeed scheduled, so the two slices do conflict.
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv1),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv3),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    // These two entries should be all the info map has.
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
     GTEST_SKIP() << "Scheduling not yet supported";
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
@@ -4691,6 +5011,12 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
+
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(
@@ -4787,7 +5113,11 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
-    GTEST_SKIP() << "Scheduling not yet supported";
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);

From ac5a1bc2f2fc551197f65444e0108d9915c00280 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Wed, 18 Dec 2024 08:48:30 -0800
Subject: [PATCH 2/9] cleanup

---
 csrc/scheduler/resize.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index ce1513f47af..194087b90e8 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -76,8 +76,13 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
           resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
       !non_exclusive_resizes.empty()) {
-    scheduler_debug_utils::canScheduleRejectReason(
-        schedulerType(), "Not exclusively consumed.");
+    std::stringstream msg;
+    msg << "Propagation of resizes would affect fusion outputs.";
+    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
+      msg << " Resize input tv: " << tv->toString()
+          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
+    }
+    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
     return false;
   }
 
From 8b8c708d5dcbf06770810346deedc76801e04295 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:12:43 -0800
Subject: [PATCH 3/9] cleanup

---
 csrc/scheduler/tools/resize_utils.cpp | 9 ++++-----
 csrc/scheduler/tools/resize_utils.h   | 6 +++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index f1206f99676..4771a66cf31 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -79,6 +79,9 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
 
   auto get_root_to_logical_resizes =
       [&exact_graph](TensorView* tv) -> ValGroups {
+    // This should be only used for outputs of resize-based ops,
+    // so it should always have a root domain.
+    NVF_ERROR(tv->hasRoot());
     auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
         {tv->getRootDomain().begin(), tv->getRootDomain().end()},
         {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
@@ -107,11 +110,7 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
     // visible changes through the tensor, the resize is considered
     // non-exclusive.
     for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
-      bool maybe_non_exclusive = false;
-
-      if (dep_tv->isFusionOutput()) {
-        maybe_non_exclusive = true;
-      }
+      bool maybe_non_exclusive = dep_tv->isFusionOutput();
 
       if (!maybe_non_exclusive) {
         // If a dependent tv has a consumer that inp_tv does not

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index 1245f166c65..7b19062d6de 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -43,7 +43,7 @@ void propagateResizeToInputs(Expr* resize_op);
 //
 // In this case, propagating the resize op of the slice would alter t1,
 // which would in turn affect t3, which is a fusion output. Since the
-// change would be visible due to the change of t3, this resie op is
+// change would be visible due to the change of t3, this resize op is
 // considered non-exclusive.
 //
 // Consider a slightly different case as shown below:
@@ -61,8 +61,8 @@ void propagateResizeToInputs(Expr* resize_op);
 // non-exclusive since the actual scheduling inserts a cache after t0,
 // which can cause a visible side effect if the resize is propagated.
 //
-// Another non-exclusivess comes from dependent fusion outputs. For
-// example, if a slice input depends on a fusion output, propation
+// Another non-exclusiveness comes from dependent fusion outputs. For
+// example, if a slice input depends on a fusion output, propagation
 // would alter the fusion output. Consider a case like:
 //
 // t0 = makeSymbolicTensor(1)

From d364442808cce87532d24fe13b5b755a6d2ee25b Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:26:32 -0800
Subject: [PATCH 4/9] PR feedback

---
 csrc/scheduler/tools/resize_utils.cpp |  3 ++-
 tests/cpp/test_resize.cpp             | 11 ++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index 4771a66cf31..c812812b905 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -138,7 +138,8 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
       // avoided. However, if the dep_tv iter domain that corresponds
       // to the resized ID is a broadcast or there's no such ID, it
       // should still be safe to consider the resize op exclusive as
-      // there's no iter domain to resize.
+      // there's no iter domain to resize. For a concrete example, see
+      // ResizeSchedulerTest.PropagateMultipleSlicesToInputs4.
       const auto inp_tv_logical_groups =
           exact_graph.toGroups(inp_tv->getLogicalDomain());
       const auto dep_tv_logical_groups =
           exact_graph.toGroups(dep_tv->getLogicalDomain());

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 1c3ca839baa..f0a35dee3a7 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4496,9 +4496,14 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
 
 // Slice input tensor depends on a fusion output, but the slice is
 // still considered exclusive as the fusion output has no
-// corresponding ID for the sliced ID. Note that scheduling is not yet
-// supported due to the existence of the dependency from the slice input
-// ID to the broadcast ID.
+// corresponding ID for the sliced ID. More specifically, tv2 is a
+// fusion output and has a dependency to the input of the
+// slice. However, the resize is done for the second axis of tv3,
+// for which tv2 has no corresponding ID. In this case, it should be
+// safe to do the propagation of the resize.
+//
+// Note that scheduling is not yet supported due to the existence of
+// the dependency from the slice input ID to the broadcast ID.
 TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs4) {

From 7380a40b45003b5b42336479f2ca6e61245c249b Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:54:09 -0800
Subject: [PATCH 5/9] Resolve conflicts by recomputation

---
 csrc/scheduler/resize.cpp             |  61 ++++++--
 csrc/scheduler/tools/resize_utils.cpp |  14 +-
 csrc/scheduler/tools/resize_utils.h   |  16 +-
 tests/cpp/test_resize.cpp             | 208 ++++++++++++++++++++------
 4 files changed, 235 insertions(+), 64 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index 194087b90e8..dece9f79238 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -73,19 +73,6 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
 
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
-  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
-          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
-      !non_exclusive_resizes.empty()) {
-    std::stringstream msg;
-    msg << "Propagation of resizes would affect fusion outputs.";
-    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
-      msg << " Resize input tv: " << tv->toString()
-          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
-    }
-    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
-    return false;
-  }
-
   // Slicing of or to a broadcast ID is not allowed yet.
@@ -133,6 +120,30 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
     return false;
   }
 
+  for (auto out_tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
+    if (out_tv == ref_tv) {
+      continue;
+    }
+    auto exprs = ValGraphBFS::getExprGroupsBetween(
+                     broadcast_graph,
+                     broadcast_graph.toGroups(ref_tv->getLogicalDomain()),
+                     broadcast_graph.toGroups(out_tv->getLogicalDomain()),
+                     /*require_all_to_visited=*/false)
+                     .first;
+    for (const auto& [expr_g, dir] : exprs) {
+      if (expr_g->front()->isA<Resize>()) {
+        std::stringstream msg;
+        msg << "Resize between reference and output not allowed.";
+        msg << " Reference: " << ref_tv->toString()
+            << ". Output: " << out_tv->toString()
+            << ". Resize: " << expr_g->front()->toString();
+        scheduler_debug_utils::canScheduleRejectReason(
+            schedulerType(), msg.str());
+        return false;
+      }
+    }
+  }
+
   // Disable the scheduler if there's a squeeze op. The loop option
   // may also need to be enabled in that case, but that option is not
   // turned on automatically yet.
@@ -163,6 +174,21 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   scheduler_utils::cacheInputs(fusion, true);
   scheduler_utils::cacheAndForkOutputs(fusion, true);
 
+  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+
+  IdModel id_model(fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  // Replicate resize inputs if necessary to avoid conflicting propagations
+  for (const auto& [out_tv, exlusivity_info] :
+       scheduler_tools::getNonExclusiveResizeInfo(
+           resize_based_tensor_ops, exact_graph)) {
+    auto resize_based_op = out_tv->definition();
+    auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+    auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
+    ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
+  }
+
   for (auto expr : fusion->exprs()) {
     if (!expr->isOneOf<SliceOp, PadOp>()) {
       continue;
@@ -186,9 +212,14 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
   ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
 
-  // Propagate the reference to the other tensors
+  // Propagate the reference to the other tensors. Note that the
+  // update flag is enabled to work around the resize propagation
+  // issue. This may not work if there's a tensor that is reshaped
+  // from the reference tensor, but that should not be the case as the
+  // reference is picked by the same routine used for the pointwise
+  // scheduler.
   scheduler_tools::scheduleLoopDomainsLike(
-      fusion->allTvs(), ref_tv->getLoopDomain());
+      fusion->allTvs(), ref_tv->getLoopDomain(), true);
 
   inlineMost();

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index c812812b905..26ec36fa8ec 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -66,13 +66,13 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }
 
-std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     const std::vector<Expr*>& ordered_resize_tensor_ops,
     const ValGraph& exact_graph) {
   NVF_ERROR(!ordered_resize_tensor_ops.empty());
   Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
 
-  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+  std::unordered_map<TensorView*, ResizeExclusivityInfo> non_exclusive_resizes;
 
   std::unordered_set<Val*> inputs{
       fusion->inputs().begin(), fusion->inputs().end()};
@@ -98,6 +98,8 @@ std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
     auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
 
+    ResizeExclusivityInfo info;
+
     ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
     NVF_ERROR(!resize_inp_ids.empty());
 
@@ -159,10 +161,16 @@ std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
       }
 
       // This resize input ID is not exclusively used
-      non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      info.shared_tvs.push_back(dep_tv);
+      info.resized_ids.pushBack(resize_inp_id);
      }
    }
 
+    if (!info.shared_tvs.empty()) {
+      NVF_ERROR(non_exclusive_resizes.emplace(out_tv, info).second);
+    }
+
     // Analysis of exclusiveness until in_tv is done. Following
     // resize-based tensor ops do not need to check the same section
     // of the fusion and can start from out_tv.
     inputs.insert(out_tv);
   }

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index 7b19062d6de..b3704bb7326 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -94,7 +94,21 @@ void propagateResizeToInputs(Expr* resize_op);
 // The function returns a map from tensors that are input to
 // non-exclusive ops to their resize input ID groups. This map will be
 // used to resolve the non-exclusiveness by replication.
-std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+struct ResizeExclusivityInfo {
+  std::vector<TensorView*> shared_tvs;
+  ValGroups resized_ids;
+
+  bool operator==(const ResizeExclusivityInfo& other) const {
+    return shared_tvs == other.shared_tvs && resized_ids == other.resized_ids;
+  }
+
+  bool operator!=(const ResizeExclusivityInfo& other) const {
+    return !(*this == other);
+  }
+};
+
+std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     const std::vector<Expr*>& ordered_resize_tensor_ops,
     const ValGraph& exact_graph);

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index f0a35dee3a7..74e44c31841 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4427,25 +4427,83 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
   fusion.addOutput(tv3);
   fusion.addOutput(tv6);
 
-  IdModel id_model(&fusion, /*build_graphs=*/false);
-  const auto& exact_graph = id_model.buildExactGraph();
+  {
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
 
-  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
-      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-  // tv1 is the input of the first slice, which is not exclusive as
-  // tv1 is also a producer of tv4.
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    // tv2 is the output of the first slice, which is not exclusive as
+    // tv1 is also a producer of tv4.
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+    // Similarly, tv5 is the output of the second slice, which is not exclusive
+    // as tv1 is also a producer of tv2.
+    EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+    scheduler_tools::ResizeExclusivityInfo tv5_info{
+        {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+  }
 
-  // Similarly, tv4 is the input of the second slice, which is not exclusive as
-  // tv1 is also a producer of tv2.
-  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv4),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)}));
+  // Test replication-based mitigation of conflicts
+  {
+    Fusion fusion_copy = fusion;
+    FusionGuard fg(&fusion_copy);
+
+    auto tv0 = fusion_copy.inputs().at(0)->as<TensorView>();
+    auto tv2 =
+        fusion_copy.outputs().at(0)->definition()->input(0)->as<TensorView>();
+    auto slice = dynamic_cast<SliceOp*>(tv2->definition());
+    ASSERT_NE(slice, nullptr);
+    auto tv1 = slice->input(0)->as<TensorView>();
+    auto tv5 =
+        fusion_copy.outputs().at(1)->definition()->input(0)->as<TensorView>();
+    auto tv4 = tv5->definition()->input(0)->as<TensorView>();
+
+    // Replicate tv1 for tv2
+    auto private_copy = RecomputeTv::recompute(tv1);
+    ir_utils::replaceValInExprInputs(slice, tv1, private_copy);
+
+    // The two slices should still be reported as non-exclusive but they
+    // both are shared at the fusion input.
+    IdModel id_model(&fusion_copy, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion_copy), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv0->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+    EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+    scheduler_tools::ResizeExclusivityInfo tv5_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+  Fusion* scheduled_fusion =
+      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
+  checkLoopDomainEquivalence(
+      scheduled_fusion->outputs().at(0)->as<TensorView>());
 }
 
 // Non-exclusive slice due to a dependency to a fusion output
@@ -4486,12 +4544,57 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
 
-  // tv3 is the input of the slice, which is not exclusive as
+  // tv4 is the input of the slice, which is not exclusive as
   // tv3 depends on tv2, which is a fusion output
-  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv3),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  scheduler_tools::ResizeExclusivityInfo tv4_info{
+      {tv2}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
+
+  // Test replication-based mitigation of conflicts
+  {
+    Fusion fusion_copy = fusion;
+    FusionGuard fg(&fusion_copy);
+
+    auto tv0 = fusion_copy.inputs().at(0)->as<TensorView>();
+    auto tv5 = fusion_copy.outputs().at(1)->as<TensorView>();
+    auto tv4 = tv5->definition()->input(0)->as<TensorView>();
+    auto tv3 =
+        tv4->definition()->input(0)->as<TensorView>();
+
+    auto private_copy = RecomputeTv::recompute(tv3);
+    ir_utils::replaceValInExprInputs(tv4->definition(), tv3, private_copy);
+
+    IdModel id_model(&fusion_copy, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion_copy), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 1);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv0->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
+  }
+
+  GTEST_SKIP() << "Scheduling not yet supported due to broadcast";
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  auto t1 = at::randn({16}, options);
+  std::vector<c10::IValue> inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+  Fusion* scheduled_fusion =
+      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
+  checkLoopDomainEquivalence(
+      scheduled_fusion->outputs().at(0)->as<TensorView>());
 }
 
 // Slice input tensor depends on a fusion output, but the slice is
@@ -4676,10 +4779,29 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs6) {
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
   EXPECT_EQ(non_exclusive_resize_info.size(), 1);
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+  // When scheduled, since the shape of tv4 is different from the
+  // shape of tv5, this fusion is segmented. One segment is a resize
+  // segment consisting of the tv2 and tv3 slices. Another is a pointwise
+  // segment for tv5.
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  const auto& heuristic_list = runtime->schedulerHeuristics()->heuristicsList();
+  EXPECT_EQ(heuristic_list.size(), 2);
+  // They should be a combination of a resize scheduler and a pointwise
+  // scheduler
+  EXPECT_TRUE(
+      (heuristic_list[0]->scheduler_type == SchedulerType::PointWise &&
+       heuristic_list[1]->scheduler_type == SchedulerType::Resize) ||
+      (heuristic_list[0]->scheduler_type == SchedulerType::Resize &&
+       heuristic_list[1]->scheduler_type == SchedulerType::PointWise));
 }
 
 // RoPE-like rotation pattern
@@ -4901,17 +5023,17 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
     IdModel id_model(&fusion, /*build_graphs=*/false);
     const auto& exact_graph = id_model.buildExactGraph();
     auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
         ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv1),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
-    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv3),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
     // These two entries should be all the info map has.
     EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-    GTEST_SKIP() << "Scheduling not yet supported";
-
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(
@@ -5039,17 +5161,17 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
     IdModel id_model(&fusion, /*build_graphs=*/false);
     const auto& exact_graph = id_model.buildExactGraph();
     auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
         ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv1),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
-    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv3),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
    // These two entries should be all the info map has.
    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-    GTEST_SKIP() << "Scheduling not yet supported";
-
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(

From 9631958ef7c0119f356492b0768a6891218c9acf Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 21:11:31 -0800
Subject: [PATCH 6/9] test fix

---
 tests/cpp/test_resize.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 6013d385018..c7a70d928b4 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4495,15 +4495,15 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
   auto out_tensors = executor_cache.runFusionWithInputs(inputs);
   testValidate(
       executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+
+  // While the slices can be transformed to be all exclusive, it is
+  // currently segmented as the outputs have different shapes. Both
+  // segments should be scheduled as resize segments.
   FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
-  EXPECT_FALSE(runtime->isSegmented());
-  const auto& heuristic_param =
-      runtime->schedulerHeuristics()->heuristicsList().front();
-  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
-  Fusion* scheduled_fusion =
-      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
-  checkLoopDomainEquivalence(
-      scheduled_fusion->outputs().at(0)->as<TensorView>());
+  const auto& heuristic_list = runtime->schedulerHeuristics()->heuristicsList();
+  EXPECT_EQ(heuristic_list.size(), 2);
+  EXPECT_EQ(heuristic_list[0]->scheduler_type, SchedulerType::Resize);
+  EXPECT_EQ(heuristic_list[1]->scheduler_type, SchedulerType::Resize);
 }
 
 // Non-exclusive slice due to a dependency to a fusion output

From 76dbab97b4fdcc739f2ed1269e11e2669cee53ff Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 22:43:00 -0800
Subject: [PATCH 7/9] fix

---
 csrc/scheduler/resize.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index dece9f79238..408c662d8aa 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -185,6 +185,12 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
            resize_based_tensor_ops, exact_graph)) {
     auto resize_based_op = out_tv->definition();
     auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+    // Since cacheInput may skip caching if an input is used by
+    // slice/pad, inp_tv may be a fusion input, in which case it is
+    // not necessary to recompute the tensor.
+    if (inp_tv->isFusionInput()) {
+      continue;
+    }
     auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
     ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
   }

From 75338a4f18a77b79d1420a8e4ca3a765a3a713ea Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Fri, 20 Dec 2024 10:15:47 -0800
Subject: [PATCH 8/9] cleanup

---
 csrc/scheduler/resize.cpp             | 14 ++++++++++++++
 csrc/scheduler/tools/resize_utils.cpp |  4 ++--
 csrc/scheduler/tools/resize_utils.h   | 12 +++++++-----
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index 408c662d8aa..b9772c507eb 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -120,6 +120,20 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
     return false;
   }
 
+  // Having different resizes between outputs is not allowed at this
+  // moment. For example, consider a fusion like:
+  //
+  // t0 = [i0]
+  // fusion.addInput(t0)
+  // t1 = t0[:i0/2]
+  // t2 = t0[i0/2:]
+  // fusion.addOutput(t1)
+  // fusion.addOutput(t2)
+  //
+  // For now, this is not going to be fused since t1 and t2 have
+  // different resize ops, although in this case, since the extents of t1 and
+  // t2 are the same, it should be relatively straightforward to fuse them
+  // together.
   for (auto out_tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
     if (out_tv == ref_tv) {
       continue;
     }

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index cb9ecd94c6d..fc8c0cc5f09 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -161,12 +161,12 @@ std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
       }
 
       // This resize input ID is not exclusively used
-      info.shared_tvs.push_back(dep_tv);
+      info.non_exclusive_dep_tvs.push_back(dep_tv);
       info.resized_ids.pushBack(resize_inp_id);
     }
   }
 
-    if (!info.shared_tvs.empty()) {
+    if (!info.non_exclusive_dep_tvs.empty()) {
       NVF_ERROR(non_exclusive_resizes.emplace(out_tv, info).second);
     }

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index b3704bb7326..b9afed5effa 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -91,16 +91,18 @@ void propagateResizeToInputs(Expr* resize_op);
 // long as the first slice is considered non-exclusive. This will be
 // important when resolving the non-exclusiveness by replication.
 //
-// The function returns a map from tensors that are input to
-// non-exclusive ops to their resize input ID groups. This map will be
+// The function returns a map from tensors that are outputs of
+// non-exclusive ops to ResizeExclusivityInfo. This map will be
 // used to resolve the non-exclusiveness by replication.
 struct ResizeExclusivityInfo {
-  std::vector<TensorView*> shared_tvs;
+  // Dependent tensors that should not be resized
+  std::vector<TensorView*> non_exclusive_dep_tvs;
+  // ID groups of resize input IDs
   ValGroups resized_ids;
 
   bool operator==(const ResizeExclusivityInfo& other) const {
-    return shared_tvs == other.shared_tvs && resized_ids == other.resized_ids;
+    return non_exclusive_dep_tvs == other.non_exclusive_dep_tvs &&
+        resized_ids == other.resized_ids;
   }
 
   bool operator!=(const ResizeExclusivityInfo& other) const {

From e48a2f6274348cee5cdeca8d445bb7535b3bc0c4 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Mon, 23 Dec 2024 17:57:26 -0800
Subject: [PATCH 9/9] Recomputation needs to be done in a topological order

---
 csrc/scheduler/resize.cpp | 26 ++++++-----
 tests/cpp/test_resize.cpp | 98 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 11 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index b9772c507eb..43a7195fe20 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -71,11 +71,11 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& broadcast_graph = id_model.buildBroadcastGraph();
 
-  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+  auto resize_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
   // Slicing of or to a broadcast ID is not allowed yet.
-  for (auto tensor_op : resize_based_tensor_ops) {
-    TensorView* out_tv = tensor_op->output(0)->as<TensorView>();
+  for (auto resize_tensor_op : resize_tensor_ops) {
+    TensorView* out_tv = resize_tensor_op->output(0)->as<TensorView>();
     for (auto logical_id : out_tv->getLogicalDomain()) {
       Resize* resize = dynamic_cast<Resize*>(logical_id->definition());
       if (resize == nullptr) {
@@ -188,17 +188,21 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   scheduler_utils::cacheInputs(fusion, true);
   scheduler_utils::cacheAndForkOutputs(fusion, true);
 
-  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+  auto resize_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& exact_graph = id_model.buildExactGraph();
 
-  // Replicate resize inputs if necessary to avoid conflicting propagations
-  for (const auto& [out_tv, exlusivity_info] :
-       scheduler_tools::getNonExclusiveResizeInfo(
-           resize_based_tensor_ops, exact_graph)) {
-    auto resize_based_op = out_tv->definition();
-    auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+  // Replicate resize inputs if necessary to avoid conflicting
+  // propagations
+  const auto exclusivity_info_map = scheduler_tools::getNonExclusiveResizeInfo(
+      resize_tensor_ops, exact_graph);
+  for (auto resize_tensor_op : resize_tensor_ops) {
+    auto out_tv = resize_tensor_op->output(0)->as<TensorView>();
+    if (exclusivity_info_map.count(out_tv) == 0) {
+      continue;
+    }
+    auto inp_tv = resize_tensor_op->input(0)->as<TensorView>();
     // Since cacheInput may skip caching if an input is used by
     // slice/pad, inp_tv may be a fusion input, in which case it is
     // not necessary to recompute the tensor.
@@ -206,7 +210,7 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
       continue;
     }
     auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
-    ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
+    ir_utils::replaceValInExprInputs(resize_tensor_op, inp_tv, inp_tv_copy);
   }

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index a4710d293c8..587f72143a4 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -5123,6 +5123,104 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
   }
 }
 
+// Rotate twice. Resolving the non-exclusivity must be done in a
+// topological order.
+TEST_F(ResizeSchedulerTest, SliceRotateCatTwice) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {fusion.zeroVal(), IrBuilder::create<Val>(shape[1] / 2)}});
+
+  auto tv3 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(shape[1] / 2),
+        IrBuilder::create<Val>(shape[1])}});
+
+  auto tv4 = cat({tv3, tv2}, -1);
+
+  auto tv5 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv4->getLogicalDomain().at(0)->extent()},
+       {fusion.zeroVal(), IrBuilder::create<Val>(shape[1] / 2)}});
+
+  auto tv6 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv4->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(shape[1] / 2),
+        IrBuilder::create<Val>(shape[1])}});
+
+  auto tv7 = cat({tv6, tv5}, -1);
+
+  fusion.addOutput(tv7);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  // tv1 is not considered exclusive as tv0 is also a producer of
+  // tv3. Same for tv3. While the common input, tv0, is a fusion
+  // input and is not itself scheduled, a cache tensor is inserted
+  // and is indeed scheduled, so the two slices do conflict.
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv2
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+  // tv3
+  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+  scheduler_tools::ResizeExclusivityInfo tv3_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv3), tv3_info);
+
+  // tv5
+  EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+  scheduler_tools::ResizeExclusivityInfo tv5_info{
+      {tv4}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+
+  // tv6
+  EXPECT_EQ(non_exclusive_resize_info.count(tv6), 1);
+  scheduler_tools::ResizeExclusivityInfo tv6_info{
+      {tv4}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv6), tv6_info);
+
+  // These should be all the info the map has.
+  EXPECT_EQ(non_exclusive_resize_info.size(), 4);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+}
+
 // Consumer-based scheduling of pad
 TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
   auto fusion_ptr = std::make_unique<Fusion>();
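
For reference, below is a minimal usage sketch of the exclusiveness analysis
introduced by this series, distilled from the tests above. It is illustrative
only and not part of the patches; it assumes the nvFuser test utilities used
throughout (makeConcreteTensor, slice, IdModel), and the <SliceOp, PadOp> op
filter as reconstructed above. The tensor names are hypothetical.

  // Illustrative sketch (not part of the series): two consumers share tv1,
  // so the slice cannot be propagated to inputs exclusively.
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr;
  FusionGuard fg(fusion_ptr.get());

  auto tv0 = makeConcreteTensor({-1, 100});
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  // tv1 feeds both a slice and a non-resize consumer.
  auto tv2 = slice(
      tv1,
      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
  auto tv3 = sin(tv1);
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);

  // The analysis runs on the exact graph of an IdModel and reports the
  // resize ops whose propagation would be externally visible.
  IdModel id_model(&fusion, /*build_graphs=*/false);
  const auto& exact_graph = id_model.buildExactGraph();
  auto info = scheduler_tools::getNonExclusiveResizeInfo(
      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
  // The slice is reported as non-exclusive here; the scheduler then
  // recomputes its input (RecomputeTv::recompute) to make it exclusive.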