Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,7 @@ if(BUILD_TEST)
list(APPEND HOSTIR_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_evaluator.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_passes.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
)
Expand Down
219 changes: 144 additions & 75 deletions csrc/host_ir/allocate_and_deallocate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,64 +12,69 @@
#include <functional>
#include <iterator>
#include <list>
#include <ranges>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "fusion.h"
#include "host_ir/ir.h"
#include "ir/builder.h"
#include "ir/utils.h"

namespace nvfuser::hir {

namespace {

class DominatorTree {
class Node {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: create different nodes in each tree to avoid overloading too much.

public:
class Node {
public:
Node(Scope* scope, Scope::Iterator iterator)
: scope_(scope), iterator_(iterator) {}
Node(const Node& other) = delete;
Node(Node&& other) = delete;
Node& operator=(const Node& other) = delete;
Node& operator=(Node&& other) = delete;

const std::vector<Node*>& children() const {
return children_;
}
Node(Scope* scope, Expr* expr, const Node* parent)
: scope_(scope),
expr_(expr),
parent_(parent),
depth_(parent ? parent->depth() + 1 : 0) {}

Scope* scope() const {
return scope_;
}

void addChild(Node* child) {
children_.push_back(child);
}
Expr* getExpr() const {
return expr_;
}

Scope* scope() const {
return scope_;
}
const Node* parent() const {
return parent_;
}

Scope::Iterator iterator() const {
return iterator_;
}
int depth() const {
return depth_;
}

Expr* getExpr() const {
return *iterator_;
}
const std::vector<Node*>& children() const {
return children_;
}

private:
// Consider putting `scope` and `iterator` into a separate Mutator class.
// They are only needed when the user wants to modify the host IR.
Scope* scope_;
Scope::Iterator iterator_;
void addChild(Node* child) {
children_.push_back(child);
}

std::vector<Node*> children_;
};
private:
Scope* scope_;
Expr* expr_;
const Node* parent_;
int depth_;
std::vector<Node*> children_;
};

explicit DominatorTree(hir::HostIrContainer& hic) : hic_(hic) {
build(hic_.topLevel(), /*parent=*/nullptr);
class DominatorTree {
public:
explicit DominatorTree(hir::HostIrContainer& hic) : hic_(&hic) {
build(hic_->topLevel(), /*parent=*/nullptr);
}

const Node* getRoot() const {
const auto& top_level_exprs = hic_.topLevelExprs();
const auto& top_level_exprs = hic_->topLevelExprs();
NVF_ERROR(!top_level_exprs.empty());
Expr* root = top_level_exprs.front();
return &nodes_.at(root);
Expand Down Expand Up @@ -105,10 +110,8 @@ class DominatorTree {

private:
void build(Scope& scope, Node* parent) {
for (auto scope_it = scope.exprs().begin(); scope_it != scope.exprs().end();
++scope_it) {
Expr* e = *scope_it;
auto [node_it, inserted] = nodes_.try_emplace(e, &scope, scope_it);
for (Expr* e : scope.exprs()) {
auto [node_it, inserted] = nodes_.try_emplace(e, &scope, e, parent);
NVF_ERROR(inserted);
Node& node = node_it->second;
if (parent != nullptr) {
Expand All @@ -131,7 +134,77 @@ class DominatorTree {
}
}

hir::HostIrContainer& hic_;
hir::HostIrContainer* hic_;
std::unordered_map<const Expr*, Node> nodes_;
};

// Post-dominator tree: node A post-dominates B if every path from B to exit
// goes through A. Built by traversing from exit toward entry.
class PostDominatorTree {
 public:
  // Builds the tree over `hic`'s top-level scope. While building, folds every
  // allocation and use of each TensorView into `lca`, so that after
  // construction lca[tv] is the lowest common ancestor (in this tree) of all
  // nodes that allocate or read `tv` -- the earliest single node reached by
  // every such expression on its way to the exit.
  explicit PostDominatorTree(
      hir::HostIrContainer& hic,
      std::unordered_map<TensorView*, const Node*>& lca) 
      : hic_(&hic) {
    // The last top-level expression has no successor, hence a null parent.
    build(hic_->topLevel(), /*parent=*/nullptr, lca);
  }

  // Returns the tree node for `expr`, or nullptr if `expr` was never visited.
  const Node* getNode(Expr* expr) const {
    auto it = nodes_.find(expr);
    return it != nodes_.end() ? &it->second : nullptr;
  }

 private:
  // Recursively builds nodes for `scope`, visiting expressions back to front
  // so that each node's parent is the expression executed right after it (or,
  // for the last expression of a nested scope, the node of the enclosing
  // ForLoop/IfThenElse itself).
  void build(
      Scope& scope,
      Node* parent,
      std::unordered_map<TensorView*, const Node*>& lca) {
    for (Expr* e : scope.exprs() | std::views::reverse) {
      auto [node_it, inserted] = nodes_.try_emplace(e, &scope, e, parent);
      // Each expression is expected to appear in exactly one scope.
      NVF_ERROR(inserted);
      Node& node = node_it->second;

      // Fold this node into the running LCA for the buffer it allocates...
      if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) {
        TensorView* tv = alloc->buffer()->as<TensorView>();
        lca[tv] = findLCA(lca[tv], &node);
      }
      // ...and for every TensorView it consumes.
      for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
        lca[in] = findLCA(lca[in], &node);
      }

      // Nested scopes hang off the node of the construct that owns them, so
      // expressions inside inherit the construct's successor chain.
      if (auto* loop = dynamic_cast<hir::ForLoop*>(e)) {
        build(loop->body(), &node, lca);
      }
      if (auto* ite = dynamic_cast<kir::IfThenElse*>(e)) {
        build(ite->thenBody(), &node, lca);
        build(ite->elseBody(), &node, lca);
      }

      // This node becomes the successor (parent) of whatever expression
      // precedes it in program order.
      parent = &node;
    }
  }

  // Classic LCA by depth equalization: walk the deeper node up to the
  // shallower one's depth, then advance both in lockstep until they meet.
  // nullptr acts as an identity element so the first contribution seeds the
  // running result.
  const Node* findLCA(const Node* a, const Node* b) const {
    if (a == nullptr) {
      return b;
    }
    if (b == nullptr) {
      return a;
    }
    while (a->depth() > b->depth()) {
      a = a->parent();
    }
    while (b->depth() > a->depth()) {
      b = b->parent();
    }
    while (a != b) {
      a = a->parent();
      b = b->parent();
    }
    return a;
  }

  hir::HostIrContainer* hic_;
  // Owns one node per visited expression.
  std::unordered_map<const Expr*, Node> nodes_;
};

Expand Down Expand Up @@ -159,7 +232,7 @@ void insertAllocations(hir::HostIrContainer& hic) {

dom_tree.depthFirstTraverse(
/*pre_fn=*/
[&](const DominatorTree::Node* node) {
[&](const Node* node) {
Expr* e = node->getExpr();
// If `e`'s output needs preallocation but isn't defined, insert an
// allocation right before `e`.
Expand All @@ -171,64 +244,60 @@ void insertAllocations(hir::HostIrContainer& hic) {
if (needsOutputPreallocation(e)) {
auto* allocate =
IrBuilder::create<kir::Allocate>(out, out->getMemoryType());
node->scope()->insert(node->iterator(), allocate);
node->scope()->insert_before(node->getExpr(), allocate);
}

defined.insert(out);
}
},
/*post_fn=*/
[&](const DominatorTree::Node* node) {
[&](const Node* node) {
Expr* e = node->getExpr();
for (auto* out : ir_utils::filterByType<TensorView>(e->outputs())) {
defined.erase(out);
}
});
}

// Returns true if `tv` owns storage that this pass is responsible for
// freeing. Fusion inputs/outputs and aliasing tensors are excluded.
bool needsDeallocation(TensorView* tv) {
  // Fusion inputs are allocated and owned by the caller.
  if (tv->isFusionInput()) {
    return false;
  }
  // Fusion outputs must stay alive for the caller.
  if (tv->isFusionOutput()) {
    return false;
  }
  // `definition()` can be nullptr for values not produced by any expression,
  // so guard before dereferencing (fusion inputs are already filtered above,
  // but other definition-less intermediates may exist).
  Expr* def = tv->definition();
  if (def != nullptr && def->isA<ShardByStream>()) {
    // ShardByStream outputs are slices (views) of their inputs, not freshly
    // allocated tensors; freeing them would invalidate the input buffer.
    return false;
  }
  const AliasInfo& alias_info = tv->container()->getOutputAlias(tv);
  if (alias_info.type == AllocationType::ReuseBuffer) {
    // Buffer-reusing aliases share their source's storage.
    return false;
  }
  return true;
}

Copy link
Collaborator Author

@Priya2698 Priya2698 Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not complete. Ops like view do not always allocate new tensors. A few things make that analysis tricky:

  1. Aliasing information is not available in hic as it copies over from completeFusion. Question: Should we run an aliasing pass in host IR as well for expr-evaluated segments?
  2. HostIrEvaluator for LoadStoreOp checks if the out_tv is known and either copies over the data or binds it to a view of the input. HostIrJit always creates a new tensor for ops like permute:
    void* permute_func_ptr = reinterpret_cast<void*>(
    . Question: Is this just for simplicity for the first integration?

One solution can be to explicitly allocate expr eval outputs where needed like we do for matmul/linear. Then, we only deallocate tvs that are allocated.

The previous version did not make any distinction for view-like ops, so the functionality does not regress.

What do you think @wujingyue

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I'm missing some context. Can you remind me why this PR needs to change how we decide what needs deallocation? I understood the motivation of looking into loops but I'm missing some connections otherwise.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remind me why this PR needs to change how we decide what needs deallocation?

This PR does not need to necessarily change this. But we do need to decide what needs deallocation since not all ops allocate new tensors.

I initially started with deallocating only explicitly "allocated" tensorviews. However, that breaks the HostIrJit tests, where outputs of view/permute are also newly allocated tensors.
If I deallocate everything, that includes outputs of ShardByStream, which are not new tensors but rather slices (we could amend aliasing such that we mark these as aliases of their inputs). Hence, I am placing some minimum conditions on what needs deallocation for current use cases.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I deallocate everything, that includes outputs of ShardByStream which are not new tensors

Got it. This is actually the old behavior. It didn't trigger this problem because ShardByStream is never top-level.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HostIrEvaluator handles deallocation by removing the tensor from the underlying hash table. It doesn't always free the memory. What problems did you run into with ShardByStream exactly?

I can try it myself tomorrow. Not on a computer right now

Copy link
Collaborator Author

@Priya2698 Priya2698 Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HostIrEvaluator handles deallocation by removing the tensor from the underlying hash table. It doesn't always free the memory.

Correct. I did not run into any errors with existing tests since handle(Deallocate*) only invalidates. But looking at the HostIrJit behavior, it actually deletes the tensor, hence, I avoided adding deallocation statements for those.

For simplicity, if you prefer, I can remove the additional conditions from this PR, and we can discuss that in a separate PR.

void insertDeallocations(hir::HostIrContainer& hic) {
const std::list<Expr*>& top_level_exprs = hic.topLevelExprs();
std::for_each(top_level_exprs.begin(), top_level_exprs.end(), [](Expr* expr) {
std::ranges::for_each(top_level_exprs, [](Expr* expr) {
NVF_ERROR(
!expr->isA<hir::Deallocate>(),
"Expected hostir container to not have deallocate, but found one "
"anyways: ",
expr);
});

// For each input in every expression in the container, find the position of
// its last use and insert a deallocate directly after, except for fusion
// inputs and outputs.
std::unordered_set<TensorView*> last_use_found;
for (auto insertion_point = top_level_exprs.end();
insertion_point != top_level_exprs.begin();) {
auto prev = std::prev(insertion_point);
Expr* e = *prev;

// Only tensors need to be allocated.
for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
// Fusion inputs are managed by the caller.
if (in->isFusionInput()) {
continue;
}
std::unordered_map<TensorView*, const Node*> lca;
PostDominatorTree post_dom_tree(hic, lca);

// Fusion outputs need to be kept alive for the caller.
if (in->isFusionOutput()) {
continue;
}

// Skip if `e` is not the last use.
if (!last_use_found.insert(in).second) {
continue;
}

auto* deallocate = IrBuilder::create<hir::Deallocate>(in);
hic.insertExprBefore(insertion_point, deallocate);
// Insert deallocate at LCA for each TV that needs deallocation.
for (const auto& [tv, lca_node] : lca) {
if (!needsDeallocation(tv)) {
continue;
}

// Don't `--insertion_point;` because we'd like to skip newly inserted
// deallocations.
insertion_point = prev;
NVF_ERROR(
lca_node != nullptr, "Could not find post-dominator for tensor ", tv);
auto* deallocate = IrBuilder::create<hir::Deallocate>(tv);
lca_node->scope()->insert_after(lca_node->getExpr(), deallocate);
}
}

Expand Down
Loading
Loading