Commit c90a720

ptrendx authored and KshitijLakhani committed
Fix the use-after-free bug in unfused normalization (#2002)
Signed-off-by: Przemek Tredak <[email protected]>
1 parent 5f1142e · commit c90a720

File tree

1 file changed: +2 −2 lines changed


transformer_engine/pytorch/csrc/extensions/normalization.cpp

Lines changed: 2 additions & 2 deletions
@@ -108,9 +108,9 @@ std::vector<py::object> layernorm_fwd(py::handle input, py::handle weight, Maybe
     }
   }
   TensorWrapper unquantized_out_cu;
+  py::object unquantized_out;
   if (force_unfused_kernel) {
     NoneQuantizer q{none};
-    py::object unquantized_out;
     std::tie(unquantized_out_cu, unquantized_out) = q.create_tensor(size, out_dtype);
   }
   TensorWrapper &kernel_out_cu = force_unfused_kernel ? unquantized_out_cu : out_cu;
@@ -269,9 +269,9 @@ std::vector<py::object> rmsnorm_fwd(const py::handle &input, const py::handle &w
     }
   }
   TensorWrapper unquantized_out_cu;
+  py::object unquantized_out;
   if (force_unfused_kernel) {
     NoneQuantizer q{none};
-    py::object unquantized_out;
     std::tie(unquantized_out_cu, unquantized_out) = q.create_tensor(size, out_dtype);
   }
   TensorWrapper &kernel_out_cu = force_unfused_kernel ? unquantized_out_cu : out_cu;
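
Why this fixes the use-after-free: `unquantized_out` is the Python object that owns the storage produced by `q.create_tensor(...)`, while `unquantized_out_cu` (and the `kernel_out_cu` reference derived from it) appears to hold only a non-owning view of that storage. Declared inside the `if (force_unfused_kernel)` block, `unquantized_out` was destroyed at the closing brace, so the later kernel launch through `kernel_out_cu` touched freed memory. Moving the declaration to the enclosing scope keeps the owner alive for the rest of the function. Below is a minimal, self-contained C++ sketch of the lifetime issue, using hypothetical stand-in types (`Owner`, `Wrapper`, `Storage`) rather than the real Transformer Engine classes:

#include <iostream>
#include <memory>

// Hypothetical stand-ins to illustrate the lifetime issue; these are not the
// real Transformer Engine types.
struct Storage { float data[4] = {1, 2, 3, 4}; };

struct Owner {                        // plays the role of py::object unquantized_out
  std::shared_ptr<Storage> buf = std::make_shared<Storage>();
};

struct Wrapper {                      // plays the role of TensorWrapper unquantized_out_cu
  float *ptr = nullptr;               // non-owning view into the owner's buffer
};

int main() {
  bool force_unfused_kernel = true;

  Wrapper unquantized_out_cu;
  Owner unquantized_out;              // after the fix: declared in the enclosing scope
  if (force_unfused_kernel) {
    // Before the fix, `unquantized_out` lived only inside this block, so its
    // buffer was released at the closing brace while the wrapper kept the raw pointer.
    unquantized_out_cu.ptr = unquantized_out.buf->data;
  }
  // With the fix, the owner is still alive here, so this read is valid rather
  // than a use-after-free.
  std::cout << unquantized_out_cu.ptr[0] << "\n";
  return 0;
}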
