INTERNAL_ASSERT: Unable to find mapped root/logical domain #3607

Closed · t-vi opened this issue on Dec 18, 2024 · 2 comments · Fixed by #3619

t-vi (Contributor) commented on Dec 18, 2024:

While working on Lightning-AI/lightning-thunder#1560, the randomness tests fail with the following repro.

# CUDA devices:
#  0: NVIDIA GeForce RTX 3090
#  1: NVIDIA GeForce RTX 3090
# torch version: 2.6.0a0+gitc418a9a
# cuda version: 12.5
# nvfuser version: 0.2.23+git911d7bf
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id24(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[2, 24000], contiguity=[None, None], dtype=DataType.Double, is_cpu=False, stride_order=[1, 0])
    S1 = fd.define_scalar(None, dtype=DataType.Int)
    S2 = fd.define_scalar(None, dtype=DataType.Int)
    T3 = fd.define_tensor(shape=[24000], contiguity=[True], dtype=DataType.Double, is_cpu=False, stride_order=[0])
    T4 = fd.define_tensor(shape=[2, 24000], contiguity=[True, True], dtype=DataType.Double, is_cpu=False, stride_order=[1, 0])
    T5 = fd.define_tensor(shape=[2, 24000], contiguity=[True, True], dtype=DataType.Double, is_cpu=False, stride_order=[1, 0])
    S6 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T7 = fd.ops.mul(S6, T0)
    S8 = fd.define_scalar(2.00000, dtype=DataType.Double)
    T9 = fd.ops.mul(S8, T7)
    S10 = fd.define_scalar(0.00000, dtype=DataType.Double)
    S11 = fd.define_scalar(1.00000, dtype=DataType.Double)
    S12 = fd.define_scalar(2, dtype=DataType.Int)
    S13 = fd.define_scalar(24000, dtype=DataType.Int)
    T15 = fd.ops.uniform(S10, S11, shape=[S12, S13], rng_seed=S2, rng_offset=S1, dtype=DataType.Double)
    S16 = fd.define_scalar(0.500000, dtype=DataType.Double)
    T17 = fd.ops.lt(T15, S16)
    T18 = fd.ops.cast(T17, dtype=DataType.Double)
    T19 = fd.ops.mul(T18, T9)
    T20 = fd.ops.sum(T19, dims=[0], keepdim=False, dtype=DataType.Null)
    T24 = fd.ops.broadcast_in_dim(T3, shape=[2, 24000], broadcast_dims=[1])
    T25 = fd.ops.mul(T24, T19)
    T26, T27 = fd.ops.var_mean(T4, dims=[1], correction=0, keepdim=False)
    T31 = fd.ops.broadcast_in_dim(T27, shape=[2, 1], broadcast_dims=[0])
    T35 = fd.ops.broadcast_in_dim(T31, shape=[2, 24000], broadcast_dims=[0, 1])
    T36 = fd.ops.sub(T4, T35)
    T40 = fd.ops.broadcast_in_dim(T26, shape=[2, 1], broadcast_dims=[0])
    S41 = fd.define_scalar(1.00000e-05, dtype=DataType.Double)
    T42 = fd.ops.add(T40, S41)
    T43 = fd.ops.rsqrt(T42)
    T47 = fd.ops.broadcast_in_dim(T43, shape=[2, 24000], broadcast_dims=[0, 1])
    T48 = fd.ops.mul(T36, T47)
    T49 = fd.ops.mul(T48, T19)
    T50 = fd.ops.sum(T49, dims=[0], keepdim=False, dtype=DataType.Null)
    T51 = fd.ops.mul(T47, T25)
    T52 = fd.ops.mul(T36, T25)
    T53 = fd.ops.sum(T52, dims=[1], keepdim=False, dtype=DataType.Null)
    T57 = fd.ops.broadcast_in_dim(T53, shape=[2, 1], broadcast_dims=[0])
    T58 = fd.ops.neg(T51)
    T59 = fd.ops.sum(T58, dims=[1], keepdim=False, dtype=DataType.Null)
    T63 = fd.ops.broadcast_in_dim(T59, shape=[2, 1], broadcast_dims=[0])
    S64 = fd.define_scalar(-0.500000, dtype=DataType.Double)
    T65 = fd.ops.mul(S64, T57)
    S66 = fd.define_scalar(3.00000, dtype=DataType.Double)
    T67 = fd.ops.pow(T43, S66)
    T68 = fd.ops.mul(T65, T67)
    T69 = fd.ops.sum(T63, dims=[1], keepdim=False, dtype=DataType.Null)
    T70 = fd.ops.sum(T68, dims=[1], keepdim=False, dtype=DataType.Null)
    T74 = fd.ops.broadcast_in_dim(T69, shape=[2, 1], broadcast_dims=[0])
    T78 = fd.ops.broadcast_in_dim(T74, shape=[2, 24000], broadcast_dims=[0, 1])
    S79 = fd.define_scalar(4.16667e-05, dtype=DataType.Double)
    T80 = fd.ops.mul(S79, T78)
    T84 = fd.ops.broadcast_in_dim(T70, shape=[2, 1], broadcast_dims=[0])
    T88 = fd.ops.broadcast_in_dim(T84, shape=[2, 24000], broadcast_dims=[0, 1])
    S89 = fd.define_scalar(2.00000, dtype=DataType.Double)
    T90 = fd.ops.mul(S89, T88)
    T91 = fd.ops.mul(T90, T36)
    S92 = fd.define_scalar(24000.0, dtype=DataType.Double)
    S93 = fd.ops.reciprocal(S92)
    T94 = fd.ops.mul(T91, S93)
    T95 = fd.ops.add(T80, T94)
    T96 = fd.ops.add(T51, T95)
    S97 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T98 = fd.ops.gt(T5, S97)
    S99 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T100 = fd.ops.where(T98, T96, S99)
    fd.add_output(T20)
    fd.add_output(T50)
    fd.add_output(T100)

with FusionDefinition() as fd:
    nvfuser_fusion_id24(fd)

inputs = [
    torch.randn(1, dtype=torch.float64, device='cuda:0').as_strided((2, 24000), (0, 0)),
    1,
    20,
    torch.testing.make_tensor((24000,), dtype=torch.float64, device='cuda:0'),
    torch.testing.make_tensor((2, 24000), dtype=torch.float64, device='cuda:0'),
    torch.testing.make_tensor((2, 24000), dtype=torch.float64, device='cuda:0'),
]
fd.execute(inputs)
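
As an aside (not part of the repro itself): the first input is a single scalar viewed as a (2, 24000) tensor with zero strides, which appears to be why T0 is defined above with contiguity=[None, None], i.e. as a fully broadcast tensor. A minimal standalone sketch of what that input is:

import torch

# One element viewed as a (2, 24000) tensor with strides (0, 0): every
# position aliases the same underlying value.
base = torch.randn(1, dtype=torch.float64, device="cuda:0")
t0 = base.as_strided((2, 24000), (0, 0))

assert t0.stride() == (0, 0)
assert t0.shape == (2, 24000)
assert bool((t0 == base).all())  # all positions read the same scalar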

Traceback:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/nvfuser/__init__.py", line 238, in execute
    results = self._execute(
              ^^^^^^^^^^^^^^
RuntimeError:  INTERNAL ASSERT FAILED at "/home/tv/data/firma/grid/thunder/Fuser/csrc/scheduler/utils.cpp":1476, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Unable to find mapped root/logical domain
Exception raised from propagateSibling at /home/tv/data/firma/grid/thunder/Fuser/csrc/scheduler/utils.cpp:1476 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xca (0x7f1841b1853a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3e (0x7f1841ea6b4e in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x888bb8 (0x7f1842288bb8 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #3: nvfuser::MaxInfoSpanningTree::traverse(nvfuser::MaxInfoSpanningTree::Propagator*) + 0x5a (0x7f184226da6a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #4: <unknown function> + 0x890cff (0x7f1842290cff in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x875fae (0x7f1842275fae in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #6: <unknown function> + 0x879e49 (0x7f1842279e49 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #7: <unknown function> + 0x87a962 (0x7f184227a962 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #8: <unknown function> + 0x851760 (0x7f1842251760 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x516635 (0x7f1841f16635 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x5256bb (0x7f1841f256bb in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x526d29 (0x7f1841f26d29 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #12: <unknown function> + 0x51de1e (0x7f1841f1de1e in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #13: nvfuser::SegmentCandidateFinder::SegmentCandidateFinder(std::unique_ptr<nvfuser::Fusion, std::default_delete<nvfuser::Fusion> >, nvfuser::KernelArgumentHolder const*, nvfuser::SegmentCandidateFinderOptions) + 0x3e2 (0x7f1841f1e5f2 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #14: <unknown function> + 0x51e755 (0x7f1841f1e755 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #15: <unknown function> + 0x51eb1c (0x7f1841f1eb1c in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #16: <unknown function> + 0x7bc6f8 (0x7f18421bc6f8 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #17: <unknown function> + 0x7b2de7 (0x7f18421b2de7 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #18: nvfuser::FusionExecutorCache::runFusionWithInputs(c10::ArrayRef<c10::IValue> const&, std::optional<nvfuser::PrimDataType>, std::optional<signed char>) + 0x9a (0x7f18421b385a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #19: nvfuser::python_frontend::FusionDefinition::execute(c10::ArrayRef<c10::IValue> const&, std::optional<signed char>, bool, bool, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >) const + 0xbb2 (0x7f1842346562 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #20: <unknown function> + 0x1782ee (0x7f1841b782ee in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #21: <unknown function> + 0x238d0b (0x7f1841c38d0b in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #22: <unknown function> + 0x28a62c (0x7f1841c8a62c in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #23: /usr/bin/python3() [0x57b5ce]
frame #24: _PyObject_MakeTpCall + 0x123 (0x543f83 in /usr/bin/python3)
frame #25: _PyEval_EvalFrameDefault + 0x82c (0x5cfafc in /usr/bin/python3)
frame #26: /usr/bin/python3() [0x547527]
frame #27: PyObject_Call + 0x119 (0x545dc9 in /usr/bin/python3)
frame #28: _PyEval_EvalFrameDefault + 0x4204 (0x5d34d4 in /usr/bin/python3)
frame #29: _PyObject_Call_Prepend + 0xc2 (0x5454e2 in /usr/bin/python3)
frame #30: /usr/bin/python3() [0x59d434]
frame #31: _PyObject_MakeTpCall + 0x6f (0x543ecf in /usr/bin/python3)
frame #32: _PyEval_EvalFrameDefault + 0x82c (0x5cfafc in /usr/bin/python3)
frame #33: /usr/bin/python3() [0x547466]
frame #34: torch::autograd::PyNode::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x7e (0x7f18dd12f25e in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #35: <unknown function> + 0x4e0bbab (0x7f18d440bbab in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #36: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0xde2 (0x7f18d4406c32 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #37: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x4cf (0x7f18d4407abf in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #38: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x32a (0x7f18d44000ea in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #39: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x62 (0x7f18dd129e42 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #40: <unknown function> + 0xe0ea4 (0x7f18ddee0ea4 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #41: <unknown function> + 0x89dab (0x7f18e82e9dab in /lib/x86_64-linux-gnu/libc.so.6)
frame #42: <unknown function> + 0x10b9f8 (0x7f18e836b9f8 in /lib/x86_64-linux-gnu/libc.so.6)
t-vi (Contributor, Author) commented on Dec 18, 2024:

This is the fusion. If there are patterns we should avoid generating, I would appreciate a hint:

import torch
from thunder.executors.torchex import no_autocast

@torch.no_grad()
@no_autocast
def backward_fn(saved_for_backward, cotangents):
  # saved_for_backward: "Collection"
  # cotangents: "Collection"
  C0, C1, = saved_for_backward
  clear_mutable_collection(saved_for_backward)
  del saved_for_backward
  t0, = cotangents
  clear_mutable_collection(cotangents)
  del cotangents
  x, t_layer_norm_weight, input, = C0
  clear_mutable_collection(C0)
  del C0
  i67, i66, = C1
  clear_mutable_collection(C1)
  del C1
  [t118, t121, t152] = nvFusion0(t0, i67, i66, t_layer_norm_weight, input, x)
    # t112 = prims.mul(1.0, t0)  # t112: "cuda:0 f32[2, 24000]"
    # t114 = prims.mul(2.0, t112)  # t114: "cuda:0 f32[2, 24000]"
    # t60 = prims.uniform_philox((2, 24000), 0.0, 1.0, device=devices.Device("cuda:0"), dtype=dtypes.float32, seed=i66, offset=i67)  # t60: "cuda:0 f32[2, 24000]"
    # t61 = prims.lt(t60, 0.5)  # t61: "cuda:0 b8[2, 24000]"
    # t62 = prims.convert_element_type(t61, dtypes.float32)  # t62: "cuda:0 f32[2, 24000]"
    # t115 = prims.mul(t62, t114)  # t115: "cuda:0 f32[2, 24000]"
    # t118 = prims.sum(t115, (0,))  # t118: "cuda:0 f32[24000]"
    # t56 = prims.broadcast_in_dim(t_layer_norm_weight, (2, 24000), (1,))  # t56: "cuda:0 f32[2, 24000]"
    # t119 = prims.mul(t56, t115)  # t119: "cuda:0 f32[2, 24000]"
    # (t46, t47) = prims.var_mean(input, (1,), correction=0)
    # t49 = prims.broadcast_in_dim(t47, [2, 1], [0])  # t49: "cuda:0 f32[2, 1]"
    # t52 = prims.broadcast_in_dim(t49, (2, 24000), (0, 1))  # t52: "cuda:0 f32[2, 24000]"
    # t53 = prims.sub(input, t52)  # t53: "cuda:0 f32[2, 24000]"
    # t48 = prims.broadcast_in_dim(t46, [2, 1], [0])  # t48: "cuda:0 f32[2, 1]"
    # t50 = prims.add(t48, 1e-05)  # t50: "cuda:0 f32[2, 1]"
    # t51 = prims.rsqrt(t50)  # t51: "cuda:0 f32[2, 1]"
    # t54 = prims.broadcast_in_dim(t51, (2, 24000), (0, 1))  # t54: "cuda:0 f32[2, 24000]"
    # t55 = prims.mul(t53, t54)  # t55: "cuda:0 f32[2, 24000]"
    # t120 = prims.mul(t55, t115)  # t120: "cuda:0 f32[2, 24000]"
    # t121 = prims.sum(t120, (0,))  # t121: "cuda:0 f32[24000]"
    # t122 = prims.mul(t54, t119)  # t122: "cuda:0 f32[2, 24000]"
    # t123 = prims.mul(t53, t119)  # t123: "cuda:0 f32[2, 24000]"
    # t124 = prims.sum(t123, (1,))  # t124: "cuda:0 f32[2]"
    # t125 = prims.broadcast_in_dim(t124, [2, 1], [0])  # t125: "cuda:0 f32[2, 1]"
    # t126 = prims.neg(t122)  # t126: "cuda:0 f32[2, 24000]"
    # t127 = prims.sum(t126, (1,))  # t127: "cuda:0 f32[2]"
    # t128 = prims.broadcast_in_dim(t127, [2, 1], [0])  # t128: "cuda:0 f32[2, 1]"
    # t129 = prims.mul(-0.5, t125)  # t129: "cuda:0 f32[2, 1]"
    # t130 = prims.pow(t51, 3.0)  # t130: "cuda:0 f32[2, 1]"
    # t131 = prims.mul(t129, t130)  # t131: "cuda:0 f32[2, 1]"
    # t133 = prims.sum(t128, (1,))  # t133: "cuda:0 f32[2]"
    # t134 = prims.sum(t131, (1,))  # t134: "cuda:0 f32[2]"
    # t137 = prims.broadcast_in_dim(t133, [2, 1], [0])  # t137: "cuda:0 f32[2, 1]"
    # t138 = prims.broadcast_in_dim(t137, (2, 24000), (0, 1))  # t138: "cuda:0 f32[2, 24000]"
    # t139 = prims.mul(4.1666666666666665e-05, t138)  # t139: "cuda:0 f32[2, 24000]"
    # t140 = prims.broadcast_in_dim(t134, [2, 1], [0])  # t140: "cuda:0 f32[2, 1]"
    # t141 = prims.broadcast_in_dim(t140, (2, 24000), (0, 1))  # t141: "cuda:0 f32[2, 24000]"
    # t145 = prims.mul(2.0, t141)  # t145: "cuda:0 f32[2, 24000]"
    # t147 = prims.mul(t145, t53)  # t147: "cuda:0 f32[2, 24000]"
    # t148 = prims.div(t147, 24000.0)  # t148: "cuda:0 f32[2, 24000]"
    # t149 = prims.add(t139, t148)  # t149: "cuda:0 f32[2, 24000]"
    # t150 = prims.add(t122, t149)  # t150: "cuda:0 f32[2, 24000]"
    # t42 = prims.gt(x, 0.0)  # t42: "cuda:0 b8[2, 24000]"
    # t152 = prims.where(t42, t150, 0.0)  # t152: "cuda:0 f32[2, 24000]"
  del t0, i67, i66, t_layer_norm_weight, input, x
  return (t152, t118, t121)
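
For anyone trying to place this trace: read backwards, the cotangent t0 first goes through a dropout backward (mask from uniform < 0.5, scale 2.0), then a layer_norm backward (the var_mean / rsqrt / add(1e-05) block), and finally a relu-style backward (where(x > 0, ..., 0.0)). The following is only a rough sketch of the forward computation this backward plausibly corresponds to, reconstructed from the trace rather than taken from the actual Thunder program; the function name and the bias argument are assumptions:

import torch

def forward_sketch(x, t_layer_norm_weight, bias):
    # relu backward is the last step of the trace above, so relu is presumably
    # the first op in the forward pass
    h = torch.relu(x)
    # layer_norm over the last dimension (24000) with eps=1e-05, matching the
    # var_mean / rsqrt block; the bias is inferred from t118 (grad summed over dim 0)
    h = torch.nn.functional.layer_norm(
        h, (24000,), weight=t_layer_norm_weight, bias=bias, eps=1e-5
    )
    # dropout with p=0.5: the backward masks with uniform < 0.5 and scales by 2.0
    return torch.nn.functional.dropout(h, p=0.5)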

naoyam added a commit that referenced this issue on Dec 19, 2024.

naoyam (Collaborator) commented on Dec 19, 2024:

IIUC, this seems to be a simple mistake we have in the transpose scheduler. I'm surprised we haven't had this error before. #3619

naoyam mentioned this issue on Dec 19, 2024.
naoyam closed this as completed in 6fe1865 on Dec 19, 2024.