INTERNAL_ASSERT: Unable to find mapped root/logical domain #3607

Closed · t-vi opened this issue on Dec 18, 2024 · 2 comments · Fixed by #3619

t-vi (Contributor) commented on Dec 18, 2024:

While working on Lightning-AI/lightning-thunder#1560, the randomness tests fail with the following repro.

# CUDA devices:
#  0: NVIDIA GeForce RTX 3090
#  1: NVIDIA GeForce RTX 3090
# torch version: 2.6.0a0+gitc418a9a
# cuda version: 12.5
# nvfuser version: 0.2.23+git911d7bf
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id24(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[2, 24000], contiguity=[None, None], dtype=DataType.Double, is_cpu=False, stride_order=[1, 0])
    S1 = fd.define_scalar(None, dtype=DataType.Int)
    S2 = fd.define_scalar(None, dtype=DataType.Int)
    T3 = fd.define_tensor(shape=[24000], contiguity=[True], dtype=DataType.Double, is_cpu=False, stride_order=[0])
    T4 = fd.define_tensor(shape=[2, 24000], contiguity=[True, True], dtype=DataType.Double, is_cpu=False, stride_order=[1, 0])
    T5 = fd.define_tensor(shape=[2, 24000], contiguity=[True, True], dtype=DataType.Double, is_cpu=False, stride_order=[1, 0])
    S6 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T7 = fd.ops.mul(S6, T0)
    S8 = fd.define_scalar(2.00000, dtype=DataType.Double)
    T9 = fd.ops.mul(S8, T7)
    S10 = fd.define_scalar(0.00000, dtype=DataType.Double)
    S11 = fd.define_scalar(1.00000, dtype=DataType.Double)
    S12 = fd.define_scalar(2, dtype=DataType.Int)
    S13 = fd.define_scalar(24000, dtype=DataType.Int)
    T15 = fd.ops.uniform(S10, S11, shape=[S12, S13], rng_seed=S2, rng_offset=S1, dtype=DataType.Double)
    S16 = fd.define_scalar(0.500000, dtype=DataType.Double)
    T17 = fd.ops.lt(T15, S16)
    T18 = fd.ops.cast(T17, dtype=DataType.Double)
    T19 = fd.ops.mul(T18, T9)
    T20 = fd.ops.sum(T19, dims=[0], keepdim=False, dtype=DataType.Null)
    T24 = fd.ops.broadcast_in_dim(T3, shape=[2, 24000], broadcast_dims=[1])
    T25 = fd.ops.mul(T24, T19)
    T26, T27 = fd.ops.var_mean(T4, dims=[1], correction=0, keepdim=False)
    T31 = fd.ops.broadcast_in_dim(T27, shape=[2, 1], broadcast_dims=[0])
    T35 = fd.ops.broadcast_in_dim(T31, shape=[2, 24000], broadcast_dims=[0, 1])
    T36 = fd.ops.sub(T4, T35)
    T40 = fd.ops.broadcast_in_dim(T26, shape=[2, 1], broadcast_dims=[0])
    S41 = fd.define_scalar(1.00000e-05, dtype=DataType.Double)
    T42 = fd.ops.add(T40, S41)
    T43 = fd.ops.rsqrt(T42)
    T47 = fd.ops.broadcast_in_dim(T43, shape=[2, 24000], broadcast_dims=[0, 1])
    T48 = fd.ops.mul(T36, T47)
    T49 = fd.ops.mul(T48, T19)
    T50 = fd.ops.sum(T49, dims=[0], keepdim=False, dtype=DataType.Null)
    T51 = fd.ops.mul(T47, T25)
    T52 = fd.ops.mul(T36, T25)
    T53 = fd.ops.sum(T52, dims=[1], keepdim=False, dtype=DataType.Null)
    T57 = fd.ops.broadcast_in_dim(T53, shape=[2, 1], broadcast_dims=[0])
    T58 = fd.ops.neg(T51)
    T59 = fd.ops.sum(T58, dims=[1], keepdim=False, dtype=DataType.Null)
    T63 = fd.ops.broadcast_in_dim(T59, shape=[2, 1], broadcast_dims=[0])
    S64 = fd.define_scalar(-0.500000, dtype=DataType.Double)
    T65 = fd.ops.mul(S64, T57)
    S66 = fd.define_scalar(3.00000, dtype=DataType.Double)
    T67 = fd.ops.pow(T43, S66)
    T68 = fd.ops.mul(T65, T67)
    T69 = fd.ops.sum(T63, dims=[1], keepdim=False, dtype=DataType.Null)
    T70 = fd.ops.sum(T68, dims=[1], keepdim=False, dtype=DataType.Null)
    T74 = fd.ops.broadcast_in_dim(T69, shape=[2, 1], broadcast_dims=[0])
    T78 = fd.ops.broadcast_in_dim(T74, shape=[2, 24000], broadcast_dims=[0, 1])
    S79 = fd.define_scalar(4.16667e-05, dtype=DataType.Double)
    T80 = fd.ops.mul(S79, T78)
    T84 = fd.ops.broadcast_in_dim(T70, shape=[2, 1], broadcast_dims=[0])
    T88 = fd.ops.broadcast_in_dim(T84, shape=[2, 24000], broadcast_dims=[0, 1])
    S89 = fd.define_scalar(2.00000, dtype=DataType.Double)
    T90 = fd.ops.mul(S89, T88)
    T91 = fd.ops.mul(T90, T36)
    S92 = fd.define_scalar(24000.0, dtype=DataType.Double)
    S93 = fd.ops.reciprocal(S92)
    T94 = fd.ops.mul(T91, S93)
    T95 = fd.ops.add(T80, T94)
    T96 = fd.ops.add(T51, T95)
    S97 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T98 = fd.ops.gt(T5, S97)
    S99 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T100 = fd.ops.where(T98, T96, S99)
    fd.add_output(T20)
    fd.add_output(T50)
    fd.add_output(T100)

with FusionDefinition() as fd:
    nvfuser_fusion_id24(fd)

inputs = [
    torch.randn(1, dtype=torch.float64, device='cuda:0').as_strided((2, 24000), (0, 0)),
    1,
    20,
    torch.testing.make_tensor((24000,), dtype=torch.float64, device='cuda:0'),
    torch.testing.make_tensor((2, 24000), dtype=torch.float64, device='cuda:0'),
    torch.testing.make_tensor((2, 24000), dtype=torch.float64, device='cuda:0'),
]
fd.execute(inputs)
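
As an aside (not part of the repro itself): the first input is a single scalar viewed as a (2, 24000) tensor with zero strides, which appears to be why T0 is defined above with contiguity=[None, None], i.e. as a fully broadcast tensor. A minimal standalone sketch of what that input is:

import torch

# One element viewed as a (2, 24000) tensor with strides (0, 0): every
# position aliases the same underlying value.
base = torch.randn(1, dtype=torch.float64, device="cuda:0")
t0 = base.as_strided((2, 24000), (0, 0))

assert t0.stride() == (0, 0)
assert t0.shape == (2, 24000)
assert bool((t0 == base).all())  # all positions read the same scalar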

Traceback:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/nvfuser/__init__.py", line 238, in execute
    results = self._execute(
              ^^^^^^^^^^^^^^
RuntimeError:  INTERNAL ASSERT FAILED at "/home/tv/data/firma/grid/thunder/Fuser/csrc/scheduler/utils.cpp":1476, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Unable to find mapped root/logical domain
Exception raised from propagateSibling at /home/tv/data/firma/grid/thunder/Fuser/csrc/scheduler/utils.cpp:1476 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xca (0x7f1841b1853a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x3e (0x7f1841ea6b4e in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x888bb8 (0x7f1842288bb8 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #3: nvfuser::MaxInfoSpanningTree::traverse(nvfuser::MaxInfoSpanningTree::Propagator*) + 0x5a (0x7f184226da6a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #4: <unknown function> + 0x890cff (0x7f1842290cff in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x875fae (0x7f1842275fae in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #6: <unknown function> + 0x879e49 (0x7f1842279e49 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #7: <unknown function> + 0x87a962 (0x7f184227a962 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #8: <unknown function> + 0x851760 (0x7f1842251760 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x516635 (0x7f1841f16635 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x5256bb (0x7f1841f256bb in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x526d29 (0x7f1841f26d29 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #12: <unknown function> + 0x51de1e (0x7f1841f1de1e in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #13: nvfuser::SegmentCandidateFinder::SegmentCandidateFinder(std::unique_ptr<nvfuser::Fusion, std::default_delete<nvfuser::Fusion> >, nvfuser::KernelArgumentHolder const*, nvfuser::SegmentCandidateFinderOptions) + 0x3e2 (0x7f1841f1e5f2 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #14: <unknown function> + 0x51e755 (0x7f1841f1e755 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #15: <unknown function> + 0x51eb1c (0x7f1841f1eb1c in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #16: <unknown function> + 0x7bc6f8 (0x7f18421bc6f8 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #17: <unknown function> + 0x7b2de7 (0x7f18421b2de7 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #18: nvfuser::FusionExecutorCache::runFusionWithInputs(c10::ArrayRef<c10::IValue> const&, std::optional<nvfuser::PrimDataType>, std::optional<signed char>) + 0x9a (0x7f18421b385a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #19: nvfuser::python_frontend::FusionDefinition::execute(c10::ArrayRef<c10::IValue> const&, std::optional<signed char>, bool, bool, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >) const + 0xbb2 (0x7f1842346562 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #20: <unknown function> + 0x1782ee (0x7f1841b782ee in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #21: <unknown function> + 0x238d0b (0x7f1841c38d0b in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #22: <unknown function> + 0x28a62c (0x7f1841c8a62c in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #23: /usr/bin/python3() [0x57b5ce]
frame #24: _PyObject_MakeTpCall + 0x123 (0x543f83 in /usr/bin/python3)
frame #25: _PyEval_EvalFrameDefault + 0x82c (0x5cfafc in /usr/bin/python3)
frame #26: /usr/bin/python3() [0x547527]
frame #27: PyObject_Call + 0x119 (0x545dc9 in /usr/bin/python3)
frame #28: _PyEval_EvalFrameDefault + 0x4204 (0x5d34d4 in /usr/bin/python3)
frame #29: _PyObject_Call_Prepend + 0xc2 (0x5454e2 in /usr/bin/python3)
frame #30: /usr/bin/python3() [0x59d434]
frame #31: _PyObject_MakeTpCall + 0x6f (0x543ecf in /usr/bin/python3)
frame #32: _PyEval_EvalFrameDefault + 0x82c (0x5cfafc in /usr/bin/python3)
frame #33: /usr/bin/python3() [0x547466]
frame #34: torch::autograd::PyNode::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x7e (0x7f18dd12f25e in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #35: <unknown function> + 0x4e0bbab (0x7f18d440bbab in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #36: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0xde2 (0x7f18d4406c32 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #37: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x4cf (0x7f18d4407abf in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #38: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x32a (0x7f18d44000ea in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so)
frame #39: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x62 (0x7f18dd129e42 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #40: <unknown function> + 0xe0ea4 (0x7f18ddee0ea4 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #41: <unknown function> + 0x89dab (0x7f18e82e9dab in /lib/x86_64-linux-gnu/libc.so.6)
frame #42: <unknown function> + 0x10b9f8 (0x7f18e836b9f8 in /lib/x86_64-linux-gnu/libc.so.6)
t-vi (Contributor, Author) commented on Dec 18, 2024:

This is the fusion. If there are patterns we should avoid generating, I would appreciate a hint:

import torch
from thunder.executors.torchex import no_autocast

@torch.no_grad()
@no_autocast
def backward_fn(saved_for_backward, cotangents):
  # saved_for_backward: "Collection"
  # cotangents: "Collection"
  C0, C1, = saved_for_backward
  clear_mutable_collection(saved_for_backward)
  del saved_for_backward
  t0, = cotangents
  clear_mutable_collection(cotangents)
  del cotangents
  x, t_layer_norm_weight, input, = C0
  clear_mutable_collection(C0)
  del C0
  i67, i66, = C1
  clear_mutable_collection(C1)
  del C1
  [t118, t121, t152] = nvFusion0(t0, i67, i66, t_layer_norm_weight, input, x)
    # t112 = prims.mul(1.0, t0)  # t112: "cuda:0 f32[2, 24000]"
    # t114 = prims.mul(2.0, t112)  # t114: "cuda:0 f32[2, 24000]"
    # t60 = prims.uniform_philox((2, 24000), 0.0, 1.0, device=devices.Device("cuda:0"), dtype=dtypes.float32, seed=i66, offset=i67)  # t60: "cuda:0 f32[2, 24000]"
    # t61 = prims.lt(t60, 0.5)  # t61: "cuda:0 b8[2, 24000]"
    # t62 = prims.convert_element_type(t61, dtypes.float32)  # t62: "cuda:0 f32[2, 24000]"
    # t115 = prims.mul(t62, t114)  # t115: "cuda:0 f32[2, 24000]"
    # t118 = prims.sum(t115, (0,))  # t118: "cuda:0 f32[24000]"
    # t56 = prims.broadcast_in_dim(t_layer_norm_weight, (2, 24000), (1,))  # t56: "cuda:0 f32[2, 24000]"
    # t119 = prims.mul(t56, t115)  # t119: "cuda:0 f32[2, 24000]"
    # (t46, t47) = prims.var_mean(input, (1,), correction=0)
    # t49 = prims.broadcast_in_dim(t47, [2, 1], [0])  # t49: "cuda:0 f32[2, 1]"
    # t52 = prims.broadcast_in_dim(t49, (2, 24000), (0, 1))  # t52: "cuda:0 f32[2, 24000]"
    # t53 = prims.sub(input, t52)  # t53: "cuda:0 f32[2, 24000]"
    # t48 = prims.broadcast_in_dim(t46, [2, 1], [0])  # t48: "cuda:0 f32[2, 1]"
    # t50 = prims.add(t48, 1e-05)  # t50: "cuda:0 f32[2, 1]"
    # t51 = prims.rsqrt(t50)  # t51: "cuda:0 f32[2, 1]"
    # t54 = prims.broadcast_in_dim(t51, (2, 24000), (0, 1))  # t54: "cuda:0 f32[2, 24000]"
    # t55 = prims.mul(t53, t54)  # t55: "cuda:0 f32[2, 24000]"
    # t120 = prims.mul(t55, t115)  # t120: "cuda:0 f32[2, 24000]"
    # t121 = prims.sum(t120, (0,))  # t121: "cuda:0 f32[24000]"
    # t122 = prims.mul(t54, t119)  # t122: "cuda:0 f32[2, 24000]"
    # t123 = prims.mul(t53, t119)  # t123: "cuda:0 f32[2, 24000]"
    # t124 = prims.sum(t123, (1,))  # t124: "cuda:0 f32[2]"
    # t125 = prims.broadcast_in_dim(t124, [2, 1], [0])  # t125: "cuda:0 f32[2, 1]"
    # t126 = prims.neg(t122)  # t126: "cuda:0 f32[2, 24000]"
    # t127 = prims.sum(t126, (1,))  # t127: "cuda:0 f32[2]"
    # t128 = prims.broadcast_in_dim(t127, [2, 1], [0])  # t128: "cuda:0 f32[2, 1]"
    # t129 = prims.mul(-0.5, t125)  # t129: "cuda:0 f32[2, 1]"
    # t130 = prims.pow(t51, 3.0)  # t130: "cuda:0 f32[2, 1]"
    # t131 = prims.mul(t129, t130)  # t131: "cuda:0 f32[2, 1]"
    # t133 = prims.sum(t128, (1,))  # t133: "cuda:0 f32[2]"
    # t134 = prims.sum(t131, (1,))  # t134: "cuda:0 f32[2]"
    # t137 = prims.broadcast_in_dim(t133, [2, 1], [0])  # t137: "cuda:0 f32[2, 1]"
    # t138 = prims.broadcast_in_dim(t137, (2, 24000), (0, 1))  # t138: "cuda:0 f32[2, 24000]"
    # t139 = prims.mul(4.1666666666666665e-05, t138)  # t139: "cuda:0 f32[2, 24000]"
    # t140 = prims.broadcast_in_dim(t134, [2, 1], [0])  # t140: "cuda:0 f32[2, 1]"
    # t141 = prims.broadcast_in_dim(t140, (2, 24000), (0, 1))  # t141: "cuda:0 f32[2, 24000]"
    # t145 = prims.mul(2.0, t141)  # t145: "cuda:0 f32[2, 24000]"
    # t147 = prims.mul(t145, t53)  # t147: "cuda:0 f32[2, 24000]"
    # t148 = prims.div(t147, 24000.0)  # t148: "cuda:0 f32[2, 24000]"
    # t149 = prims.add(t139, t148)  # t149: "cuda:0 f32[2, 24000]"
    # t150 = prims.add(t122, t149)  # t150: "cuda:0 f32[2, 24000]"
    # t42 = prims.gt(x, 0.0)  # t42: "cuda:0 b8[2, 24000]"
    # t152 = prims.where(t42, t150, 0.0)  # t152: "cuda:0 f32[2, 24000]"
  del t0, i67, i66, t_layer_norm_weight, input, x
  return (t152, t118, t121)
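
For anyone trying to place this trace: read backwards, the cotangent t0 first goes through a dropout backward (mask from uniform < 0.5, scale 2.0), then a layer_norm backward (the var_mean / rsqrt / add(1e-05) block), and finally a relu-style backward (where(x > 0, ..., 0.0)). The following is only a rough sketch of the forward computation this backward plausibly corresponds to, reconstructed from the trace rather than taken from the actual Thunder program; the function name and the bias argument are assumptions:

import torch

def forward_sketch(x, t_layer_norm_weight, bias):
    # relu backward is the last step of the trace above, so relu is presumably
    # the first op in the forward pass
    h = torch.relu(x)
    # layer_norm over the last dimension (24000) with eps=1e-05, matching the
    # var_mean / rsqrt block; the bias is inferred from t118 (grad summed over dim 0)
    h = torch.nn.functional.layer_norm(
        h, (24000,), weight=t_layer_norm_weight, bias=bias, eps=1e-5
    )
    # dropout with p=0.5: the backward masks with uniform < 0.5 and scales by 2.0
    return torch.nn.functional.dropout(h, p=0.5)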

naoyam added a commit that referenced this issue on Dec 19, 2024.

naoyam (Collaborator) commented on Dec 19, 2024:

IIUC, this seems to be a simple mistake we have in the transpose scheduler. I'm surprised we haven't had this error before. #3619

naoyam mentioned this issue on Dec 19, 2024.
naoyam closed this as completed in 6fe1865 on Dec 19, 2024.