Commit e604c11

feat: update torch and modify cuda graphs (#220)
1 parent 07cf0c6 commit e604c11

File tree: 8 files changed, +35 −68 lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 &&
 
 RUN python3.9 -m ensurepip --default-pip --upgrade
 
-RUN pip install --pre torch==1.14.0.dev20221029+cu117 --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+RUN pip install --pre torch==2.0.0.dev20221214+cu117 --extra-index-url https://download.pytorch.org/whl/nightly/cu117
 
 
 WORKDIR /syncback

requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 triton==2.0.0.dev20221202
-torch==1.14.0.dev20221029+cu117
+torch==2.0.0.dev20221214+cu117
 pytest
 tabulate
 termcolor

src/kernl/implementations/cuda_graph.py

Lines changed: 22 additions & 42 deletions

@@ -12,55 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import os
+
 from typing import Callable, Union
 
 import torch
+from torch._inductor.compile_fx import cudagraphify_impl
+from torch._inductor.utils import dynamo_utils
+from torch._subclasses import FakeTensor
 
 
-def cuda_graphs_wrapper(
-    model: Callable,
-    inputs: Union[list[torch.Tensor], tuple[torch.Tensor]],
-    copy_outputs: bool = False,
-    pool: (int, int) = torch.cuda.graph_pool_handle(),
-):
-    """
-    From torchdynamo
-    """
-    assert isinstance(inputs, (list, tuple)), f"inputs is of type {type(inputs)} instead of list"
-
-    # required warmup, not just for perf but for correctness
-    torch.cuda.synchronize()
-    stream = torch.cuda.Stream()
-    stream.wait_stream(torch.cuda.current_stream())
-    with torch.cuda.stream(stream):
-        # 2 rounds, 1 to build the model (triton kernels, casting, etc.),
-        # and 1 for warmup
-        for _ in range(2):
-            model(*inputs)
-        stream.synchronize()
-    torch.cuda.current_stream().wait_stream(stream)
-    torch.cuda.synchronize()
-    # copy inputs after executing the warmup in case it mutates them at the first iteration
-    static_inputs = [torch.zeros_like(x) for x in inputs]
+def cuda_graphs_wrapper(model: Callable, inputs: Union[list[torch.Tensor], tuple[torch.Tensor]]):
+    assert isinstance(inputs, (list, tuple))
+    # if using fake tensors, defer cudagraphs until we get real inputs at runtime
+    if not any(isinstance(inp, FakeTensor) for inp in inputs):
+        model(*inputs)  # additional warmup needed when input is mutated by some kernel
+        f = cudagraphify_impl(lambda args: model(*args), inputs)
+        return lambda args: f(list(args))
 
-    # record
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph, stream=stream, pool=pool):
-        static_outputs = model(*static_inputs)
-    if not isinstance(static_outputs, (list, tuple)):
-        static_outputs = (static_outputs,)
+    compiled_fn = None
 
     def run(*new_inputs):
-        if "PYTEST_CURRENT_TEST" not in os.environ:  # for benchmarks, we may want to avoid input copy overhead
-            assert isinstance(new_inputs, (list, tuple)), f"inputs is of type {type(new_inputs)} instead of list"
-            assert len(static_inputs) == len(new_inputs), f"{len(static_inputs)} == {len(new_inputs)}"
-            for dst, src in zip(static_inputs, new_inputs):
-                dst.copy_(src)  # cuda graph can only read data from the same address
-        graph.replay()
-        if copy_outputs:
-            return [x.clone() for x in static_outputs]
-        else:
-            return static_outputs
+        nonlocal compiled_fn
+        if compiled_fn is None:
+            with dynamo_utils.preserve_rng_state():
+                model(*new_inputs)  # additional warmup needed when input is mutated by some kernel
+                f = cudagraphify_impl(lambda args: model(*args), new_inputs)
+
+                def compiled_fn(args):
+                    return f(list(args))
+
+        return compiled_fn(new_inputs)
 
     return run
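
The rewritten wrapper still returns outputs wrapped in a tuple/list, but changes the input convention: callers now pass inputs as a single list instead of unpacking them positionally, which is why the tests below move from run(tensor) to run([tensor]). A minimal usage sketch; the add_one toy model and tensor shape are illustrative, not part of the commit:

import torch

from kernl.implementations.cuda_graph import cuda_graphs_wrapper


def add_one(x: torch.Tensor) -> torch.Tensor:
    return x + 1


x = torch.ones(8, device="cuda")
# with real (non-fake) inputs, warmup and graph capture happen here via cudagraphify_impl
run = cuda_graphs_wrapper(add_one, [x])
# inputs go in as one list; the output comes back wrapped in a tuple/list
out = run([x])[0]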

src/kernl/model_optimization.py

Lines changed: 1 addition & 5 deletions

@@ -22,15 +22,11 @@
 from kernl.optimizer.dynamo_backend import dynamo_backend_ofi
 
 
-# single shared pool by default
-_pool: (int, int) = torch.cuda.graph_pool_handle()
-
-
 # needs to be generated once to be reused several times, like encoder/decoder models
 # https://github.com/pytorch/torchdynamo/issues/1816
 def _compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
     dynamo_backend_ofi(gm)
-    return cuda_graphs_wrapper(gm, example_inputs, pool=_pool)
+    return cuda_graphs_wrapper(gm, example_inputs)
 
 
 def optimize_model(original_model: PreTrainedModel) -> None:
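
_compiler is the backend that optimize_model hands to TorchDynamo, so dropping the shared pool handle leaves the public entry point a single call. A sketch of end-to-end usage, assuming kernl's usual inference setup; the model name and input shapes are illustrative:

import torch
from transformers import AutoModel

from kernl.model_optimization import optimize_model

model = AutoModel.from_pretrained("bert-base-uncased").eval().cuda()
optimize_model(model)  # installs the dynamo backend built on _compiler

inputs = {
    "input_ids": torch.randint(0, 25_000, (1, 128), device="cuda"),
    "attention_mask": torch.ones((1, 128), dtype=torch.int64, device="cuda"),
}
with torch.inference_mode(), torch.cuda.amp.autocast():
    outputs = model(**inputs)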

test/models/bert.py

Lines changed: 1 addition & 3 deletions

@@ -17,7 +17,6 @@
 
 import torch
 import torch._dynamo as torchdynamo
-from torch._dynamo.optimizations import BACKENDS
 from transformers import AutoModel
 
 from kernl.implementations.cuda_graph import cuda_graphs_wrapper

@@ -35,8 +34,7 @@ def get_model_baseline(base):
 
 def get_model_dynamo_cuda_graphs(base):
     def compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-        compiled = BACKENDS["cudagraphs"](gm, example_inputs)
-        return compiled
+        return cuda_graphs_wrapper(gm, example_inputs)
 
     @torchdynamo.optimize(compiler)
     def run(*args, **kwargs):

test/test_attention.py

Lines changed: 3 additions & 4 deletions

@@ -191,7 +191,6 @@ def test_benchmark_skinny_cross_attention(benchmark, implementation, shape):
     v = torch.rand_like(k)
     sm_scale = 0.3
 
-    p = torch.cuda.graph_pool_handle()
     expected = attention_reference(
         q=q.float(),
         k=k.float(),

@@ -203,8 +202,8 @@ def test_benchmark_skinny_cross_attention(benchmark, implementation, shape):
     )
     output = torch.empty_like(q)
     fn = implementations_skinny_cross_attention[implementation](output, sm_scale)
-    r = cuda_graphs_wrapper(fn, [q, k, v], pool=p)
-    _ = r(q, k, v)[0]
-    result = benchmark(r)[0]
+    r = cuda_graphs_wrapper(fn, [q, k, v])
+    _ = r([q, k, v])[0]
+    result = benchmark(r, [q, k, v])[0]
 
     assert_all_close(a=expected, b=result.float(), atol=1e-2)

test/test_layer_norm.py

Lines changed: 4 additions & 4 deletions

@@ -65,9 +65,9 @@ def test_benchmark_layer_norm(benchmark, shape: int, dtype, cuda_graphs: bool, i
 
     fn = implementations_layer_norm[implementation](layer_weight, layer_bias, eps)
     if cuda_graphs:
-        run = cuda_graphs_wrapper(model=fn, inputs=[x], copy_outputs=False)
+        run = cuda_graphs_wrapper(model=fn, inputs=[x])
         # CUDA graphs wraps output in a tuple
-        fn = lambda tensor: run(tensor)[0]  # noqa: E731
+        fn = lambda tensor: run([tensor])[0]  # noqa: E731
 
     value = benchmark(fn, x)
     assert_all_close(value.float(), expected, atol=1e-1)

@@ -99,9 +99,9 @@ def test_benchmark_rms_norm(benchmark, shape: int, dtype, cuda_graphs: bool, imp
 
     fn = implementations_rms_norm[implementation](layer_weight, eps)
     if cuda_graphs:
-        run = cuda_graphs_wrapper(model=fn, inputs=[x], copy_outputs=False)
+        run = cuda_graphs_wrapper(model=fn, inputs=[x])
         # CUDA graphs wraps output in a tuple
-        fn = lambda tensor: run(tensor)[0]  # noqa: E731
+        fn = lambda tensor: run([tensor])[0]  # noqa: E731
 
     value = benchmark(fn, x)
     assert_all_close(value.float(), expected, atol=1e-1)

test/test_linear_layer.py

Lines changed: 2 additions & 8 deletions

@@ -24,11 +24,6 @@
 from kernl.implementations.linear_layer import linear_layer
 
 
-@pytest.fixture
-def cuda_graphs_pool() -> (int, int):
-    return torch.cuda.graph_pool_handle()
-
-
 def get_pytorch_activation(activation: str) -> Callable:
     if activation == "gelu":
         return torch.nn.functional.gelu

@@ -71,7 +66,6 @@ def test_benchmark(
     bias: bool,
     activation: str,
     contiguous: bool,
-    cuda_graphs_pool: (int, int),
 ):
     batch, M, N, K = shape
 

@@ -96,9 +90,9 @@ def test_benchmark(
 
     fn = implementations[implementation](layer_weight, layer_bias, activation)
     if cuda_graphs:
-        run = cuda_graphs_wrapper(model=fn, inputs=[x], pool=cuda_graphs_pool)
+        run = cuda_graphs_wrapper(model=fn, inputs=[x])
         # CUDA graphs wraps output in a tuple
-        fn = lambda tensor: run(tensor)[0]  # noqa: E731
+        fn = lambda tensor: run([tensor])[0]  # noqa: E731
 
     value = benchmark(fn, x)
 
