[Hexagon] Initial support for meta schedule tuning (#12587)

masahi · web-flow · commit d87fa854b8eb · 2022-08-26T10:01:24.000-07:00
Enables AutoTVM-style, template-based tuning for Hexagon.

To run compiled code on Hexagon, we need to use the Hexagon `Session` object https://github.com/apache/tvm/blob/dc522a6ff65b68532cd1bba43827cd981114df2c/python/tvm/contrib/hexagon/session.py#L35 in the meta schedule `RPCRunner`. But for the RPC "session", `RPCRunner` expects an instance of `RPCSession`, https://github.com/apache/tvm/blob/53fe5966823eee4e011d7228bceab3c82c1d9caa/python/tvm/rpc/client.py#L32, to be created and used by various customizable functions. Since `RPCSession` and the Hexagon `Session` have slightly different APIs, we cannot use `RPCRunner` with customizable functions directly. So I introduced an alternative implementation of `RPCRunner` for Hexagon.

The test is disabled on the simulator since `HexagonLauncherSimulator` is not pickle-able due to its `multiprocessing.Process` attribute: https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614

Output log from tuning `vrmpy` dense (included in the test):

```
 ID | Name |      FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Terminated
--------------------------------------------------------------------------------------------------------------
  0 | main | 150994944 |      1 |       380.3399 |     397.0000 |              397.0000 |     32 |
--------------------------------------------------------------------------------------------------------------
```
diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
@@ -87,6 +87,7 @@ ExternalProject_Add(android_tvm_runtime_rpc
     "-DUSE_HEXAGON_RPC=ON"
     "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
     "-DUSE_ALTERNATIVE_LINKER=OFF"
+    "-DUSE_RANDOM=ON"
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
 )
@@ -133,6 +134,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
     "-DUSE_ALTERNATIVE_LINKER=OFF"
     "-DUSE_CUSTOM_LOGGING=ON"
     "-DUSE_HEXAGON_QHL=ON"
+    "-DUSE_RANDOM=ON"
     "${GTEST_FLAG}"
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Meta schedule tuning utilities for Hexagon."""
+import os
+import tempfile
+from typing import Callable, List, Optional
+from tvm.contrib.popen_pool import PopenPoolExecutor
+from tvm.meta_schedule.utils import cpu_count, derived_object
+from tvm.meta_schedule.builder import LocalBuilder
+from tvm.meta_schedule.runner import (
+    EvaluatorConfig,
+    RunnerInput,
+    RunnerFuture,
+    PyRunner,
+)
+from tvm.meta_schedule.runner.rpc_runner import (
+    default_alloc_argument,
+    default_run_evaluator,
+    RPCRunnerFuture,
+)
+
+from .build import HexagonLauncherRPC
+from .tools import export_module
+
+
+@derived_object
+class HexagonRPCRunner(PyRunner):
+    """RPCRunner for Hexagon. See the documentation of RPCRunner for more details.
+
+    Every runner input is submitted as a separate job to a ``PopenPoolExecutor``;
+    the job opens its own Hexagon session through ``hexagon_launcher``.
+    """
+
+    def __init__(
+        self,
+        hexagon_launcher: HexagonLauncherRPC,
+        evaluator_config: Optional[EvaluatorConfig] = None,
+        cooldown_sec: float = 0.0,
+        alloc_repeat: int = 1,
+        max_workers: Optional[int] = None,
+        initializer: Optional[Callable[[], None]] = None,
+        timeout_sec: float = 100,
+    ):
+        """
+        Parameters
+        ----------
+        hexagon_launcher : HexagonLauncherRPC
+            The RPC launcher for Hexagon. It is needed for creating hexagon.Session
+            object inside the worker function.
+        evaluator_config: EvaluatorConfig
+            The evaluator configuration.
+        cooldown_sec: float
+            The cooldown in seconds. It is stored on the runner but is not
+            forwarded to the worker function.
+        alloc_repeat: int
+            The number of times to random fill the allocation.
+        max_workers: Optional[int] = None
+            The maximum number of connections. Defaults to number of logical CPU cores.
+        initializer: Optional[Callable[[], None]]
+            The initializer function.
+        timeout_sec: float
+            The timeout in seconds, applied both to the worker pool and to every
+            future returned by `run`. Defaults to 100.
+        """
+
+        super().__init__()
+        self.hexagon_launcher = hexagon_launcher
+        self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
+        self.cooldown_sec = cooldown_sec
+        self.alloc_repeat = alloc_repeat
+        # Keep one value so the pool and the futures can never disagree on the timeout.
+        self.timeout_sec = timeout_sec
+        if max_workers is None:
+            max_workers = cpu_count(logical=True)
+        self.pool = PopenPoolExecutor(
+            max_workers=max_workers,
+            timeout=timeout_sec,
+            initializer=initializer,
+        )
+
+    def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
+        """Submit one measurement job per runner input.
+
+        Parameters
+        ----------
+        runner_inputs : List[RunnerInput]
+            The inputs to measure. Only `artifact_path` and `args_info` are read.
+
+        Returns
+        -------
+        futures : List[RunnerFuture]
+            One future per input, in the same order as `runner_inputs`.
+        """
+        return [
+            RPCRunnerFuture(
+                future=self.pool.submit(
+                    _worker_func,
+                    self.hexagon_launcher,
+                    self.evaluator_config,
+                    self.alloc_repeat,
+                    # Path and argument info are converted to str / JSON before submission.
+                    str(runner_input.artifact_path),
+                    tuple(arg_info.as_json() for arg_info in runner_input.args_info),
+                ),
+                timeout_sec=self.timeout_sec,
+            )
+            for runner_input in runner_inputs
+        ]
+
+
+def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
+    """Run one measurement inside a Hexagon session and return the evaluator's costs.
+
+    The artifact at `artifact_path` is uploaded under its file name, loaded as a
+    module, and evaluated with arguments allocated from `args_info`.
+    """
+    with hexagon_launcher.start_session() as session:
+        device = session.device
+        # The remote file keeps only the final path component of the local artifact.
+        remote_name = os.path.basename(artifact_path)
+        remote_file = session.upload(artifact_path, remote_name)
+        remote_module = session.load_module(remote_file)
+        allocated_args = default_alloc_argument(session, device, args_info, alloc_repeat)
+        costs = default_run_evaluator(
+            session, remote_module, device, evaluator_config, allocated_args
+        )
+    return costs
+
+
+def get_hexagon_local_builder():
+    """Return Hexagon-compatible Builder for meta schedule.
+
+    The returned `LocalBuilder` exports each module with `export_module` into a
+    directory created by `tempfile.mkdtemp`.
+    """
+
+    def export_func(mod):
+        # mkdtemp creates a new directory on every call, one per exported module.
+        out_dir = tempfile.mkdtemp()
+        exported = export_module(mod, out_dir)
+        return str(exported)
+
+    return LocalBuilder(f_export=export_func)
+
+
+def get_hexagon_rpc_runner(
+    hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100
+):
+    """Return Hexagon-compatible RPC Runner for meta schedule.
+
+    Parameters
+    ----------
+    hexagon_launcher : HexagonLauncherRPC
+        The RPC launcher for Hexagon.
+    number: int
+        The number of times to run this function for taking average.
+        These runs together form one `repeat` of the measurement.
+    repeat: int
+        The number of times to repeat the measurement.
+        In total, the function will be invoked (1 + number x repeat) times,
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int
+        Minimum repeat time in ms. If the execution latency is too short,
+        increase the number of runs to the given time (in ms) to reduce the measurement error.
+
+    Returns
+    -------
+    runner : HexagonRPCRunner
+        A runner built from `hexagon_launcher` and an `EvaluatorConfig` holding the
+        values above, with CPU cache flushing disabled.
+    """
+    return HexagonRPCRunner(
+        hexagon_launcher,
+        EvaluatorConfig(
+            number=number,
+            repeat=repeat,
+            min_repeat_ms=min_repeat_ms,
+            enable_cpu_cache_flush=False,
+        ),
+    )
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
@@ -30,6 +30,7 @@
     AOTExecutorFactoryModule,
     GraphExecutorFactoryModule,
 )
+from .tools import export_module
 
 
 class Session:
@@ -110,6 +111,9 @@ def device(self):
 
         return self._device
 
+    def get_function(self, name):
+        """Look up `name` through the session's `_rpc` object and return the result."""
+        rpc_lookup = self._rpc.get_function
+        return rpc_lookup(name)
+
     def upload(self, local_path: Union[str, pathlib.Path], remote_filename: str) -> pathlib.Path:
         """Upload a local file to the remote workspace.
 
@@ -154,10 +158,8 @@ def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module]):
 
         if isinstance(module, tvm.runtime.Module):
             with tempfile.TemporaryDirectory() as temp_dir:
-                temp_dir = pathlib.Path(temp_dir)
                 binary_name = "test_binary.so"
-                binary_path = temp_dir / binary_name
-                module.save(str(binary_path))
+                binary_path = export_module(module, temp_dir, binary_name)
                 remote_file_path = self.upload(binary_path, binary_name)
         else:
             remote_file_path = module
diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py
@@ -194,3 +194,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st
     cross_compile.output_format = "o"
     c_files = [str(file) for file in files]
     cross_compile(str(so_name), c_files, options=compile_options + options)
+
+
+def export_module(module, out_dir, binary_name="test_binary.so"):
+    """Save `module` as `binary_name` inside `out_dir` and return the resulting path.
+
+    The module is written through `module.save`; the return value is a
+    `pathlib.Path` pointing at the saved file.
+    """
+    destination = pathlib.Path(out_dir).joinpath(binary_name)
+    module.save(str(destination))
+    return destination
diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py
@@ -178,7 +178,7 @@ def schedule_rules(  # pylint: disable=redefined-outer-name
         return sch_rules()
     if sch_rules is not None:
         raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}")
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.schedule_rules()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.schedule_rules()
@@ -194,7 +194,7 @@ def postproc(  # pylint: disable=redefined-outer-name
         return postproc()
     if postproc is not None:
         raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}")
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.postprocs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.postprocs()
@@ -212,7 +212,7 @@ def mutator_probs(  # pylint: disable=redefined-outer-name
         raise TypeError(
             f"Expected `mutator_probs` to be None or callable, but gets: {mutator_probs}"
         )
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.mutator_probs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.mutator_probs()
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
@@ -636,6 +636,8 @@ def hexagon(cpu_ver="v66", **kwargs):
         Whether to use QFloat HVX instructions.
     use_ieee_fp : bool (default: False)
         Whether to use IEEE HVX instructions
+    num_cores : int (default: 4)
+        The number of HVX threads. This attribute is required by meta scheduler.
 
     Note: Floating point support in HVX requires LLVM 14+.
     """
@@ -740,6 +742,9 @@ def create_llvm_options(cpu_ver, config):  # pylint: disable=unused-argument
 
     args_list = target_str.split() + llvm_str.split()
 
+    num_cores = config["num_cores"] if "num_cores" in kwargs else 4
+    args_list.append("--num-cores=%d" % num_cores)
+
     return Target(" ".join(["hexagon"] + args_list))
 
 
diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py
@@ -16,4 +16,4 @@
 # under the License.
 # pylint: disable=unused-import
 """Intrinsics for tensorization."""
-from . import arm_cpu, cuda, rocm, x86
+from . import arm_cpu, cuda, rocm, x86, hexagon
diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring
+"""Intrinsics for Hexagon tensorization."""
+from tvm.script import tir as T
+from .. import TensorIntrin
+
+
+# Loop-nest form of the 32x4 uint8 dot product. It is registered further down, together
+# with dot_product_32x4_u8u8i32_vrmpy, under VRMPY_u8u8i32_INTRIN.
+# For every output lane i in [0, 32): C[i] += sum_k int32(A[k]) * int32(B[i, k]), k in [0, 4).
+@T.prim_func
+def dot_product_32x4_u8u8i32_desc(
+    A: T.Buffer((4,), "uint8", offset_factor=1),
+    B: T.Buffer((32, 4), "uint8", offset_factor=1),
+    C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+    with T.block("root"):
+        # C is both read and written because the update accumulates into it.
+        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+        T.writes(C[0:32])
+        for i in T.serial(0, 32):
+            with T.init():
+                C[i] = T.int32(0)
+            for k in T.serial(0, 4):
+                with T.block("update"):
+                    # "SR": i is bound as a spatial axis, k as a reduction axis.
+                    vi, vk = T.axis.remap("SR", [i, k])
+                    # Both uint8 operands are widened to int32 before multiplying.
+                    C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+
+# Implementation of the 32x4 uint8 dot product using a single call to the LLVM intrinsic
+# "llvm.hexagon.V6.vrmpyub.acc.128B". It takes the same buffers as
+# dot_product_32x4_u8u8i32_desc and is registered with it further down.
+@T.prim_func
+def dot_product_32x4_u8u8i32_vrmpy(
+    A: T.Buffer((4,), "uint8", offset_factor=1),
+    B: T.Buffer((32, 4), "uint8", offset_factor=1),
+    C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+    with T.block("root"):
+        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+        T.writes(C[0:32])
+
+        # Load the four uint8 values of A as one vector and view the same bits as one int32.
+        A_u8x4 = A.vload([0], "uint8x4")
+        A_i32 = T.reinterpret(A_u8x4, dtype="int32")
+
+        # Load all 128 uint8 values of B and view the same bits as 32 int32 lanes.
+        # NOTE: the local is named "i8" but the loaded dtype is "uint8x128".
+        B_i8x128 = B.vload([0, 0], dtype="uint8x128")
+        B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+
+        # The current C[0:32] is passed as an operand and the int32x32 result is written
+        # back to C[0:32].
+        C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+            # presumably the count of intrinsic operands that follow (C, B, A) - TODO confirm
+            T.uint32(3),
+            C[T.ramp(T.int32(0), 1, 32)],
+            B_i32x32,
+            A_i32,
+            dtype="int32x32",
+        )
+
+
+# Name under which the vrmpy dot-product tensor intrinsic is registered.
+VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
+
+# Register the `_desc` function together with the `_vrmpy` function under that name.
+# This runs at import time of this module.
+TensorIntrin.register(
+    VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
+)
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
@@ -417,6 +417,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("mtriple")
     .add_attr_option<Array<String>>("llvm-options")
+    .add_attr_option<Integer>("num-cores")
     .set_default_keys({"hexagon"});
 
 TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py