Skip to content

Commit d87fa85

Browse files
authored
[Hexagon] Initial support for meta schedule tuning (#12587)
Enables AutoTVM-style, template-based tuning for Hexagon. To run compiled code on Hexagon, we need to use Hexagon `Session` object https://github.com/apache/tvm/blob/dc522a6ff65b68532cd1bba43827cd981114df2c/python/tvm/contrib/hexagon/session.py#L35 in the metaschedule `RPCRunner`. But for RPC "session", `RPCRunner` expects an instance of `RPCSession`, https://github.com/apache/tvm/blob/53fe5966823eee4e011d7228bceab3c82c1d9caa/python/tvm/rpc/client.py#L32, to be created and used by various customizable functions. Since `RPCSession` and Hexagon `Session` have slightly different API, we cannot use `RPCRunner` with customizable functions directly. So I introduced an alternative implementation of `RPCRunner` for Hexagon. The test is disabled for simulator since `HexagonLauncherSimulator` is not pickle-able due to its `multiprocessing.Process` attribute: https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614 Output log from tuning `vrmpy` dense (included in the test) ``` ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Terminated -------------------------------------------------------------------------------------------------------------- 0 | main | 150994944 | 1 | 380.3399 | 397.0000 | 397.0000 | 32 | -------------------------------------------------------------------------------------------------------------- ```
1 parent d171b4a commit d87fa85

File tree

10 files changed

+472
-7
lines changed

10 files changed

+472
-7
lines changed

apps/hexagon_api/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ ExternalProject_Add(android_tvm_runtime_rpc
8787
"-DUSE_HEXAGON_RPC=ON"
8888
"-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
8989
"-DUSE_ALTERNATIVE_LINKER=OFF"
90+
"-DUSE_RANDOM=ON"
9091
INSTALL_COMMAND ""
9192
BUILD_ALWAYS ON
9293
)
@@ -133,6 +134,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
133134
"-DUSE_ALTERNATIVE_LINKER=OFF"
134135
"-DUSE_CUSTOM_LOGGING=ON"
135136
"-DUSE_HEXAGON_QHL=ON"
137+
"-DUSE_RANDOM=ON"
136138
"${GTEST_FLAG}"
137139
INSTALL_COMMAND ""
138140
BUILD_ALWAYS ON
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Meta schedule tuning utilities for Hexagon."""
18+
import os
19+
import tempfile
20+
from typing import Callable, List, Optional
21+
from tvm.contrib.popen_pool import PopenPoolExecutor
22+
from tvm.meta_schedule.utils import cpu_count, derived_object
23+
from tvm.meta_schedule.builder import LocalBuilder
24+
from tvm.meta_schedule.runner import (
25+
EvaluatorConfig,
26+
RunnerInput,
27+
RunnerFuture,
28+
PyRunner,
29+
)
30+
from tvm.meta_schedule.runner.rpc_runner import (
31+
default_alloc_argument,
32+
default_run_evaluator,
33+
RPCRunnerFuture,
34+
)
35+
36+
from .build import HexagonLauncherRPC
37+
from .tools import export_module
38+
39+
40+
@derived_object
41+
class HexagonRPCRunner(PyRunner):
42+
"""RPCRunner for Hexagon. See the documentation of RPCRunner for more details."""
43+
44+
def __init__(
45+
self,
46+
hexagon_launcher: HexagonLauncherRPC,
47+
evaluator_config: Optional[EvaluatorConfig] = None,
48+
cooldown_sec: float = 0.0,
49+
alloc_repeat: int = 1,
50+
max_workers: Optional[int] = None,
51+
initializer: Optional[Callable[[], None]] = None,
52+
):
53+
"""
54+
Parameters
55+
----------
56+
hexagon_launcher : HexagonLauncherRPC
57+
The RPC launcher for Hexagon. It is needed for creating hexagon.Session
58+
object inside the worker function.
59+
evaluator_config: EvaluatorConfig
60+
The evaluator configuration.
61+
cooldown_sec: float
62+
The cooldown in seconds.
63+
alloc_repeat: int
64+
The number of times to random fill the allocation.
65+
max_workers: Optional[int] = None
66+
The maximum number of connections. Defaults to number of logical CPU cores.
67+
initializer: Optional[Callable[[], None]]
68+
The initializer function.
69+
"""
70+
71+
super().__init__()
72+
self.hexagon_launcher = hexagon_launcher
73+
self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
74+
self.cooldown_sec = cooldown_sec
75+
self.alloc_repeat = alloc_repeat
76+
if max_workers is None:
77+
max_workers = cpu_count(logical=True)
78+
self.pool = PopenPoolExecutor(
79+
max_workers=max_workers,
80+
timeout=100,
81+
initializer=initializer,
82+
)
83+
84+
def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
85+
results = []
86+
for runner_input in runner_inputs:
87+
future = RPCRunnerFuture(
88+
future=self.pool.submit(
89+
_worker_func,
90+
self.hexagon_launcher,
91+
self.evaluator_config,
92+
self.alloc_repeat,
93+
str(runner_input.artifact_path),
94+
tuple(arg_info.as_json() for arg_info in runner_input.args_info),
95+
),
96+
timeout_sec=100,
97+
)
98+
results.append(future)
99+
return results
100+
101+
102+
def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
103+
with hexagon_launcher.start_session() as session:
104+
device = session.device
105+
_, remote_path = os.path.split(artifact_path)
106+
uploaded = session.upload(artifact_path, remote_path)
107+
rt_mod = session.load_module(uploaded)
108+
repeated_args = default_alloc_argument(
109+
session,
110+
device,
111+
args_info,
112+
alloc_repeat,
113+
)
114+
costs = default_run_evaluator(
115+
session,
116+
rt_mod,
117+
device,
118+
evaluator_config,
119+
repeated_args,
120+
)
121+
return costs
122+
123+
124+
def get_hexagon_local_builder():
125+
"""Return Hexagon-compatible Builder for meta schedule."""
126+
127+
def export_func(mod):
128+
binary_path = export_module(mod, tempfile.mkdtemp())
129+
return str(binary_path)
130+
131+
return LocalBuilder(f_export=export_func)
132+
133+
134+
def get_hexagon_rpc_runner(
135+
hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100
136+
):
137+
"""Return Hexagon-compatible RPC Runner for meta schedule.
138+
139+
Parameters
140+
----------
141+
hexagon_launcher : HexagonLauncherRPC
142+
The RPC launcher for Hexagon.
143+
number: int
144+
The number of times to run this function for taking average.
145+
We call these runs as one `repeat` of measurement.
146+
repeat: int
147+
The number of times to repeat the measurement.
148+
In total, the function will be invoked (1 + number x repeat) times,
149+
where the first one is warm up and will be discarded.
150+
The returned result contains `repeat` costs,
151+
each of which is an average of `number` costs.
152+
min_repeat_ms: int
153+
Minimum repeat time in ms. if the execution latency is too short,
154+
increase the number of runs to the given time (in ms) to reduce the measurement error.
155+
"""
156+
evaluator_config = EvaluatorConfig(
157+
number=number,
158+
repeat=repeat,
159+
min_repeat_ms=min_repeat_ms,
160+
enable_cpu_cache_flush=False,
161+
)
162+
163+
return HexagonRPCRunner(
164+
hexagon_launcher,
165+
evaluator_config,
166+
)

python/tvm/contrib/hexagon/session.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
AOTExecutorFactoryModule,
3131
GraphExecutorFactoryModule,
3232
)
33+
from .tools import export_module
3334

3435

3536
class Session:
@@ -110,6 +111,9 @@ def device(self):
110111

111112
return self._device
112113

114+
def get_function(self, name):
115+
return self._rpc.get_function(name)
116+
113117
def upload(self, local_path: Union[str, pathlib.Path], remote_filename: str) -> pathlib.Path:
114118
"""Upload a local file to the remote workspace.
115119
@@ -154,10 +158,8 @@ def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module]):
154158

155159
if isinstance(module, tvm.runtime.Module):
156160
with tempfile.TemporaryDirectory() as temp_dir:
157-
temp_dir = pathlib.Path(temp_dir)
158161
binary_name = "test_binary.so"
159-
binary_path = temp_dir / binary_name
160-
module.save(str(binary_path))
162+
binary_path = export_module(module, temp_dir, binary_name)
161163
remote_file_path = self.upload(binary_path, binary_name)
162164
else:
163165
remote_file_path = module

python/tvm/contrib/hexagon/tools.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st
194194
cross_compile.output_format = "o"
195195
c_files = [str(file) for file in files]
196196
cross_compile(str(so_name), c_files, options=compile_options + options)
197+
198+
199+
def export_module(module, out_dir, binary_name="test_binary.so"):
200+
"""Export Hexagon shared object to a file."""
201+
binary_path = pathlib.Path(out_dir) / binary_name
202+
module.save(str(binary_path))
203+
return binary_path

python/tvm/meta_schedule/default_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def schedule_rules( # pylint: disable=redefined-outer-name
178178
return sch_rules()
179179
if sch_rules is not None:
180180
raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}")
181-
if target.kind.name == "llvm":
181+
if target.kind.name in ["llvm", "hexagon"]:
182182
return _DefaultLLVM.schedule_rules()
183183
if target.kind.name in ["cuda", "rocm", "vulkan"]:
184184
return _DefaultCUDA.schedule_rules()
@@ -194,7 +194,7 @@ def postproc( # pylint: disable=redefined-outer-name
194194
return postproc()
195195
if postproc is not None:
196196
raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}")
197-
if target.kind.name == "llvm":
197+
if target.kind.name in ["llvm", "hexagon"]:
198198
return _DefaultLLVM.postprocs()
199199
if target.kind.name in ["cuda", "rocm", "vulkan"]:
200200
return _DefaultCUDA.postprocs()
@@ -212,7 +212,7 @@ def mutator_probs( # pylint: disable=redefined-outer-name
212212
raise TypeError(
213213
f"Expected `mutator_probs` to be None or callable, but gets: {mutator_probs}"
214214
)
215-
if target.kind.name == "llvm":
215+
if target.kind.name in ["llvm", "hexagon"]:
216216
return _DefaultLLVM.mutator_probs()
217217
if target.kind.name in ["cuda", "rocm", "vulkan"]:
218218
return _DefaultCUDA.mutator_probs()

python/tvm/target/target.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,8 @@ def hexagon(cpu_ver="v66", **kwargs):
636636
Whether to use QFloat HVX instructions.
637637
use_ieee_fp : bool (default: False)
638638
Whether to use IEEE HVX instructions
639+
num_cores : int (default: 4)
640+
The number of HVX threads. This attribute is required by meta scheduler.
639641
640642
Note: Floating point support in HVX requires LLVM 14+.
641643
"""
@@ -740,6 +742,9 @@ def create_llvm_options(cpu_ver, config): # pylint: disable=unused-argument
740742

741743
args_list = target_str.split() + llvm_str.split()
742744

745+
num_cores = config["num_cores"] if "num_cores" in kwargs else 4
746+
args_list.append("--num-cores=%d" % num_cores)
747+
743748
return Target(" ".join(["hexagon"] + args_list))
744749

745750

python/tvm/tir/tensor_intrin/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@
1616
# under the License.
1717
# pylint: disable=unused-import
1818
"""Intrinsics for tensorization."""
19-
from . import arm_cpu, cuda, rocm, x86
19+
from . import arm_cpu, cuda, rocm, x86, hexagon
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
# pylint: disable=invalid-name,missing-function-docstring
18+
"""Intrinsics for Hexagon tensorization."""
19+
from tvm.script import tir as T
20+
from .. import TensorIntrin
21+
22+
23+
@T.prim_func
24+
def dot_product_32x4_u8u8i32_desc(
25+
A: T.Buffer((4,), "uint8", offset_factor=1),
26+
B: T.Buffer((32, 4), "uint8", offset_factor=1),
27+
C: T.Buffer((32,), "int32", offset_factor=1),
28+
) -> None:
29+
with T.block("root"):
30+
T.reads(C[0:32], A[0:4], B[0:32, 0:4])
31+
T.writes(C[0:32])
32+
for i in T.serial(0, 32):
33+
with T.init():
34+
C[i] = T.int32(0)
35+
for k in T.serial(0, 4):
36+
with T.block("update"):
37+
vi, vk = T.axis.remap("SR", [i, k])
38+
C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
39+
40+
41+
@T.prim_func
42+
def dot_product_32x4_u8u8i32_vrmpy(
43+
A: T.Buffer((4,), "uint8", offset_factor=1),
44+
B: T.Buffer((32, 4), "uint8", offset_factor=1),
45+
C: T.Buffer((32,), "int32", offset_factor=1),
46+
) -> None:
47+
with T.block("root"):
48+
T.reads(C[0:32], A[0:4], B[0:32, 0:4])
49+
T.writes(C[0:32])
50+
51+
A_u8x4 = A.vload([0], "uint8x4")
52+
A_i32 = T.reinterpret(A_u8x4, dtype="int32")
53+
54+
B_i8x128 = B.vload([0, 0], dtype="uint8x128")
55+
B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
56+
57+
C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
58+
T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
59+
T.uint32(3),
60+
C[T.ramp(T.int32(0), 1, 32)],
61+
B_i32x32,
62+
A_i32,
63+
dtype="int32x32",
64+
)
65+
66+
67+
VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
68+
69+
TensorIntrin.register(
70+
VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
71+
)

src/target/target_kind.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
417417
.add_attr_option<String>("mcpu")
418418
.add_attr_option<String>("mtriple")
419419
.add_attr_option<Array<String>>("llvm-options")
420+
.add_attr_option<Integer>("num-cores")
420421
.set_default_keys({"hexagon"});
421422

422423
TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);

0 commit comments

Comments
 (0)