Commit f7de2da
Add Voxtral runner (#13853)

### Summary

Utilize `multimodal_runner.h` to run [Voxtral exported from Optimum Executorch](huggingface/optimum-executorch#126). The runner takes in a `.bin` file of a preprocessed audio recording and feeds it to a C++ multimodal runner.

Example output:

```
This audio is a casual and somewhat silly conversation between two speakers who seem to be discussing their tattoos. The speakers are engaging in a game where they ask each other what their tattoos say, but both repeatedly say "sweet" instead of the actual words. The speakers are aware of their mistake and try to correct it by asking the other what their tattoo says, but they still end up saying "sweet" again. The conversation ends with a speaker telling the other that their tattoo says "
PyTorchObserver {"prompt_tokens":1138,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756159197436,"inference_end_ms":1756159222710,"prompt_eval_end_ms":1756159209605,"first_token_ms":1756159209605,"aggregate_sampling_time_ms":96,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
I 00:00:33.116291 executorch:stats.h:104] Prompt Tokens: 1138 Generated Tokens: 99
I 00:00:33.116304 executorch:stats.h:110] Model Load Time: 0.000000 (seconds)
I 00:00:33.116312 executorch:stats.h:117] Total inference time: 25.274000 (seconds) Rate: 3.917069 (tokens/second)
I 00:00:33.116320 executorch:stats.h:127] Prompt evaluation: 12.169000 (seconds) Rate: 93.516312 (tokens/second)
I 00:00:33.116327 executorch:stats.h:136] Generated 99 tokens: 13.105000 (seconds) Rate: 7.554369 (tokens/second)
I 00:00:33.116338 executorch:stats.h:147] Time to first generated token: 12.169000 (seconds)
I 00:00:33.116344 executorch:stats.h:153] Sampling time over 1237 tokens: 0.096000 (seconds)
```

### Test plan

Build and run:

```
# Build and install ExecuTorch
cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_ENABLE_LOGGING=ON && cmake --build cmake-out -j16 --target install --config Release

# Build and install Voxtral runner
cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -Bcmake-out/examples/models/voxtral examples/models/voxtral && cmake --build cmake-out/examples/models/voxtral -j16 --config Release

# Run Voxtral runner
./cmake-out/examples/models/voxtral/voxtral_runner --model_path ~/models/voxtral/voxtral_q8da4w_edm_qe4w_d_split_metadata_unsqueeze.pte --tokenizer_path ~/hf/models--mistralai--Voxtral-Mini-3B-2507/snapshots/3060fe34b35ba5d44202ce9ff3c097642914f8f3/tekken.json --prompt "What can you tell me about this audio?" --audio_path ~/models/voxtral/input_features.bin
```

Pull Request resolved: #13663
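
For reference, the `.bin` audio input is just the raw float32 bytes of the preprocessed mel-spectrogram tensor; the runner derives the batch size from the file size using fixed `n_bins = 128` and `n_frames = 3000` constants. A minimal sketch of writing and sanity-checking such a file, following the save recipe quoted in the runner's own doc comment (the `features` tensor here is a random placeholder; real features come from Voxtral's audio preprocessing):

```python
import math
import os

import torch

# Placeholder standing in for real preprocessed features; the runner expects
# float32 values laid out as [batch, n_bins=128, n_frames=3000].
features = torch.randn(1, 128, 3000, dtype=torch.float32)

# The runner reads the file as a flat float32 buffer, so raw bytes suffice.
with open("input_features.bin", "wb") as f:
    f.write(features.numpy().tobytes())

# Mirror the runner's batch-size computation: total float count divided by
# one 128x3000 chunk, rounded up.
n_floats = os.path.getsize("input_features.bin") // 4
print(math.ceil(n_floats / (128 * 3000)))  # -> 1
```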
1 parent a99b64f commit f7de2da

File tree: 4 files changed, +374 -10 lines
examples/models/voxtral/CMakeLists.txt (new file: 99 additions, 0 deletions)

```cmake
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for voxtral runner.
#
cmake_minimum_required(VERSION 3.24)
project(voxtral)

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
  set(CMAKE_TOOLCHAIN_IOS ON)
else()
  set(CMAKE_TOOLCHAIN_IOS OFF)
endif()

# Let files say "include <executorch/path/to/header.h>"
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# gflags is built in ExecuTorch's third-party tree, so point find_package() at
# its config directory in the build tree.
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Find `executorch` libraries, same as for gflags
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

set(LINK_LIBS executorch gflags)
set(link_libraries ${LINK_LIBS})
set(_srcs multimodal.cpp)

list(
  APPEND
  link_libraries
  optimized_native_cpu_ops_lib
  quantized_ops_lib
  custom_ops
  cpublas
  eigen_blas
)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
executorch_target_link_options_shared_lib(quantized_ops_lib)
executorch_target_link_options_shared_lib(custom_ops)

# XNNPACK
if(TARGET xnnpack_backend)
  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
  if(TARGET kleidiai)
    list(APPEND xnnpack_backend_libs kleidiai)
  endif()
  list(APPEND link_libraries ${xnnpack_backend_libs})
  executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# Add LLM runner and extension module
if(NOT TARGET extension_llm_runner)
  message(
    FATAL_ERROR
      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
  )
endif()

# Needed for cpuinfo, which uses the Android-specific log library
if(ANDROID)
  list(APPEND link_libraries log)
endif()

# Add the required ExecuTorch extensions for the multimodal LLM runner
list(
  APPEND
  link_libraries
  extension_llm_runner
  extension_module
  extension_data_loader
  extension_tensor
  extension_flat_tensor
)

# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(voxtral_runner ${_srcs})
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(voxtral_runner)
  if(NOT APPLE)
    target_link_options(voxtral_runner PRIVATE "LINKER:-s")
  endif()
endif()

target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
```
examples/models/voxtral/multimodal.cpp (new file: 217 additions, 0 deletions)

```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstring>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <gflags/gflags.h>

#include <executorch/extension/llm/runner/audio.h>
#include <executorch/extension/llm/runner/image.h>
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_input.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/log.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

DEFINE_string(
    model_path,
    "multimodal.pte",
    "Model serialized in flatbuffer format.");

DEFINE_string(tokenizer_path, "tekken.json", "Path to the tokenizer file.");

DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");

DEFINE_string(audio_path, "", "Path to input audio file.");

DEFINE_double(
    temperature,
    0.8f,
    "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");

DEFINE_int32(
    cpu_threads,
    -1,
    "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

DEFINE_bool(warmup, false, "Whether to run a warmup run.");

namespace {

using ::executorch::extension::llm::Image;
using ::executorch::extension::llm::make_image_input;
using ::executorch::extension::llm::make_text_input;
using ::executorch::extension::llm::MultimodalInput;

bool ends_with(const std::string& str, const std::string& suffix) {
  return str.size() >= suffix.size() &&
      str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

/**
 * @brief Loads preprocessed audio data from a binary file
 *
 * Reads mel spectrogram features that have been pre-computed and saved as a
 * binary file. The audio data is expected to be stored as float values in
 * binary format, typically saved using:
 *   with open("tensor.bin", "wb") as f:
 *       f.write(t.numpy().tobytes())
 *
 * @param audio_path Path to the binary audio file (.bin)
 * @return MultimodalInput containing the loaded audio data
 */
MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
  if (!f.is_open()) {
    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
    throw std::runtime_error("Failed to open audio file");
  }
  int32_t n_bins = 128;
  int32_t n_frames = 3000;
  std::size_t n_floats =
      f.tellg() / sizeof(float); // Number of floats in the audio file.
  f.seekg(0, std::ios::beg);
  // Batch in increments of n_frames, rounding up. Divide in floating point so
  // that a partial final chunk still gets its own batch entry.
  int32_t batch_size = static_cast<int32_t>(
      std::ceil(static_cast<double>(n_floats) / (n_bins * n_frames)));
  std::vector<float> audio_data(batch_size * n_bins * n_frames);
  f.read(
      reinterpret_cast<char*>(audio_data.data()),
      audio_data.size() * sizeof(float));

  ET_LOG(Info, "audio_data len = %zu", audio_data.size());

  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
  audio->batch_size = batch_size;
  audio->n_bins = n_bins;
  audio->n_frames = n_frames;
  audio->data.resize(audio_data.size() * sizeof(float));
  std::memcpy(
      audio->data.data(), audio_data.data(), audio_data.size() * sizeof(float));
  return ::executorch::extension::llm::make_audio_input(std::move(*audio));
}

/**
 * @brief Processes audio files for multimodal input
 *
 * Dispatches audio file processing based on file extension:
 * - .bin files: Loads preprocessed mel spectrogram features directly
 * - .wav/.mp3 files: Currently unsupported, throws runtime_error
 *
 * This function provides an interface for different audio input formats
 * and can be extended to support raw audio processing in the future.
 *
 * @param audio_path Path to the audio file
 * @return MultimodalInput containing the processed audio data
 * @throws std::runtime_error if file format is unsupported or processing fails
 */
MultimodalInput processAudioFile(const std::string& audio_path) {
  if (ends_with(audio_path, ".bin")) {
    // Current behavior - load preprocessed audio stored as a binary file.
    return loadPreprocessedAudio(audio_path);
  } else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
    // Raw audio files are unsupported for now.
    ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
    throw std::runtime_error("Raw audio file processing not supported");
  } else {
    ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
    throw std::runtime_error("Unsupported audio file format");
  }
}

} // namespace

int32_t main(int32_t argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  const char* model_path = FLAGS_model_path.c_str();
  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
  const char* prompt = FLAGS_prompt.c_str();
  const char* audio_path = FLAGS_audio_path.c_str();
  float temperature = FLAGS_temperature;
  int32_t cpu_threads = FLAGS_cpu_threads;
  bool warmup = FLAGS_warmup;

#if defined(ET_USE_THREADPOOL)
  uint32_t num_performant_cores = cpu_threads == -1
      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
      : static_cast<uint32_t>(cpu_threads);
  ET_LOG(
      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
  if (num_performant_cores > 0) {
    ::executorch::extension::threadpool::get_threadpool()
        ->_unsafe_reset_threadpool(num_performant_cores);
  }
#endif

  // Load tokenizer
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
  if (tokenizer == nullptr) {
    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
    return 1;
  }

  // Create multimodal runner
  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
      ::executorch::extension::llm::create_multimodal_runner(
          model_path, std::move(tokenizer));
  if (runner == nullptr) {
    ET_LOG(Error, "Failed to create multimodal runner");
    return 1;
  }

  // Load runner
  auto load_error = runner->load();
  if (load_error != ::executorch::runtime::Error::Ok) {
    ET_LOG(Error, "Failed to load multimodal runner");
    return 1;
  }

  // Prepare inputs
  std::vector<MultimodalInput> inputs;

  // 1. Add the BOS and audio-modality start tokens.
  inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));

  // 2. Add audio input
  inputs.emplace_back(processAudioFile(audio_path));

  // 3. Add text input (the actual user-submitted prompt)
  inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));

  ::executorch::extension::llm::GenerationConfig config;
  config.max_new_tokens = 100;
  config.temperature = temperature;

  // Run warmup if requested
  if (warmup) {
    ET_LOG(Info, "Running warmup...");
    auto warmup_error = runner->generate(inputs, config);
    if (warmup_error != ::executorch::runtime::Error::Ok) {
      ET_LOG(Error, "Failed to run warmup");
      return 1;
    }
    runner->reset();
  }

  // Generate
  ET_LOG(Info, "Starting generation...");
  auto error = runner->generate(inputs, config);
  if (error != ::executorch::runtime::Error::Ok) {
    ET_LOG(Error, "Failed to generate with multimodal runner");
    return 1;
  }

  printf("\n");
  return 0;
}
```
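
The runner intentionally rejects raw `.wav`/`.mp3` input, so the mel-spectrogram features have to be produced offline. The commit itself does not include that preprocessing; the sketch below is one plausible way to approximate it, assuming a Whisper-style front end with 128 mel bins and 30-second (3000-frame) windows to match the runner's constants. The authoritative pipeline is whatever the Optimum Executorch export uses (see the PR linked in the summary).

```python
import numpy as np
import soundfile as sf
from transformers import WhisperFeatureExtractor

# Assumption: Voxtral's audio tower consumes Whisper-style log-mel features
# (128 bins, 16 kHz, 30 s windows -> 3000 frames). Verify against the
# optimum-executorch preprocessing before relying on this.
extractor = WhisperFeatureExtractor(feature_size=128, sampling_rate=16000)

# Expects mono 16 kHz audio; downmix and resample beforehand if needed.
audio, sr = sf.read("recording.wav", dtype="float32")
assert sr == 16000, "resample to 16 kHz first"

features = extractor(audio, sampling_rate=sr, return_tensors="np").input_features
# Shape [batch, 128, 3000]; dump raw float32 bytes in the layout that
# loadPreprocessedAudio() expects.
features.astype(np.float32).tofile("input_features.bin")
```

Audio longer than 30 seconds would need to be chunked into multiple 3000-frame segments stacked along the batch dimension, which is exactly the layout `loadPreprocessedAudio()` reconstructs from the file size.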
