Skip to content

Commit c89412e

Browse files
author
LittleMouse
committed
[update] update llm-asr llm-audio llm-kws llm-melotts & add AI_Pyramid_Demo file
1 parent d3ad6eb commit c89412e

File tree

11 files changed

+634
-278
lines changed

11 files changed

+634
-278
lines changed

projects/llm_framework/main_asr/SConstruct

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ LDFLAGS += [
4040

4141
STATIC_FILES += Glob('mode_*.json')
4242

43-
env['COMPONENTS'].append({'target':'llm_asr-1.8',
43+
env['COMPONENTS'].append({'target':'llm_asr-1.9',
4444
'SRCS':SRCS,
4545
'INCLUDE':INCLUDE,
4646
'PRIVATE_INCLUDE':PRIVATE_INCLUDE,

projects/llm_framework/main_asr/src/main.cpp

Lines changed: 69 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ class llm_task {
116116
int delay_audio_frame_ = 10;
117117
float silence_ms_accum_ = 0.0f;
118118
float silence_timeout = 1000.0f;
119+
std::string pending_text_;
119120

120121
buffer_t *pcmdata;
121122
std::function<void(void)> pause;
@@ -554,146 +555,98 @@ class llm_task {
554555

555556
void sys_pcm_on_data_onnx(const std::string &raw)
556557
{
557-
if (delay_audio_frame_ == 0) {
558-
buffer_write_char(pcmdata, raw.data(), raw.length());
559-
buffer_position_set(pcmdata, 0);
560-
561-
std::vector<float> floatSamples;
562-
int16_t audio_val;
563-
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
564-
float normalizedSample = static_cast<float>(audio_val) / INT16_MAX;
565-
floatSamples.push_back(normalizedSample);
566-
}
558+
if (raw.size() >= sizeof(int16_t)) {
559+
const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
560+
size_t n16 = raw.size() / sizeof(int16_t);
561+
PushPreRollPcm(pcm16, n16);
562+
}
567563

568-
buffer_resize(pcmdata, 0);
569-
int32_t window_size = vad_config_.silero_vad.window_size;
570-
int32_t i = 0;
571-
std::string final_text;
564+
static int count = 0;
565+
if (count < delay_audio_frame_) {
566+
buffer_write_char(pcmdata, raw.data(), raw.length());
567+
count++;
568+
return;
569+
}
572570

573-
while (i < floatSamples.size()) {
574-
if (i + window_size <= floatSamples.size()) {
575-
vad_->AcceptWaveform(floatSamples.data() + i, window_size);
576-
} else {
577-
vad_->Flush();
578-
}
579-
i += window_size;
571+
buffer_write_char(pcmdata, raw.data(), raw.length());
572+
buffer_position_set(pcmdata, 0);
580573

581-
while (!vad_->Empty()) {
582-
const auto &segment = vad_->Front();
583-
float duration = segment.samples.size() / 16000.f;
584-
float start_time = segment.start / 16000.f;
585-
float end_time = start_time + duration;
574+
std::vector<float> floatSamples;
575+
floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);
586576

587-
if (duration < 0.1f) {
588-
vad_->Pop();
589-
continue;
590-
}
577+
int16_t audio_val;
578+
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
579+
floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
580+
}
591581

592-
if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream();
582+
buffer_resize(pcmdata, 0);
583+
count = 0;
593584

594-
offline_stream_->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
595-
segment.samples.size());
585+
vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
586+
bool detected = vad_->IsSpeechDetected();
587+
bool speech_start = (!prev_vad_detected_ && detected);
588+
prev_vad_detected_ = detected;
596589

597-
onnx_recognizer_->DecodeStream(offline_stream_.get());
598-
const auto &result = offline_stream_->GetResult();
590+
while (!vad_->Empty()) {
591+
const auto &segment = vad_->Front();
599592

600-
final_text += result.text;
593+
if (!offline_stream_) {
594+
offline_stream_ = onnx_recognizer_->CreateStream();
595+
}
601596

602-
vad_->Pop();
603-
offline_stream_.reset();
597+
if (speech_start && !pre_roll_pcm_.empty()) {
598+
std::vector<float> pre;
599+
pre.reserve(pre_roll_pcm_.size());
600+
for (int16_t s : pre_roll_pcm_) {
601+
pre.push_back(static_cast<float>(s) / 32768.0f);
604602
}
605-
}
606603

607-
if (out_callback_) {
608-
out_callback_(final_text, true);
609-
}
610-
} else {
611-
if (raw.size() >= sizeof(int16_t)) {
612-
const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
613-
size_t n16 = raw.size() / sizeof(int16_t);
614-
PushPreRollPcm(pcm16, n16);
615-
}
604+
std::vector<float> merged;
605+
merged.reserve(pre.size() + segment.samples.size());
606+
merged.insert(merged.end(), pre.begin(), pre.end());
607+
merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());
616608

617-
static int count = 0;
618-
if (count < delay_audio_frame_) {
619-
buffer_write_char(pcmdata, raw.data(), raw.length());
620-
count++;
621-
return;
609+
offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());
610+
pre_roll_pcm_.clear();
611+
speech_start = false;
612+
} else {
613+
offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
622614
}
623615

624-
buffer_write_char(pcmdata, raw.data(), raw.length());
625-
buffer_position_set(pcmdata, 0);
626-
627-
std::vector<float> floatSamples;
628-
floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);
616+
onnx_recognizer_->DecodeStream(offline_stream_.get());
617+
const auto &result = offline_stream_->GetResult();
629618

630-
int16_t audio_val;
631-
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
632-
floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
619+
if (!result.text.empty() && out_callback_) {
620+
out_callback_(result.text, false);
633621
}
634622

635-
buffer_resize(pcmdata, 0);
636-
count = 0;
637-
638-
vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
639-
640-
bool detected = vad_->IsSpeechDetected();
641-
bool speech_start = (!prev_vad_detected_ && detected);
642-
prev_vad_detected_ = detected;
643-
644-
while (!vad_->Empty()) {
645-
const auto &segment = vad_->Front();
646-
647-
if (!offline_stream_) {
648-
offline_stream_ = onnx_recognizer_->CreateStream();
649-
}
650-
651-
if (speech_start && !pre_roll_pcm_.empty()) {
652-
std::vector<float> pre;
653-
pre.reserve(pre_roll_pcm_.size());
654-
for (int16_t s : pre_roll_pcm_) {
655-
pre.push_back(static_cast<float>(s) / 32768.0f);
656-
}
657-
658-
std::vector<float> merged;
659-
merged.reserve(pre.size() + segment.samples.size());
660-
merged.insert(merged.end(), pre.begin(), pre.end());
661-
merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());
662-
663-
offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());
664-
665-
pre_roll_pcm_.clear();
666-
speech_start = false;
667-
} else {
668-
offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
669-
}
670-
671-
onnx_recognizer_->DecodeStream(offline_stream_.get());
672-
673-
const auto &result = offline_stream_->GetResult();
674-
if (!result.text.empty() && out_callback_) {
675-
out_callback_(result.text, true);
676-
}
623+
if (!result.text.empty()) {
624+
if (!pending_text_.empty()) pending_text_ += " ";
625+
pending_text_ += result.text;
626+
}
677627

678-
vad_->Pop();
628+
vad_->Pop();
629+
offline_stream_.reset();
630+
}
679631

680-
offline_stream_.reset();
632+
{
633+
float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;
634+
if (detected) {
635+
silence_ms_accum_ = 0.0f;
636+
} else {
637+
silence_ms_accum_ += chunk_ms;
681638
}
682639

683-
{
684-
float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;
685-
if (detected) {
686-
silence_ms_accum_ = 0.0f;
687-
} else {
688-
silence_ms_accum_ += chunk_ms;
640+
if (silence_ms_accum_ >= silence_timeout) {
641+
if (!pending_text_.empty() && out_callback_) {
642+
out_callback_(pending_text_, true);
689643
}
644+
pending_text_.clear();
690645

691-
if (silence_ms_accum_ >= silence_timeout) {
692-
if (ensleep_) {
693-
if (pause) pause();
694-
}
695-
silence_ms_accum_ = 0.0f;
646+
if (ensleep_) {
647+
if (pause) pause();
696648
}
649+
silence_ms_accum_ = 0.0f;
697650
}
698651
}
699652
}

projects/llm_framework/main_audio/SConstruct

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ if 'CONFIG_AX_620E_MSP_ENABLED' in os.environ:
3232
STATIC_FILES += [AFile('audio.json'), AFile('audio_kit.json'), AFile('audio_pyramid.json')]
3333
STATIC_FILES += Glob('mode_*.json')
3434

35-
env['COMPONENTS'].append({'target':'llm_audio-1.8',
35+
env['COMPONENTS'].append({'target':'llm_audio-1.9',
3636
'SRCS':SRCS,
3737
'INCLUDE':INCLUDE,
3838
'PRIVATE_INCLUDE':PRIVATE_INCLUDE,

0 commit comments

Comments (0)