@@ -116,6 +116,7 @@ class llm_task {
116116 int delay_audio_frame_ = 10 ;
117117 float silence_ms_accum_ = 0 .0f ;
118118 float silence_timeout = 1000 .0f ;
119+ std::string pending_text_;
119120
120121 buffer_t *pcmdata;
121122 std::function<void (void )> pause;
@@ -554,146 +555,98 @@ class llm_task {
554555
555556 void sys_pcm_on_data_onnx (const std::string &raw)
556557 {
557- if (delay_audio_frame_ == 0 ) {
558- buffer_write_char (pcmdata, raw.data (), raw.length ());
559- buffer_position_set (pcmdata, 0 );
560-
561- std::vector<float > floatSamples;
562- int16_t audio_val;
563- while (buffer_read_i16 (pcmdata, &audio_val, 1 )) {
564- float normalizedSample = static_cast <float >(audio_val) / INT16_MAX;
565- floatSamples.push_back (normalizedSample);
566- }
558+ if (raw.size () >= sizeof (int16_t )) {
559+ const int16_t *pcm16 = reinterpret_cast <const int16_t *>(raw.data ());
560+ size_t n16 = raw.size () / sizeof (int16_t );
561+ PushPreRollPcm (pcm16, n16);
562+ }
567563
568- buffer_resize (pcmdata, 0 );
569- int32_t window_size = vad_config_.silero_vad .window_size ;
570- int32_t i = 0 ;
571- std::string final_text;
564+ static int count = 0 ;
565+ if (count < delay_audio_frame_) {
566+ buffer_write_char (pcmdata, raw.data (), raw.length ());
567+ count++;
568+ return ;
569+ }
572570
573- while (i < floatSamples.size ()) {
574- if (i + window_size <= floatSamples.size ()) {
575- vad_->AcceptWaveform (floatSamples.data () + i, window_size);
576- } else {
577- vad_->Flush ();
578- }
579- i += window_size;
571+ buffer_write_char (pcmdata, raw.data (), raw.length ());
572+ buffer_position_set (pcmdata, 0 );
580573
581- while (!vad_->Empty ()) {
582- const auto &segment = vad_->Front ();
583- float duration = segment.samples .size () / 16000 .f ;
584- float start_time = segment.start / 16000 .f ;
585- float end_time = start_time + duration;
574+ std::vector<float > floatSamples;
575+ floatSamples.reserve ((delay_audio_frame_ + 1 ) * kFrameSamples );
586576
587- if (duration < 0 . 1f ) {
588- vad_-> Pop ();
589- continue ;
590- }
577+ int16_t audio_val;
578+ while ( buffer_read_i16 (pcmdata, &audio_val, 1 )) {
579+ floatSamples. push_back ( static_cast < float >(audio_val) / 32768 . 0f ) ;
580+ }
591581
592- if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream ();
582+ buffer_resize (pcmdata, 0 );
583+ count = 0 ;
593584
594- offline_stream_->AcceptWaveform (onnx_asr_config_.feat_config .sampling_rate , segment.samples .data (),
595- segment.samples .size ());
585+ vad_->AcceptWaveform (floatSamples.data (), floatSamples.size ());
586+ bool detected = vad_->IsSpeechDetected ();
587+ bool speech_start = (!prev_vad_detected_ && detected);
588+ prev_vad_detected_ = detected;
596589
597- onnx_recognizer_-> DecodeStream (offline_stream_. get ());
598- const auto &result = offline_stream_-> GetResult ();
590+ while (!vad_-> Empty ()) {
591+ const auto &segment = vad_-> Front ();
599592
600- final_text += result.text ;
593+ if (!offline_stream_) {
594+ offline_stream_ = onnx_recognizer_->CreateStream ();
595+ }
601596
602- vad_->Pop ();
603- offline_stream_.reset ();
597+ if (speech_start && !pre_roll_pcm_.empty ()) {
598+ std::vector<float > pre ;
599+ pre .reserve (pre_roll_pcm_.size ());
600+ for (int16_t s : pre_roll_pcm_) {
601+ pre .push_back (static_cast <float >(s) / 32768 .0f );
604602 }
605- }
606603
607- if (out_callback_) {
608- out_callback_ (final_text, true );
609- }
610- } else {
611- if (raw.size () >= sizeof (int16_t )) {
612- const int16_t *pcm16 = reinterpret_cast <const int16_t *>(raw.data ());
613- size_t n16 = raw.size () / sizeof (int16_t );
614- PushPreRollPcm (pcm16, n16);
615- }
604+ std::vector<float > merged;
605+ merged.reserve (pre .size () + segment.samples .size ());
606+ merged.insert (merged.end (), pre .begin (), pre .end ());
607+ merged.insert (merged.end (), segment.samples .begin (), segment.samples .end ());
616608
617- static int count = 0 ;
618- if (count < delay_audio_frame_) {
619- buffer_write_char (pcmdata, raw. data (), raw. length ()) ;
620- count++;
621- return ;
609+ offline_stream_-> AcceptWaveform ( kSampleRate , merged. data (), merged. size ()) ;
610+ pre_roll_pcm_. clear ();
611+ speech_start = false ;
612+ } else {
613+ offline_stream_-> AcceptWaveform ( kSampleRate , segment. samples . data (), segment. samples . size ()) ;
622614 }
623615
624- buffer_write_char (pcmdata, raw.data (), raw.length ());
625- buffer_position_set (pcmdata, 0 );
626-
627- std::vector<float > floatSamples;
628- floatSamples.reserve ((delay_audio_frame_ + 1 ) * kFrameSamples );
616+ onnx_recognizer_->DecodeStream (offline_stream_.get ());
617+ const auto &result = offline_stream_->GetResult ();
629618
630- int16_t audio_val;
631- while (buffer_read_i16 (pcmdata, &audio_val, 1 )) {
632- floatSamples.push_back (static_cast <float >(audio_val) / 32768 .0f );
619+ if (!result.text .empty () && out_callback_) {
620+ out_callback_ (result.text , false );
633621 }
634622
635- buffer_resize (pcmdata, 0 );
636- count = 0 ;
637-
638- vad_->AcceptWaveform (floatSamples.data (), floatSamples.size ());
639-
640- bool detected = vad_->IsSpeechDetected ();
641- bool speech_start = (!prev_vad_detected_ && detected);
642- prev_vad_detected_ = detected;
643-
644- while (!vad_->Empty ()) {
645- const auto &segment = vad_->Front ();
646-
647- if (!offline_stream_) {
648- offline_stream_ = onnx_recognizer_->CreateStream ();
649- }
650-
651- if (speech_start && !pre_roll_pcm_.empty ()) {
652- std::vector<float > pre ;
653- pre .reserve (pre_roll_pcm_.size ());
654- for (int16_t s : pre_roll_pcm_) {
655- pre .push_back (static_cast <float >(s) / 32768 .0f );
656- }
657-
658- std::vector<float > merged;
659- merged.reserve (pre .size () + segment.samples .size ());
660- merged.insert (merged.end (), pre .begin (), pre .end ());
661- merged.insert (merged.end (), segment.samples .begin (), segment.samples .end ());
662-
663- offline_stream_->AcceptWaveform (kSampleRate , merged.data (), merged.size ());
664-
665- pre_roll_pcm_.clear ();
666- speech_start = false ;
667- } else {
668- offline_stream_->AcceptWaveform (kSampleRate , segment.samples .data (), segment.samples .size ());
669- }
670-
671- onnx_recognizer_->DecodeStream (offline_stream_.get ());
672-
673- const auto &result = offline_stream_->GetResult ();
674- if (!result.text .empty () && out_callback_) {
675- out_callback_ (result.text , true );
676- }
623+ if (!result.text .empty ()) {
624+ if (!pending_text_.empty ()) pending_text_ += " " ;
625+ pending_text_ += result.text ;
626+ }
677627
678- vad_->Pop ();
628+ vad_->Pop ();
629+ offline_stream_.reset ();
630+ }
679631
680- offline_stream_.reset ();
632+ {
633+ float chunk_ms = (delay_audio_frame_ + 1 ) * 10 .0f ;
634+ if (detected) {
635+ silence_ms_accum_ = 0 .0f ;
636+ } else {
637+ silence_ms_accum_ += chunk_ms;
681638 }
682639
683- {
684- float chunk_ms = (delay_audio_frame_ + 1 ) * 10 .0f ;
685- if (detected) {
686- silence_ms_accum_ = 0 .0f ;
687- } else {
688- silence_ms_accum_ += chunk_ms;
640+ if (silence_ms_accum_ >= silence_timeout) {
641+ if (!pending_text_.empty () && out_callback_) {
642+ out_callback_ (pending_text_, true );
689643 }
644+ pending_text_.clear ();
690645
691- if (silence_ms_accum_ >= silence_timeout) {
692- if (ensleep_) {
693- if (pause) pause ();
694- }
695- silence_ms_accum_ = 0 .0f ;
646+ if (ensleep_) {
647+ if (pause) pause ();
696648 }
649+ silence_ms_accum_ = 0 .0f ;
697650 }
698651 }
699652 }
0 commit comments