diff --git a/paraformer/runtime/python/model/vad/fsmnvad.py b/paraformer/runtime/python/model/vad/fsmnvad.py index 3ba05a2..cc1e32a 100644 --- a/paraformer/runtime/python/model/vad/fsmnvad.py +++ b/paraformer/runtime/python/model/vad/fsmnvad.py @@ -12,8 +12,9 @@ import numpy as np from paraformer.runtime.python.utils.logger import logger -from paraformer.runtime.python.utils.vadOrtInferRuntimeSession import \ - VadOrtInferRuntimeSession +from paraformer.runtime.python.utils.vadOrtInferRuntimeSession import ( + VadOrtInferRuntimeSession, +) class VadStateMachine(Enum): @@ -249,6 +250,7 @@ def __init__(self, config, vad_post_args: Dict[str, Any], root_dir: Path): ) self.speech_noise_thres = self.vad_opts.speech_noise_thres self.scores = None + self.scores_offset = 0 self.max_time_out = False self.decibel = [] self.data_buf_size = 0 @@ -336,10 +338,8 @@ def compute_scores(self, feats: np.ndarray) -> None: scores[0].shape[1] == feats.shape[1] ), "The shape between feats and scores does not match" - if self.scores is None: - self.scores = scores[0] # the first calculation - else: - self.scores = np.concatenate((self.scores, scores[0]), axis=1) + self.scores = scores[0] # the first calculation + self.scores_offset += self.scores.shape[1] return scores[1:] @@ -499,7 +499,8 @@ def get_frame_state(self, t: int) -> FrameState: if len(self.sil_pdf_ids) > 0: assert len(self.scores) == 1 # 只支持batch_size = 1的测试 sil_pdf_scores = [ - self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids + self.scores[0][t - self.scores_offset][sil_pdf_id] + for sil_pdf_id in self.sil_pdf_ids ] sum_score = sum(sil_pdf_scores) noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio