Add --frame option

shirayu · Nov 8, 2022 · 53970f3 · 53970f3
1 parent 8b464ff
commit 53970f3
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -44,6 +44,7 @@ whispering --language en --model tiny
 - ``--debug`` outputs logs for debug
 - ``--vad`` sets VAD (Voice Activity Detection) threshold. The default is ``0.5``. ``0`` disables VAD and forces whisper to analyze non-voice activity sound period. Try ``--vad 0`` if VAD prevents transcription.
 - ``--output`` sets output file (Default: Standard output)
+- ``--frame`` sets the minimum number of mel spectrogram frames required as input for Whisper (default: ``3000``, i.e. 30 seconds)
 
 ### Parse interval
 
@@ -55,6 +56,7 @@ If you want to disable VAD, please make VAD threshold 0 by adding ``--vad 0``.
 By default, whispering does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
 This is because the original Whisper assumes that the inputs are 30 seconds segments.
 However, if silence segments appear 16 times (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed.
+You can make the segments shorter with the ``--frame`` option (default: 3000), but this sacrifices accuracy because shorter segments are not the input that Whisper expects.
 
 ## Example of web socket
 

diff --git a/whispering/cli.py b/whispering/cli.py
@@ -156,6 +156,12 @@ def get_opts() -> argparse.Namespace:
         help="Maximum number of skip to analyze because of nospeech",
         default=16,
     )
+    group_ctx.add_argument(
+        "--frame",
+        type=int,
+        help="The number of minimum frames of mel spectrogram input for Whisper",
+        default=N_FRAMES,
+    )
 
     group_misc = parser.add_argument_group("Other options")
     group_misc.add_argument(
@@ -228,6 +234,7 @@ def get_context(*, opts) -> Context:
         temperatures=opts.temperature,
         max_nospeech_skip=opts.max_nospeech_skip,
         vad_threshold=opts.vad,
+        mel_frame_min_num=opts.frame,
     )
     logger.debug(f"Context: {ctx}")
     return ctx

diff --git a/whispering/schema.py b/whispering/schema.py
@@ -5,7 +5,8 @@
 
 import numpy as np
 import torch
-from pydantic import BaseModel, root_validator
+from pydantic import BaseModel, Field, root_validator
+from whisper.audio import N_FRAMES
 
 
 class WhisperConfig(BaseModel):
@@ -24,7 +25,7 @@ def validate_model_name(cls, values):
         return values
 
 
-CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_002")
+CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_003")
 
 
 class Context(BaseModel, arbitrary_types_allowed=True):
@@ -47,6 +48,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     buffer_threshold: Optional[float] = 0.5
     vad_threshold: float
     max_nospeech_skip: int
+    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)
 
     data_type: str = "float32"
 

diff --git a/whispering/transcriber.py b/whispering/transcriber.py
@@ -278,9 +278,9 @@ def transcribe(
             if mel.shape[-1] - seek <= 0:
                 logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
                 break
-            if mel.shape[-1] - seek < N_FRAMES:
+            if mel.shape[-1] - seek < ctx.mel_frame_min_num:
                 logger.debug(
-                    f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
+                    f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < ctx.mel_frame_min_num ({ctx.mel_frame_min_num})"
                 )
                 if force_padding:
                     logger.debug("Padding")