Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
Removed --allow-padding and add --max_nospeech_skip option (Resolve #13)
Browse files Browse the repository at this point in the history
  • Loading branch information
shirayu committed Oct 15, 2022
1 parent 20b8970 commit 75147ca
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 24 deletions.
17 changes: 4 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,18 @@ whispering --language en --model tiny
- ``--no-progress`` disables the progress message
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
- ``--debug`` outputs logs for debug
- ``--vad`` sets VAD (Voice Activity Detection) threshold. 0 disables VAD and forces whisper to analyze non-voice activity sound period
- ``--vad`` sets the VAD (Voice Activity Detection) threshold. The default is ``0.5``. Setting it to ``0`` disables VAD and forces Whisper to analyze periods without voice activity as well
- ``--output`` sets output file (Default: Standard output)

### Parse interval

By default, whispering performs VAD every 3.75 seconds.
This interval is determined by the value of ``-n`` and its default is ``20``.
When an interval is predicted as "silence", it will not be passed to whisper.
If you want to disable VAD, please use ``--no-vad`` option.
If you want to disable VAD, set the VAD threshold to 0 by adding ``--vad 0``.

By default, Whisper does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
This is because Whisper is trained to make predictions for 30-second intervals.
Nevertheless, if you want to force Whisper to perform analysis even if a segment is less than 30 seconds, please use ``--allow-padding`` option like this.

```bash
whispering --language en --model tiny -n 20 --allow-padding
```

This forces Whisper to analyze every 3.75 seconds speech segment.
Using ``--allow-padding`` may sacrifice the accuracy, while you can get quick response.
The smaller value of ``-n`` with ``--allow-padding`` is, the worse the accuracy becomes.
However, if silent segments appear 16 times in a row (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed anyway.

## Example of web socket

Expand All @@ -81,7 +72,7 @@ whispering --language en --model tiny --host 0.0.0.0 --port 8000
whispering --host ADDRESS_OF_HOST --port 8000 --mode client
```

You can set ``-n``, ``--allow-padding`` and other options.
You can set ``-n`` and other options.

## License

Expand Down
1 change: 0 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ def test_options():
"--mode server --mic 1",
"--mode server --beam_size 3",
"--mode server --temperature 0",
"--mode server --allow-padding",
"--mode server --num_block 3",
"--mode mic --host 0.0.0.0",
"--mode mic --port 8000",
Expand Down
13 changes: 7 additions & 6 deletions whispering/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,18 @@ def get_opts() -> argparse.Namespace:
action="append",
default=[],
)
group_ctx.add_argument(
"--allow-padding",
action="store_true",
)
group_ctx.add_argument(
"--vad",
type=float,
help="Threshold of VAD",
default=0.5,
)
group_ctx.add_argument(
"--max_nospeech_skip",
type=int,
help="Maximum number of skip to analyze because of nospeech",
default=16,
)

group_misc = parser.add_argument_group("Other options")
group_misc.add_argument(
Expand Down Expand Up @@ -224,7 +226,7 @@ def get_context(*, opts) -> Context:
protocol_version=CURRENT_PROTOCOL_VERSION,
beam_size=opts.beam_size,
temperatures=opts.temperature,
allow_padding=opts.allow_padding,
max_nospeech_skip=opts.max_nospeech_skip,
vad_threshold=opts.vad,
)
logger.debug(f"Context: {ctx}")
Expand All @@ -245,7 +247,6 @@ def is_valid_arg(opts) -> bool:
"mic",
"beam_size",
"temperature",
"allow_padding",
]
elif opts.mode == Mode.mic.value:
keys = [
Expand Down
3 changes: 2 additions & 1 deletion whispering/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ class Context(BaseModel, arbitrary_types_allowed=True):
timestamp: float = 0.0
buffer_tokens: List[torch.Tensor] = []
buffer_mel: Optional[torch.Tensor] = None
nosoeech_skip_count: Optional[int] = None

temperatures: List[float]
allow_padding: bool = False
patience: Optional[float] = None
compression_ratio_threshold: Optional[float] = 2.4
logprob_threshold: Optional[float] = -1.0
Expand All @@ -46,6 +46,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
compression_ratio_threshold: Optional[float] = 2.4
buffer_threshold: Optional[float] = 0.5
vad_threshold: float
max_nospeech_skip: int


class ParsedChunk(BaseModel):
Expand Down
27 changes: 24 additions & 3 deletions whispering/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def transcribe(
ctx: Context,
) -> Iterator[ParsedChunk]:
logger.debug(f"{len(audio)}")
force_padding: bool = False

if ctx.vad_threshold > 0.0:
x = [
Expand All @@ -246,7 +247,20 @@ def transcribe(
if len(x) == 0: # No speech
logger.debug("No speech")
ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel
return

if ctx.nosoeech_skip_count is not None:
ctx.nosoeech_skip_count += 1

if (
ctx.nosoeech_skip_count is None
or ctx.nosoeech_skip_count <= ctx.max_nospeech_skip
):
logger.debug(
f"nosoeech_skip_count: {ctx.nosoeech_skip_count} (<= {ctx.max_nospeech_skip})"
)
return
ctx.nosoeech_skip_count = None
force_padding = True

new_mel = log_mel_spectrogram(audio=audio)
logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
Expand All @@ -261,12 +275,15 @@ def transcribe(
seek: int = 0
while seek < mel.shape[-1]:
logger.debug(f"seek: {seek}")
if mel.shape[-1] - seek <= 0:
logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
break
if mel.shape[-1] - seek < N_FRAMES:
logger.debug(
f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
)
if ctx.allow_padding:
logger.warning("Padding is not expected while speaking")
if force_padding:
logger.debug("Padding")
else:
logger.debug("No padding")
break
Expand Down Expand Up @@ -319,9 +336,13 @@ def transcribe(
logger.debug(f"new seek={seek}, mel.shape: {mel.shape}")

if mel.shape[-1] - seek <= 0:
ctx.buffer_mel = None
ctx.nosoeech_skip_count = None
logger.debug(f"ctx.buffer_mel is None ({mel.shape}, {seek})")
return
ctx.buffer_mel = mel[:, seek:]
assert ctx.buffer_mel is not None
logger.debug(f"ctx.buffer_mel.shape: {ctx.buffer_mel.shape}")
del mel
if ctx.nosoeech_skip_count is None:
ctx.nosoeech_skip_count = 0 # start count
4 changes: 4 additions & 0 deletions whispering/vad.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

from logging import getLogger
from typing import Iterator, Optional

import numpy as np
Expand All @@ -8,6 +9,8 @@

from whispering.schema import SpeechSegment

logger = getLogger(__name__)


class VAD:
def __init__(
Expand Down Expand Up @@ -50,6 +53,7 @@ def my_ret(
torch.from_numpy(audio[start:end]),
SAMPLE_RATE,
).item()
logger.debug(f"VAD: {vad_prob} (threshold={threshold})")
if vad_prob > threshold:
if start_block_idx is None:
start_block_idx = idx
Expand Down

0 comments on commit 75147ca

Please sign in to comment.