Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
Add --frame option
Browse files Browse the repository at this point in the history
  • Loading branch information
shirayu committed Nov 8, 2022
1 parent 8b464ff commit 53970f3
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ whispering --language en --model tiny
- ``--debug`` outputs logs for debug
- ``--vad`` sets the VAD (Voice Activity Detection) threshold. The default is ``0.5``. ``0`` disables VAD and forces Whisper to analyze periods without voice activity as well. Try ``--vad 0`` if VAD prevents transcription.
- ``--output`` sets output file (Default: Standard output)
- ``--frame`` sets the minimum number of mel spectrogram frames fed to Whisper (default: ``3000``, i.e. 30 seconds)

### Parse interval

Expand All @@ -55,6 +56,7 @@ If you want to disable VAD, please make VAD threshold 0 by adding ``--vad 0``.
By default, whispering does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
This is because the original Whisper assumes that its inputs are 30-second segments.
However, if silence segments appear 16 times (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed.
You can shorten the required segment length with the ``--frame`` option (default: 3000), but doing so sacrifices accuracy because shorter segments are not the input Whisper expects.

## Example of web socket

Expand Down
7 changes: 7 additions & 0 deletions whispering/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,12 @@ def get_opts() -> argparse.Namespace:
help="Maximum number of skip to analyze because of nospeech",
default=16,
)
group_ctx.add_argument(
"--frame",
type=int,
help="The number of minimum frames of mel spectrogram input for Whisper",
default=N_FRAMES,
)

group_misc = parser.add_argument_group("Other options")
group_misc.add_argument(
Expand Down Expand Up @@ -228,6 +234,7 @@ def get_context(*, opts) -> Context:
temperatures=opts.temperature,
max_nospeech_skip=opts.max_nospeech_skip,
vad_threshold=opts.vad,
mel_frame_min_num=opts.frame,
)
logger.debug(f"Context: {ctx}")
return ctx
Expand Down
6 changes: 4 additions & 2 deletions whispering/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

import numpy as np
import torch
from pydantic import BaseModel, root_validator
from pydantic import BaseModel, Field, root_validator
from whisper.audio import N_FRAMES


class WhisperConfig(BaseModel):
Expand All @@ -24,7 +25,7 @@ def validate_model_name(cls, values):
return values


CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_002")
CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_003")


class Context(BaseModel, arbitrary_types_allowed=True):
Expand All @@ -47,6 +48,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
buffer_threshold: Optional[float] = 0.5
vad_threshold: float
max_nospeech_skip: int
mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)

data_type: str = "float32"

Expand Down
4 changes: 2 additions & 2 deletions whispering/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,9 +278,9 @@ def transcribe(
if mel.shape[-1] - seek <= 0:
logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
break
if mel.shape[-1] - seek < N_FRAMES:
if mel.shape[-1] - seek < ctx.mel_frame_min_num:
logger.debug(
f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < ctx.mel_frame_min_num ({ctx.mel_frame_min_num})"
)
if force_padding:
logger.debug("Padding")
Expand Down

0 comments on commit 53970f3

Please sign in to comment.