Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
Removed --allow-padding and add --max_nospeech_skip option (Resolve #13)
Browse files Browse the repository at this point in the history
  • Loading branch information
shirayu committed Oct 15, 2022
1 parent 20b8970 commit 75147ca
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 24 deletions.
17 changes: 4 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,18 @@ whispering --language en --model tiny
- ``--no-progress`` disables the progress message
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
- ``--debug`` outputs logs for debug
- ``--vad`` sets VAD (Voice Activity Detection) threshold. 0 disables VAD and forces whisper to analyze non-voice activity sound period
- ``--vad`` sets the VAD (Voice Activity Detection) threshold. The default is ``0.5``. Setting it to ``0`` disables VAD and forces Whisper to analyze periods without voice activity as well
- ``--output`` sets output file (Default: Standard output)

### Parse interval

By default, whispering performs VAD every 3.75 seconds.
This interval is determined by the value of ``-n`` and its default is ``20``.
When an interval is predicted as "silence", it will not be passed to whisper.
If you want to disable VAD, please use ``--no-vad`` option.
If you want to disable VAD, set the VAD threshold to 0 by adding ``--vad 0``.

By default, Whisper does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
This is because Whisper is trained to make predictions for 30-second intervals.
Nevertheless, if you want to force Whisper to perform analysis even if a segment is less than 30 seconds, please use ``--allow-padding`` option like this.

```bash
whispering --language en --model tiny -n 20 --allow-padding
```

This forces Whisper to analyze every 3.75 seconds speech segment.
Using ``--allow-padding`` may sacrifice the accuracy, while you can get quick response.
The smaller value of ``-n`` with ``--allow-padding`` is, the worse the accuracy becomes.
However, if silent segments appear 16 times in a row (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed anyway.

## Example of web socket

Expand All @@ -81,7 +72,7 @@ whispering --language en --model tiny --host 0.0.0.0 --port 8000
whispering --host ADDRESS_OF_HOST --port 8000 --mode client
```

You can set ``-n``, ``--allow-padding`` and other options.
You can set ``-n`` and other options.

## License

Expand Down
1 change: 0 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ def test_options():
"--mode server --mic 1",
"--mode server --beam_size 3",
"--mode server --temperature 0",
"--mode server --allow-padding",
"--mode server --num_block 3",
"--mode mic --host 0.0.0.0",
"--mode mic --port 8000",
Expand Down
13 changes: 7 additions & 6 deletions whispering/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,18 @@ def get_opts() -> argparse.Namespace:
action="append",
default=[],
)
group_ctx.add_argument(
"--allow-padding",
action="store_true",
)
group_ctx.add_argument(
"--vad",
type=float,
help="Threshold of VAD",
default=0.5,
)
group_ctx.add_argument(
"--max_nospeech_skip",
type=int,
help="Maximum number of skip to analyze because of nospeech",
default=16,
)

group_misc = parser.add_argument_group("Other options")
group_misc.add_argument(
Expand Down Expand Up @@ -224,7 +226,7 @@ def get_context(*, opts) -> Context:
protocol_version=CURRENT_PROTOCOL_VERSION,
beam_size=opts.beam_size,
temperatures=opts.temperature,
allow_padding=opts.allow_padding,
max_nospeech_skip=opts.max_nospeech_skip,
vad_threshold=opts.vad,
)
logger.debug(f"Context: {ctx}")
Expand All @@ -245,7 +247,6 @@ def is_valid_arg(opts) -> bool:
"mic",
"beam_size",
"temperature",
"allow_padding",
]
elif opts.mode == Mode.mic.value:
keys = [
Expand Down
3 changes: 2 additions & 1 deletion whispering/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ class Context(BaseModel, arbitrary_types_allowed=True):
timestamp: float = 0.0
buffer_tokens: List[torch.Tensor] = []
buffer_mel: Optional[torch.Tensor] = None
nosoeech_skip_count: Optional[int] = None

temperatures: List[float]
allow_padding: bool = False
patience: Optional[float] = None
compression_ratio_threshold: Optional[float] = 2.4
logprob_threshold: Optional[float] = -1.0
Expand All @@ -46,6 +46,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
compression_ratio_threshold: Optional[float] = 2.4
buffer_threshold: Optional[float] = 0.5
vad_threshold: float
max_nospeech_skip: int


class ParsedChunk(BaseModel):
Expand Down
27 changes: 24 additions & 3 deletions whispering/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def transcribe(
ctx: Context,
) -> Iterator[ParsedChunk]:
logger.debug(f"{len(audio)}")
force_padding: bool = False

if ctx.vad_threshold > 0.0:
x = [
Expand All @@ -246,7 +247,20 @@ def transcribe(
if len(x) == 0: # No speech
logger.debug("No speech")
ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel
return

if ctx.nosoeech_skip_count is not None:
ctx.nosoeech_skip_count += 1

if (
ctx.nosoeech_skip_count is None
or ctx.nosoeech_skip_count <= ctx.max_nospeech_skip
):
logger.debug(
f"nosoeech_skip_count: {ctx.nosoeech_skip_count} (<= {ctx.max_nospeech_skip})"
)
return
ctx.nosoeech_skip_count = None
force_padding = True

new_mel = log_mel_spectrogram(audio=audio)
logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
Expand All @@ -261,12 +275,15 @@ def transcribe(
seek: int = 0
while seek < mel.shape[-1]:
logger.debug(f"seek: {seek}")
if mel.shape[-1] - seek <= 0:
logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
break
if mel.shape[-1] - seek < N_FRAMES:
logger.debug(
f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
)
if ctx.allow_padding:
logger.warning("Padding is not expected while speaking")
if force_padding:
logger.debug("Padding")
else:
logger.debug("No padding")
break
Expand Down Expand Up @@ -319,9 +336,13 @@ def transcribe(
logger.debug(f"new seek={seek}, mel.shape: {mel.shape}")

if mel.shape[-1] - seek <= 0:
ctx.buffer_mel = None
ctx.nosoeech_skip_count = None
logger.debug(f"ctx.buffer_mel is None ({mel.shape}, {seek})")
return
ctx.buffer_mel = mel[:, seek:]
assert ctx.buffer_mel is not None
logger.debug(f"ctx.buffer_mel.shape: {ctx.buffer_mel.shape}")
del mel
if ctx.nosoeech_skip_count is None:
ctx.nosoeech_skip_count = 0 # start count
4 changes: 4 additions & 0 deletions whispering/vad.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

from logging import getLogger
from typing import Iterator, Optional

import numpy as np
Expand All @@ -8,6 +9,8 @@

from whispering.schema import SpeechSegment

logger = getLogger(__name__)


class VAD:
def __init__(
Expand Down Expand Up @@ -50,6 +53,7 @@ def my_ret(
torch.from_numpy(audio[start:end]),
SAMPLE_RATE,
).item()
logger.debug(f"VAD: {vad_prob} (threshold={threshold})")
if vad_prob > threshold:
if start_block_idx is None:
start_block_idx = idx
Expand Down

0 comments on commit 75147ca

Please sign in to comment.