[New feature] Adding visemes as part of the output #99

Open · wants to merge 13 commits into base: main
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,3 +1,6 @@
 __pycache__
 tmp
-cache
+cache
+mlx_models/
+asset/
+config/
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,12 @@
repos:
- repo: https://github.com/codespell-project/codespell
rev: v2.2.5 # Specify the latest stable version
hooks:
- id: codespell
args: ["-w"] # The -w flag tells codespell to automatically apply fixes

- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.1.1 # Replace with the latest stable version of ruff-pre-commit
hooks:
- id: ruff
args: ["--fix"] # This will automatically fix linting issues
2 changes: 1 addition & 1 deletion LLM/chat.py
@@ -6,7 +6,7 @@ class Chat:
     def __init__(self, size):
         self.size = size
         self.init_chat_message = None
-        # maxlen is necessary pair, since a each new step we add an prompt and assitant answer
+        # maxlen must come in pairs, since at each new step we add a prompt and an assistant answer
         self.buffer = []

     def append(self, item):
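
A throwaway illustration of the invariant that comment is getting at; the `deque` here is purely hypothetical (the class itself uses a plain list):

```python
# Hypothetical illustration only: a capped history must count in pairs,
# because every exchange appends one user prompt and one assistant answer.
from collections import deque

history = deque(maxlen=2 * 3)  # room for the last 3 exchanges

for turn in range(5):
    history.append({"role": "user", "content": f"question {turn}"})
    history.append({"role": "assistant", "content": f"answer {turn}"})

# Oldest exchanges fall out whole; an odd maxlen could strand an answer
# at the front of the buffer without its prompt.
assert len(history) == 6
```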
4 changes: 2 additions & 2 deletions LLM/language_model.py
@@ -68,7 +68,7 @@ def setup(
         if init_chat_role:
             if not init_chat_prompt:
                 raise ValueError(
-                    "An initial promt needs to be specified when setting init_chat_role."
+                    "An initial prompt needs to be specified when setting init_chat_role."
                 )
             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
         self.user_role = user_role
@@ -111,7 +111,7 @@ def warmup(self):
         )

     def process(self, prompt):
-        logger.debug("infering language model...")
+        logger.debug("inferring language model...")
         language_code = None
         if isinstance(prompt, tuple):
             prompt, language_code = prompt
4 changes: 2 additions & 2 deletions LLM/mlx_language_model.py
@@ -42,7 +42,7 @@ def setup(
         if init_chat_role:
             if not init_chat_prompt:
                 raise ValueError(
-                    "An initial promt needs to be specified when setting init_chat_role."
+                    "An initial prompt needs to be specified when setting init_chat_role."
                 )
             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
         self.user_role = user_role
@@ -68,7 +68,7 @@ def warmup(self):
         )

     def process(self, prompt):
-        logger.debug("infering language model...")
+        logger.debug("inferring language model...")
         language_code = None

         if isinstance(prompt, tuple):
4 changes: 2 additions & 2 deletions LLM/openai_api_language_model.py
@@ -44,7 +44,7 @@ def setup(
         if init_chat_role:
             if not init_chat_prompt:
                 raise ValueError(
-                    "An initial promt needs to be specified when setting init_chat_role."
+                    "An initial prompt needs to be specified when setting init_chat_role."
                 )
             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
         self.user_role = user_role
@@ -54,7 +54,7 @@ def setup(
     def warmup(self):
         logger.info(f"Warming up {self.__class__.__name__}")
         start = time.time()
-        response = self.client.chat.completions.create(
+        _ = self.client.chat.completions.create(
             model=self.model_name,
             messages=[
                 {"role": "system", "content": "You are a helpful assistant"},
15 changes: 13 additions & 2 deletions README.md
@@ -28,6 +28,7 @@ This repository implements a speech-to-speech cascaded pipeline consisting of th
 2. **Speech to Text (STT)**
 3. **Language Model (LM)**
 4. **Text to Speech (TTS)**
+5. **Speech to Visemes (STV)**

 ### Modularity
 The pipeline provides a fully open and modular approach, with a focus on leveraging models available through the Transformers library on the Hugging Face hub. The code is designed for easy modification, and we already support device-specific and external library implementations:
@@ -50,6 +51,9 @@ The pipeline provides a fully open and modular approach, with a focus on leverag
 - [MeloTTS](https://github.com/myshell-ai/MeloTTS)
 - [ChatTTS](https://github.com/2noise/ChatTTS?tab=readme-ov-file)

+**STV**
+- [Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/en/model_doc/wav2vec2_phoneme) + [Phoneme to viseme mapping](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes)
+
 ## Setup

 Clone the repository:
@@ -80,7 +84,7 @@ The pipeline can be run in two ways:
 - **Server/Client approach**: Models run on a server, and audio input/output are streamed from a client.
 - **Local approach**: Runs locally.

-### Recommanded setup
+### Recommended setup

 ### Server/Client Approach

@@ -120,7 +124,7 @@ https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install

 ### Recommended usage with Cuda

-Leverage Torch Compile for Whisper and Parler-TTS. **The usage of Parler-TTS allows for audio output streaming, futher reducing the overeall latency** 🚀:
+Leverage Torch Compile for Whisper and Parler-TTS. **The usage of Parler-TTS allows for audio output streaming, further reducing the overall latency** 🚀:

 ```bash
 python s2s_pipeline.py \
@@ -216,6 +220,13 @@ For example:
 --lm_model_name google/gemma-2b-it
 ```

+
+### STV parameters
+See the [Wav2Vec2STVHandlerArguments](arguments_classes/w2v_stv_arguments.py) class. Notably:
+- `stv_model_name` defaults to `bookbot/wav2vec2-ljspeech-gruut`, chosen because it is accurate and fast enough (see the usage sketch after this diff)
+- `stv_skip`: set it to `True` if you don't need visemes
+
+
 ### Generation parameters

 Other generation parameters of the model's generate method can be set using the part's prefix + `_gen_`, e.g., `--stt_gen_max_new_tokens 128`. These parameters can be added to the pipeline part's arguments class if not already exposed.
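
The diff above does not include a full command line exercising the new flags. A hedged example, assuming `stv_model_name` and `stv_skip` are parsed by `s2s_pipeline.py` like the other parts' arguments:

```bash
# Run with the default viseme model made explicit (hypothetical invocation).
python s2s_pipeline.py \
    --stv_model_name bookbot/wav2vec2-ljspeech-gruut

# Or skip the speech-to-visemes step entirely.
python s2s_pipeline.py \
    --stv_skip True
```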
3 changes: 1 addition & 2 deletions STT/lightning_whisper_mlx_handler.py
@@ -4,7 +4,6 @@
 from lightning_whisper_mlx import LightningWhisperMLX
 import numpy as np
 from rich.console import Console
-from copy import copy
 import torch

 logger = logging.getLogger(__name__)
@@ -55,7 +54,7 @@ def warmup(self):
         _ = self.model.transcribe(dummy_input)["text"].strip()

     def process(self, spoken_prompt):
-        logger.debug("infering whisper...")
+        logger.debug("inferring whisper...")

         global pipeline_start
         pipeline_start = perf_counter()
3 changes: 1 addition & 2 deletions STT/paraformer_handler.py
@@ -28,7 +28,6 @@ def setup(
         device="cuda",
         gen_kwargs={},
     ):
-        print(model_name)
         if len(model_name.split("/")) > 1:
             model_name = model_name.split("/")[-1]
         self.device = device
@@ -45,7 +44,7 @@ def warmup(self):
         _ = self.model.generate(dummy_input)[0]["text"].strip().replace(" ", "")

     def process(self, spoken_prompt):
-        logger.debug("infering paraformer...")
+        logger.debug("inferring paraformer...")

         global pipeline_start
         pipeline_start = perf_counter()
2 changes: 1 addition & 1 deletion STT/whisper_stt_handler.py
@@ -109,7 +109,7 @@ def warmup(self):
         )

     def process(self, spoken_prompt):
-        logger.debug("infering whisper...")
+        logger.debug("inferring whisper...")

         global pipeline_start
         pipeline_start = perf_counter()
1 change: 1 addition & 0 deletions STV/phoneme_viseme_map.json
@@ -0,0 +1 @@
{"æ":[1],"ə":[1],"ʌ":[1],"ɑ":[2],"ɔ":[3],"ɛ":[4],"ʊ":[4],"ɝ":[5],"j":[6],"i":[6],"ɪ":[6],"w":[7],"u":[7],"o":[8],"aʊ":[9],"ɔɪ":[10],"aɪ":[11],"h":[12],"ɹ":[13],"l":[14],"s":[15],"z":[15],"ʃ":[16],"tʃ":[19,16],"dʒ":[19,16],"ʒ":[16],"ð":[17],"f":[18],"v":[18],"d":[19],"t":[19],"n":[19],"θ":[19],"k":[20],"g":[20],"ŋ":[20],"p":[21],"b":[21],"m":[21]," ":[0],"a":[2],"aː":[2],"iː":[6],"uː":[7],"dˤ":[19],"q":[20],"tˤ":[19],"ʔ":[19],"ħ":[12],"ðˤ":[17],"ɣ":[20],"x":[12],"sˤ":[15],"r":[13],"ʕ":[12],"j͡a":[6,2],"ɤ":[1],"j͡u":[6,7],"t͡s":[19,15],"zʲ":[15],"lʲ":[14],"nʲ":[19],"d͡ʒ":[19,16],"mʲ":[21],"tʲ":[19],"rʲ":[13],"pʲ":[21],"dʲ":[19],"vʲ":[18],"sʲ":[15],"bʲ":[21],"kʲ":[20],"gʲ":[20],"fʲ":[18],"t͡ʃ":[19,16],"d͡z":[19,15],"e":[4],"β":[21],"ʎ":[14],"ɲ":[19],"ɾ":[19],"ɛː":[4],"oː":[8],"o͡ʊ̯":[8,4],"a͡ʊ":[2,4],"ɛ͡ʊ̯":[4,4],"c":[16],"ɟ":[16],"r̝":[13],"ɦ":[12],"ɱ":[21],"r̝̊":[13],"ɑː":[2],"ɒ":[2],"ɒː":[2],"ɔː":[3],"ɐ":[4],"æː":[1],"ø":[1],"øː":[1],"eː":[4],"œ":[4],"œː":[4],"y":[4],"yː":[4],"kʰ":[20],"pʰ":[21],"ʁ":[13],"ɐ̯":[4],"ɕ":[16],"ʏ":[7],"ai":[2,6],"au":[2,7],"ɔy":[3,4],"ɔʏ̯":[3,4],"ʤ":[16],"pf":[21,18],"ʀ":[13],"ts":[19,15],"ç":[12],"ʝ":[12],"ɛə":[4,1],"ɜː":[5],"eɪ":[4,6],"ɪə":[6,1],"əʊ":[1,4],"ʊə":[4,1],"iy":[6],"oʊ":[8,4],"ju":[6,7],"ɪɹ":[6,13],"ɛɹ":[4,13],"ʊɹ":[4,13],"aɪɹ":[11,13],"aʊɹ":[9,13],"ɔɹ":[3,13],"ɑɹ":[2,13],"ɚ":[1],"j͡j":[6,6],"ɑ͡i":[2,6],"ɑ͡u":[2,7],"æ͡i":[1,6],"æ͡y":[1,4],"e͡i":[4,6],"ø͡i":[1,6],"ø͡y":[1,4],"e͡u":[4,7],"e͡y":[4,4],"i͡e":[6,4],"i͡u":[6,7],"i͡y":[6,4],"o͡i":[8,6],"o͡u":[8,7],"u͡i":[7,6],"u͡o":[7,8],"y͡ø":[4,1],"y͡i":[4,6],"ʋ":[18],"ɑ̃":[2],"ɛ̃":[4],"ɔ̃":[3],"œ̃":[4],"ɥ":[7],"n‿":[19],"t‿":[19],"z‿":[15],"ʨ":[16],"ʥ":[16],"bː":[21],"dː":[19],"ɟː":[16],"d͡ʒː":[19,16],"dz":[19,15],"dzː":[19,15],"fː":[18],"gː":[20],"hː":[12],"jː":[6],"ɲː":[19],"kː":[20],"lː":[14],"mː":[21],"nː":[19],"pː":[21],"rː":[13],"sː":[15],"ʃː":[16],"tː":[19],"cː":[16],"t͡sː":[19,15],"t͡ʃː":[19,16],"vː":[18],"ɰ":[20],"zː":[15],"ʒː":[16],"a͡i":[2,6],"ɔ͡i":[3,6],"ɛj":[4,6],"ɛu":[4,7],"ei":[4,6],"eu":[4,7],"ɔj":[3,6],"oi":[8,6],"ou":[8,7],"ʧ":[16],"tʃː":[19,16],"ʣ":[15],"ʣː":[15],"ʤː":[16],"ʎː":[14],"ʦ":[15],"ʦː":[15],"ɯ":[6],"ɰ͡i":[20,6],"w͡a":[7,2],"w͡ɛ":[7,4],"w͡e":[7,4],"w͡i":[7,6],"w͡ʌ":[7,1],"j͡ɛ":[6,4],"j͡e":[6,4],"j͡ʌ":[6,1],"j͡o":[6,8],"b̥":[21],"t͡ɕʰ":[19,16],"d̥":[19],"g̥":[20],"d͡ʑ":[19,16],"d͡ʑ̥":[19,16],"t͡ɕ":[19,16],"sʰ":[15],"tʰ":[19],"ʉ":[6],"ʉː":[6],"æɪ":[1,6],"æʉ":[1,6],"ɑɪ":[2,6],"œʏ":[4,7],"ɔʏ":[3,7],"ʉɪ":[6,6],"ʂ":[15],"ɖ":[19],"ɭ":[14],"ɳ":[19],"ʈ":[19],"ɛ͡i":[4,6],"œ͡y":[4,4],"χ":[12],"ɨ":[6],"t͡ʂ":[19,15],"d̪ʲ":[19],"ɡ":[20],"d͡ʐ":[19,15],"l̪ʲ":[14],"t̪ʲ":[19],"xʲ":[12],"ʑ":[16],"ĩ":[6],"ũ":[7],"ɐ̃":[4],"ẽ":[4],"õ":[8],"w̃":[7],"j̃":[6],"ɐj":[4,6],"ɐ̃j̃":[4,6],"ɐ̃w̃":[4,7],"ɐ͡w":[4,7],"a͡j":[2,6],"ɔ͡j":[3,6],"a͡w":[2,7],"ɛ͡w":[4,7],"e͡w":[4,7],"i͡w":[6,7],"o͡j":[8,6],"õj̃":[8,6],"u͡j":[7,6],"ũj̃":[7,6],"ɫ":[14],"e̯a":[4,2],"e̯o":[4,8],"o̯a":[8,2],"d͡ʒʲ":[19,16],"ʃʲ":[16],"t͡sʲ":[19,15],"t͡ʃʲ":[19,16],"ʒʲ":[16],"ʐ":[15],"ɕː":[16],"i͡a":[6,2],"r̩":[13],"r̩ː":[13],"l̩":[14],"l̩ː":[14],"ɴ":[19],"u̯":[7],"i̯":[6],"dˡ":[19],"dn":[19,19],"tˡ":[19],"tn":[19,19],"ʍ":[7],"a‿u":[2,7],"ɶ":[8],"ɵ":[1],"ɧ":[16],"ia":[6,2],"əː":[1],"ua":[7,2],"ɯː":[6],"ɯa":[6,2],"tɕʰ":[19,16],"œ͡ɟ":[4,16],"i͡ɟ":[6,16],"o͡ɟ":[8,16],"u͡ɟ":[7,16],"ɯ͡ɟ":[6,16],"y͡ɟ":[4,16],"ɮ":[6],"u͡a":[7,2],"ɛ̆j":[4,6],"ə͡j":[1,6],"i͡e͡w":[6,4,7],"ɨ͡ə":[6,1],"ie":[6,4],"ăw":[2,7],"ăj":[2,6],"ɨ͡ə͡j":[6,1,6],"ɔ̆w":[3,7],"ɨ͡w":[6,7],"e͡j":[4,6],"ɨ͡ʌ͡w":[6,1,7],"ɨ͡j":[6,6],"iə":[6,1],"a͡ʲ":[2],"ɓ":[21],"ɗ":[19]}