From a8eee9e906109746d54c0f1034e97e1103e6f7b4 Mon Sep 17 00:00:00 2001 From: rany Date: Fri, 22 Nov 2024 20:48:21 +0200 Subject: [PATCH] Drop words_in_cue code for SubMaker and switch to SRT WebVTT isn't a very common format in the first place and attempting to make WordBoundary play nice with input text is very hard. Instead we now just display the word that the TTS is saying at a given time. In the future, we could try to enable SentenceBoundary but there is a risk that it will be banned by Microsoft as it is not used by Microsoft Edge itself. Closes: https://github.com/rany2/edge-tts/issues/118 Closes: https://github.com/rany2/edge-tts/issues/171 Closes: https://github.com/rany2/edge-tts/issues/229 Closes: https://github.com/rany2/edge-tts/issues/234 Signed-off-by: rany --- README.md | 10 +-- examples/streaming_with_subtitles.py | 8 +- setup.py | 1 + src/edge_playback/__main__.py | 20 ++--- src/edge_tts/submaker.py | 114 ++++----------------------- src/edge_tts/util.py | 10 +-- 6 files changed, 39 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index abd6bc4..c13e98f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ If you only want to use the `edge-tts` and `edge-playback` commands, it would be If you want to use the `edge-tts` command, you can simply run it with the following command: - $ edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.vtt + $ edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.srt If you wish to play it back immediately with subtitles, you could use the `edge-playback` command: @@ -48,7 +48,7 @@ You can change the voice used by the text-to-speech service by using the `--voic ar-EG-SalmaNeural Female General Friendly, Positive ... - $ edge-tts --voice ar-EG-SalmaNeural --text "مرحبا كيف حالك؟" --write-media hello_in_arabic.mp3 --write-subtitles hello_in_arabic.vtt + $ edge-tts --voice ar-EG-SalmaNeural --text "مرحبا كيف حالك؟" --write-media hello_in_arabic.mp3 --write-subtitles hello_in_arabic.srt ### Custom SSML @@ -58,9 +58,9 @@ Support for custom SSML was removed because Microsoft prevents the use of any SS You can change the rate, volume and pitch of the generated speech by using the `--rate`, `--volume` and `--pitch` options. When using a negative value, you will need to use `--[option]=-50%` instead of `--[option] -50%` to avoid the option being interpreted as a command line option. - $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_lowered.mp3 --write-subtitles hello_with_rate_lowered.vtt - $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_lowered.mp3 --write-subtitles hello_with_volume_lowered.vtt - $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.vtt + $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_lowered.mp3 --write-subtitles hello_with_rate_lowered.srt + $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_lowered.mp3 --write-subtitles hello_with_volume_lowered.srt + $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.srt ## Python module diff --git a/examples/streaming_with_subtitles.py b/examples/streaming_with_subtitles.py index cccc904..52e91bb 100644 --- a/examples/streaming_with_subtitles.py +++ b/examples/streaming_with_subtitles.py @@ -14,7 +14,7 @@ TEXT = "Hello World!" VOICE = "en-GB-SoniaNeural" OUTPUT_FILE = "test.mp3" -WEBVTT_FILE = "test.vtt" +SRT_FILE = "test.srt" async def amain() -> None: @@ -26,10 +26,10 @@ async def amain() -> None: if chunk["type"] == "audio": file.write(chunk["data"]) elif chunk["type"] == "WordBoundary": - submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"]) + submaker.add_cue((chunk["offset"], chunk["duration"]), chunk["text"]) - with open(WEBVTT_FILE, "w", encoding="utf-8") as file: - file.write(submaker.generate_subs()) + with open(SRT_FILE, "w", encoding="utf-8") as file: + file.write(submaker.get_srt()) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 5d06540..4af2066 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ install_requires=[ "aiohttp>=3.8.0", "certifi>=2023.11.17", + "srt>=3.4.1", "tabulate>=0.4.4", "typing-extensions>=4.1.0", ], diff --git a/src/edge_playback/__main__.py b/src/edge_playback/__main__.py index aed8801..89a5c27 100644 --- a/src/edge_playback/__main__.py +++ b/src/edge_playback/__main__.py @@ -25,7 +25,7 @@ def _main() -> None: keep = os.environ.get("EDGE_PLAYBACK_KEEP_TEMP") is not None mp3_fname = os.environ.get("EDGE_PLAYBACK_MP3_FILE") - vtt_fname = os.environ.get("EDGE_PLAYBACK_VTT_FILE") + srt_fname = os.environ.get("EDGE_PLAYBACK_SRT_FILE") media, subtitle = None, None try: if not mp3_fname: @@ -33,18 +33,18 @@ def _main() -> None: media.close() mp3_fname = media.name - if not vtt_fname: - subtitle = tempfile.NamedTemporaryFile(suffix=".vtt", delete=False) + if not srt_fname: + subtitle = tempfile.NamedTemporaryFile(suffix=".srt", delete=False) subtitle.close() - vtt_fname = subtitle.name + srt_fname = subtitle.name print(f"Media file: {mp3_fname}") - print(f"Subtitle file: {vtt_fname}\n") + print(f"Subtitle file: {srt_fname}\n") with subprocess.Popen( [ "edge-tts", f"--write-media={mp3_fname}", - f"--write-subtitles={vtt_fname}", + f"--write-subtitles={srt_fname}", ] + sys.argv[1:] ) as process: @@ -53,19 +53,19 @@ def _main() -> None: with subprocess.Popen( [ "mpv", - f"--sub-file={vtt_fname}", + f"--sub-file={srt_fname}", mp3_fname, ] ) as process: process.communicate() finally: if keep: - print(f"\nKeeping temporary files: {mp3_fname} and {vtt_fname}") + print(f"\nKeeping temporary files: {mp3_fname} and {srt_fname}") else: if mp3_fname is not None and os.path.exists(mp3_fname): os.unlink(mp3_fname) - if vtt_fname is not None and os.path.exists(vtt_fname): - os.unlink(vtt_fname) + if srt_fname is not None and os.path.exists(srt_fname): + os.unlink(srt_fname) if __name__ == "__main__": diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py index 330fc51..ac60c33 100644 --- a/src/edge_tts/submaker.py +++ b/src/edge_tts/submaker.py @@ -1,33 +1,8 @@ """SubMaker module is used to generate subtitles from WordBoundary events.""" -import math from typing import List, Tuple -from xml.sax.saxutils import escape, unescape - -def formatter(start_time: float, end_time: float, subdata: str) -> str: - """ - formatter returns the timecode and the text of the subtitle. - """ - return ( - f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\r\n" - f"{escape(subdata)}\r\n\r\n" - ) - - -def mktimestamp(time_unit: float) -> str: - """ - mktimestamp returns the timecode of the subtitle. - - The timecode is in the format of 00:00:00.000. - - Returns: - str: The timecode of the subtitle. - """ - hour = math.floor(time_unit / 10**7 / 3600) - minute = math.floor((time_unit / 10**7 / 60) % 60) - seconds = (time_unit / 10**7) % 60 - return f"{hour:02d}:{minute:02d}:{seconds:06.3f}" +import srt # type: ignore class SubMaker: @@ -36,19 +11,11 @@ class SubMaker: """ def __init__(self) -> None: - """ - SubMaker constructor initializes the list of subtitles and the list of offsets. + self.cues: List[srt.Subtitle] = [] # type: ignore - Returns: - None + def add_cue(self, timestamp: Tuple[float, float], text: str) -> None: """ - self.offset: List[Tuple[float, float]] = [] - self.subs: List[str] = [] - - def create_sub(self, timestamp: Tuple[float, float], text: str) -> None: - """ - create_sub creates a subtitle from the given timestamp and text, - and appends it to the list of subtitles. + Add a subtitle part to the SubMaker object. Args: timestamp (tuple): The offset and duration of the subtitle. @@ -57,67 +24,20 @@ def create_sub(self, timestamp: Tuple[float, float], text: str) -> None: Returns: None """ - self.offset.append((timestamp[0], timestamp[0] + timestamp[1])) - self.subs.append(text) - - def generate_subs(self, words_in_cue: int = 10) -> str: + self.cues.append( + srt.Subtitle( + index=len(self.cues) + 1, + start=srt.timedelta(microseconds=timestamp[0] / 10), + end=srt.timedelta(microseconds=sum(timestamp) / 10), + content=text, + ) + ) + + def get_srt(self) -> str: """ - generate_subs generates the complete subtitle file. - - Args: - words_in_cue (int): defines the number of words in a given cue + Get the SRT formatted subtitles from the SubMaker object. Returns: - str: The complete subtitle file. + str: The SRT formatted subtitles. """ - if len(self.subs) != len(self.offset): - raise ValueError("subs and offset are not of the same length") - - if words_in_cue <= 0: - raise ValueError("words_in_cue must be greater than 0") - - data = "WEBVTT\r\n\r\n" - sub_state_count = 0 - sub_state_start = -1.0 - sub_state_subs = "" - for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)): - start_time, end_time = offset - subs = unescape(subs) - - # wordboundary is guaranteed not to contain whitespace - if len(sub_state_subs) > 0: - sub_state_subs += " " - sub_state_subs += subs - - if sub_state_start == -1.0: - sub_state_start = start_time - sub_state_count += 1 - - if sub_state_count == words_in_cue or idx == len(self.offset) - 1: - subs = sub_state_subs - split_subs: List[str] = [ - subs[i : i + 79] for i in range(0, len(subs), 79) - ] - for i in range(len(split_subs) - 1): - sub = split_subs[i] - split_at_word = True - if sub[-1] == " ": - split_subs[i] = sub[:-1] - split_at_word = False - - if sub[0] == " ": - split_subs[i] = sub[1:] - split_at_word = False - - if split_at_word: - split_subs[i] += "-" - - data += formatter( - start_time=sub_state_start, - end_time=end_time, - subdata="\r\n".join(split_subs), - ) - sub_state_count = 0 - sub_state_start = -1 - sub_state_subs = "" - return data + return srt.compose(self.cues) # type: ignore diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py index 64f0601..9781f56 100644 --- a/src/edge_tts/util.py +++ b/src/edge_tts/util.py @@ -61,7 +61,7 @@ async def _run_tts(args: Any) -> None: if chunk["type"] == "audio": audio_file.write(chunk["data"]) elif chunk["type"] == "WordBoundary": - subs.create_sub((chunk["offset"], chunk["duration"]), chunk["text"]) + subs.add_cue((chunk["offset"], chunk["duration"]), chunk["text"]) sub_file: Union[TextIOWrapper, TextIO] = ( open(args.write_subtitles, "w", encoding="utf-8") @@ -69,7 +69,7 @@ async def _run_tts(args: Any) -> None: else sys.stderr ) with sub_file: - sub_file.write(subs.generate_subs(args.words_in_cue)) + sub_file.write(subs.get_srt()) async def amain() -> None: @@ -93,12 +93,6 @@ async def amain() -> None: parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%") parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%") parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz") - parser.add_argument( - "--words-in-cue", - help="number of words in a subtitle cue. Default: 10.", - default=10, - type=float, - ) parser.add_argument( "--write-media", help="send media output to file instead of stdout" )