diff --git a/README.md b/README.md index abd6bc4..c13e98f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ If you only want to use the `edge-tts` and `edge-playback` commands, it would be If you want to use the `edge-tts` command, you can simply run it with the following command: - $ edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.vtt + $ edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.srt If you wish to play it back immediately with subtitles, you could use the `edge-playback` command: @@ -48,7 +48,7 @@ You can change the voice used by the text-to-speech service by using the `--voic ar-EG-SalmaNeural Female General Friendly, Positive ... - $ edge-tts --voice ar-EG-SalmaNeural --text "مرحبا كيف حالك؟" --write-media hello_in_arabic.mp3 --write-subtitles hello_in_arabic.vtt + $ edge-tts --voice ar-EG-SalmaNeural --text "مرحبا كيف حالك؟" --write-media hello_in_arabic.mp3 --write-subtitles hello_in_arabic.srt ### Custom SSML @@ -58,9 +58,9 @@ Support for custom SSML was removed because Microsoft prevents the use of any SS You can change the rate, volume and pitch of the generated speech by using the `--rate`, `--volume` and `--pitch` options. When using a negative value, you will need to use `--[option]=-50%` instead of `--[option] -50%` to avoid the option being interpreted as a command line option. - $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_lowered.mp3 --write-subtitles hello_with_rate_lowered.vtt - $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_lowered.mp3 --write-subtitles hello_with_volume_lowered.vtt - $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.vtt + $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_lowered.mp3 --write-subtitles hello_with_rate_lowered.srt + $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_lowered.mp3 --write-subtitles hello_with_volume_lowered.srt + $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.srt ## Python module diff --git a/examples/streaming_with_subtitles.py b/examples/streaming_with_subtitles.py index cccc904..52e91bb 100644 --- a/examples/streaming_with_subtitles.py +++ b/examples/streaming_with_subtitles.py @@ -14,7 +14,7 @@ TEXT = "Hello World!" VOICE = "en-GB-SoniaNeural" OUTPUT_FILE = "test.mp3" -WEBVTT_FILE = "test.vtt" +SRT_FILE = "test.srt" async def amain() -> None: @@ -26,10 +26,10 @@ async def amain() -> None: if chunk["type"] == "audio": file.write(chunk["data"]) elif chunk["type"] == "WordBoundary": - submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"]) + submaker.add_cue((chunk["offset"], chunk["duration"]), chunk["text"]) - with open(WEBVTT_FILE, "w", encoding="utf-8") as file: - file.write(submaker.generate_subs()) + with open(SRT_FILE, "w", encoding="utf-8") as file: + file.write(submaker.get_srt()) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 5d06540..4af2066 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ install_requires=[ "aiohttp>=3.8.0", "certifi>=2023.11.17", + "srt>=3.4.1", "tabulate>=0.4.4", "typing-extensions>=4.1.0", ], diff --git a/src/edge_playback/__main__.py b/src/edge_playback/__main__.py index aed8801..89a5c27 100644 --- a/src/edge_playback/__main__.py +++ b/src/edge_playback/__main__.py @@ -25,7 +25,7 @@ def _main() -> None: keep = os.environ.get("EDGE_PLAYBACK_KEEP_TEMP") is not None mp3_fname = os.environ.get("EDGE_PLAYBACK_MP3_FILE") - vtt_fname = os.environ.get("EDGE_PLAYBACK_VTT_FILE") + srt_fname = os.environ.get("EDGE_PLAYBACK_SRT_FILE") media, subtitle = None, None try: if not mp3_fname: @@ -33,18 +33,18 @@ def _main() -> None: media.close() mp3_fname = media.name - if not vtt_fname: - subtitle = tempfile.NamedTemporaryFile(suffix=".vtt", delete=False) + if not srt_fname: + subtitle = tempfile.NamedTemporaryFile(suffix=".srt", delete=False) subtitle.close() - vtt_fname = subtitle.name + srt_fname = subtitle.name print(f"Media file: {mp3_fname}") - print(f"Subtitle file: {vtt_fname}\n") + print(f"Subtitle file: {srt_fname}\n") with subprocess.Popen( [ "edge-tts", f"--write-media={mp3_fname}", - f"--write-subtitles={vtt_fname}", + f"--write-subtitles={srt_fname}", ] + sys.argv[1:] ) as process: @@ -53,19 +53,19 @@ def _main() -> None: with subprocess.Popen( [ "mpv", - f"--sub-file={vtt_fname}", + f"--sub-file={srt_fname}", mp3_fname, ] ) as process: process.communicate() finally: if keep: - print(f"\nKeeping temporary files: {mp3_fname} and {vtt_fname}") + print(f"\nKeeping temporary files: {mp3_fname} and {srt_fname}") else: if mp3_fname is not None and os.path.exists(mp3_fname): os.unlink(mp3_fname) - if vtt_fname is not None and os.path.exists(vtt_fname): - os.unlink(vtt_fname) + if srt_fname is not None and os.path.exists(srt_fname): + os.unlink(srt_fname) if __name__ == "__main__": diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py index 330fc51..ac60c33 100644 --- a/src/edge_tts/submaker.py +++ b/src/edge_tts/submaker.py @@ -1,33 +1,8 @@ """SubMaker module is used to generate subtitles from WordBoundary events.""" -import math from typing import List, Tuple -from xml.sax.saxutils import escape, unescape - -def formatter(start_time: float, end_time: float, subdata: str) -> str: - """ - formatter returns the timecode and the text of the subtitle. - """ - return ( - f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\r\n" - f"{escape(subdata)}\r\n\r\n" - ) - - -def mktimestamp(time_unit: float) -> str: - """ - mktimestamp returns the timecode of the subtitle. - - The timecode is in the format of 00:00:00.000. - - Returns: - str: The timecode of the subtitle. - """ - hour = math.floor(time_unit / 10**7 / 3600) - minute = math.floor((time_unit / 10**7 / 60) % 60) - seconds = (time_unit / 10**7) % 60 - return f"{hour:02d}:{minute:02d}:{seconds:06.3f}" +import srt # type: ignore class SubMaker: @@ -36,19 +11,11 @@ class SubMaker: """ def __init__(self) -> None: - """ - SubMaker constructor initializes the list of subtitles and the list of offsets. + self.cues: List[srt.Subtitle] = [] # type: ignore - Returns: - None + def add_cue(self, timestamp: Tuple[float, float], text: str) -> None: """ - self.offset: List[Tuple[float, float]] = [] - self.subs: List[str] = [] - - def create_sub(self, timestamp: Tuple[float, float], text: str) -> None: - """ - create_sub creates a subtitle from the given timestamp and text, - and appends it to the list of subtitles. + Add a subtitle part to the SubMaker object. Args: timestamp (tuple): The offset and duration of the subtitle. @@ -57,67 +24,20 @@ def create_sub(self, timestamp: Tuple[float, float], text: str) -> None: Returns: None """ - self.offset.append((timestamp[0], timestamp[0] + timestamp[1])) - self.subs.append(text) - - def generate_subs(self, words_in_cue: int = 10) -> str: + self.cues.append( + srt.Subtitle( + index=len(self.cues) + 1, + start=srt.timedelta(microseconds=timestamp[0] / 10), + end=srt.timedelta(microseconds=sum(timestamp) / 10), + content=text, + ) + ) + + def get_srt(self) -> str: """ - generate_subs generates the complete subtitle file. - - Args: - words_in_cue (int): defines the number of words in a given cue + Get the SRT formatted subtitles from the SubMaker object. Returns: - str: The complete subtitle file. + str: The SRT formatted subtitles. """ - if len(self.subs) != len(self.offset): - raise ValueError("subs and offset are not of the same length") - - if words_in_cue <= 0: - raise ValueError("words_in_cue must be greater than 0") - - data = "WEBVTT\r\n\r\n" - sub_state_count = 0 - sub_state_start = -1.0 - sub_state_subs = "" - for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)): - start_time, end_time = offset - subs = unescape(subs) - - # wordboundary is guaranteed not to contain whitespace - if len(sub_state_subs) > 0: - sub_state_subs += " " - sub_state_subs += subs - - if sub_state_start == -1.0: - sub_state_start = start_time - sub_state_count += 1 - - if sub_state_count == words_in_cue or idx == len(self.offset) - 1: - subs = sub_state_subs - split_subs: List[str] = [ - subs[i : i + 79] for i in range(0, len(subs), 79) - ] - for i in range(len(split_subs) - 1): - sub = split_subs[i] - split_at_word = True - if sub[-1] == " ": - split_subs[i] = sub[:-1] - split_at_word = False - - if sub[0] == " ": - split_subs[i] = sub[1:] - split_at_word = False - - if split_at_word: - split_subs[i] += "-" - - data += formatter( - start_time=sub_state_start, - end_time=end_time, - subdata="\r\n".join(split_subs), - ) - sub_state_count = 0 - sub_state_start = -1 - sub_state_subs = "" - return data + return srt.compose(self.cues) # type: ignore diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py index 64f0601..9781f56 100644 --- a/src/edge_tts/util.py +++ b/src/edge_tts/util.py @@ -61,7 +61,7 @@ async def _run_tts(args: Any) -> None: if chunk["type"] == "audio": audio_file.write(chunk["data"]) elif chunk["type"] == "WordBoundary": - subs.create_sub((chunk["offset"], chunk["duration"]), chunk["text"]) + subs.add_cue((chunk["offset"], chunk["duration"]), chunk["text"]) sub_file: Union[TextIOWrapper, TextIO] = ( open(args.write_subtitles, "w", encoding="utf-8") @@ -69,7 +69,7 @@ async def _run_tts(args: Any) -> None: else sys.stderr ) with sub_file: - sub_file.write(subs.generate_subs(args.words_in_cue)) + sub_file.write(subs.get_srt()) async def amain() -> None: @@ -93,12 +93,6 @@ async def amain() -> None: parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%") parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%") parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz") - parser.add_argument( - "--words-in-cue", - help="number of words in a subtitle cue. Default: 10.", - default=10, - type=float, - ) parser.add_argument( "--write-media", help="send media output to file instead of stdout" )