diff --git a/README.md b/README.md index 7a56b2ad..84740df 100644 --- a/README.md +++ b/README.md @@ -63,17 +63,16 @@ You must first check the available voices with the `--list-voices` option: Support for custom SSML has been removed since 5.0.0 because Microsoft has taken the initiative to prevent it from working. You cannot use custom SSML anymore. -### Changing rate and volume +### Changing rate, volume and pitch It is possible to make minor changes to the generated speech. $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_halved.mp3 --write-subtitles hello_with_rate_halved.vtt $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_halved.mp3 --write-subtitles hello_with_volume_halved.vtt + $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.vtt In addition, it is required to use `--rate=-50%` instead of `--rate -50%` (note the lack of an equal sign) otherwise the `-50%` would be interpreted as just another argument. -**NOTE**: `--pitch` was removed in 6.0.3 as it no longer appears to have any effect. - ### Note on the `edge-playback` command `edge-playback` is just a wrapper around `edge-tts` that plays back the generated speech. It takes the same arguments as the `edge-tts` option. diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py index 0a994da..d482b84 100644 --- a/src/edge_tts/communicate.py +++ b/src/edge_tts/communicate.py @@ -152,7 +152,7 @@ def split_text_by_byte_length( yield new_text -def mkssml(text: Union[str, bytes], voice: str, rate: str, volume: str) -> str: +def mkssml(text: Union[str, bytes], voice: str, rate: str, volume: str, pitch: str) -> str: """ Creates a SSML string from the given parameters. 
@@ -164,7 +164,7 @@ def mkssml(text: Union[str, bytes], voice: str, rate: str, volume: str) -> str: ssml = ( "" - f"" + f"" f"{text}" ) return ssml @@ -203,7 +203,7 @@ def ssml_headers_plus_data(request_id: str, timestamp: str, ssml: str) -> str: ) -def calc_max_mesg_size(voice: str, rate: str, volume: str) -> int: +def calc_max_mesg_size(voice: str, rate: str, volume: str, pitch: str) -> int: """Calculates the maximum message size for the given voice, rate, and volume. Returns: @@ -215,7 +215,7 @@ def calc_max_mesg_size(voice: str, rate: str, volume: str) -> int: ssml_headers_plus_data( connect_id(), date_to_string(), - mkssml("", voice, rate, volume), + mkssml("", voice, rate, volume, pitch), ) ) + 50 # margin of error @@ -235,6 +235,7 @@ def __init__( *, rate: str = "+0%", volume: str = "+0%", + pitch: str = "+0Hz", proxy: Optional[str] = None, ): """ @@ -289,6 +290,12 @@ def __init__( raise ValueError(f"Invalid volume '{volume}'.") self.volume: str = volume + if not isinstance(pitch, str): + raise TypeError("pitch must be str") + if re.match(r"^[+-]\d+Hz$", pitch) is None: + raise ValueError(f"Invalid pitch '{pitch}'.") + self.pitch: str = pitch + if proxy is not None and not isinstance(proxy, str): raise TypeError("proxy must be str") self.proxy: Optional[str] = proxy @@ -298,7 +305,7 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: texts = split_text_by_byte_length( escape(remove_incompatible_characters(self.text)), - calc_max_mesg_size(self.voice, self.rate, self.volume), + calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch), ) final_utterance: Dict[int, int] = {} prev_idx = -1 @@ -362,7 +369,7 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: ssml_headers_plus_data( connect_id(), date, - mkssml(text, self.voice, self.rate, self.volume), + mkssml(text, self.voice, self.rate, self.volume, self.pitch), ) ) diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py index a26c9bd..0fde83e 100644 --- 
a/src/edge_tts/util.py +++ b/src/edge_tts/util.py @@ -57,6 +57,7 @@ async def _run_tts(args: Any) -> None: proxy=args.proxy, rate=args.rate, volume=args.volume, + pitch=args.pitch, ) subs: SubMaker = SubMaker() with open( @@ -97,6 +98,7 @@ async def amain() -> None: ) parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%") parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%") + parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz") parser.add_argument( "--words-in-cue", help="number of words in a subtitle cue. Default: 10.",