Refactor communicate for better readability

rany2 · rany2 · commit 755e543b3674 · 2024-02-16T18:36:20.000+02:00
Also improve performance on larger documents.

Signed-off-by: rany2 <rany2@riseup.net>
diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
@@ -304,68 +304,40 @@ def __init__(
     async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
         """Streams audio and metadata from the service."""
 
-        texts = split_text_by_byte_length(
-            escape(remove_incompatible_characters(self.text)),
-            calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
-        )
-        final_utterance: Dict[int, int] = {}
-        prev_idx = -1
-        shift_time = -1
+        async def send_request(websocket: aiohttp.ClientWebSocketResponse) -> None:
+            """Sends the request to the service."""
+
+            # Each message needs to have the proper date.
+            date = date_to_string()
+
+            # Prepare the request to be sent to the service.
+            #
+            # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
+            # to be booleans, but Edge Browser seems to send them as strings.
+            #
+            # This is a bug in Edge as Azure Cognitive Services actually sends them as
+            # bool and not string. For now I will send them as bool unless it causes
+            # any problems.
+            #
+            # Also pay close attention to double { } in request (escape for f-string).
+            await websocket.send_str(
+                f"X-Timestamp:{date}\r\n"
+                "Content-Type:application/json; charset=utf-8\r\n"
+                "Path:speech.config\r\n\r\n"
+                '{"context":{"synthesis":{"audio":{"metadataoptions":{'
+                '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
+                '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
+                "}}}}\r\n"
+            )
 
-        ssl_ctx = ssl.create_default_context(cafile=certifi.where())
-        for idx, text in enumerate(texts):
-            async with aiohttp.ClientSession(
-                trust_env=True,
-            ) as session, session.ws_connect(
-                f"{WSS_URL}&ConnectionId={connect_id()}",
-                compress=15,
-                autoclose=True,
-                autoping=True,
-                proxy=self.proxy,
-                headers={
-                    "Pragma": "no-cache",
-                    "Cache-Control": "no-cache",
-                    "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-                    "Accept-Encoding": "gzip, deflate, br",
-                    "Accept-Language": "en-US,en;q=0.9",
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                    " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
-                },
-                ssl=ssl_ctx,
-            ) as websocket:
-                # download indicates whether we should be expecting audio data,
-                # this is so what we avoid getting binary data from the websocket
-                # and falsely thinking it's audio data.
-                download_audio = False
-
-                # audio_was_received indicates whether we have received audio data
-                # from the websocket. This is so we can raise an exception if we
-                # don't receive any audio data.
-                audio_was_received = False
-
-                # Each message needs to have the proper date.
-                date = date_to_string()
-
-                # Prepare the request to be sent to the service.
-                #
-                # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
-                # to be booleans, but Edge Browser seems to send them as strings.
-                #
-                # This is a bug in Edge as Azure Cognitive Services actually sends them as
-                # bool and not string. For now I will send them as bool unless it causes
-                # any problems.
-                #
-                # Also pay close attention to double { } in request (escape for f-string).
-                await websocket.send_str(
-                    f"X-Timestamp:{date}\r\n"
-                    "Content-Type:application/json; charset=utf-8\r\n"
-                    "Path:speech.config\r\n\r\n"
-                    '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                    '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
-                    '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
-                    "}}}}\r\n"
-                )
+            # Split the text into multiple strings if it is too long for the service.
+            texts = split_text_by_byte_length(
+                escape(remove_incompatible_characters(self.text)),
+                calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
+            )
 
+            # Send the request to the service.
+            for text in texts:
                 await websocket.send_str(
                     ssml_headers_plus_data(
                         connect_id(),
@@ -374,86 +346,89 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
                     )
                 )
 
-                async for received in websocket:
-                    if received.type == aiohttp.WSMsgType.TEXT:
-                        parameters, data = get_headers_and_data(received.data)
-                        path = parameters.get(b"Path")
-                        if path == b"turn.start":
-                            download_audio = True
-                        elif path == b"turn.end":
-                            download_audio = False
-                            break  # End of audio data
-                        elif path == b"audio.metadata":
-                            for meta_obj in json.loads(data)["Metadata"]:
-                                meta_type = meta_obj["Type"]
-                                if idx != prev_idx:
-                                    shift_time = sum(
-                                        final_utterance[i] for i in range(idx)
-                                    )
-                                    prev_idx = idx
-                                if meta_type == "WordBoundary":
-                                    final_utterance[idx] = (
-                                        meta_obj["Data"]["Offset"]
-                                        + meta_obj["Data"]["Duration"]
-                                        # Average padding added by the service
-                                        # Alternatively we could use ffmpeg to get value properly
-                                        # but I don't want to add an additional dependency
-                                        # if this is found to work well enough.
-                                        + 8_750_000
-                                    )
-                                    yield {
-                                        "type": meta_type,
-                                        "offset": meta_obj["Data"]["Offset"]
-                                        + shift_time,
-                                        "duration": meta_obj["Data"]["Duration"],
-                                        "text": meta_obj["Data"]["text"]["Text"],
-                                    }
-                                elif meta_type == "SessionEnd":
-                                    continue
-                                else:
-                                    raise UnknownResponse(
-                                        f"Unknown metadata type: {meta_type}"
-                                    )
-                        elif path == b"response":
-                            pass
-                        else:
-                            raise UnknownResponse(
-                                "The response from the service is not recognized.\n"
-                                + received.data
-                            )
-                    elif received.type == aiohttp.WSMsgType.BINARY:
-                        if not download_audio:
-                            raise UnexpectedResponse(
-                                "We received a binary message, but we are not expecting one."
-                            )
-
-                        if len(received.data) < 2:
-                            raise UnexpectedResponse(
-                                "We received a binary message, but it is missing the header length."
-                            )
-
-                        # See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
-                        header_length = int.from_bytes(received.data[:2], "big")
-                        if len(received.data) < header_length + 2:
-                            raise UnexpectedResponse(
-                                "We received a binary message, but it is missing the audio data."
-                            )
-
-                        yield {
-                            "type": "audio",
-                            "data": received.data[header_length + 2 :],
-                        }
-                        audio_was_received = True
-                    elif received.type == aiohttp.WSMsgType.ERROR:
-                        raise WebSocketError(
-                            received.data if received.data else "Unknown error"
+        ssl_ctx = ssl.create_default_context(cafile=certifi.where())
+        async with aiohttp.ClientSession(
+            trust_env=True,
+        ) as session, session.ws_connect(
+            f"{WSS_URL}&ConnectionId={connect_id()}",
+            compress=15,
+            autoclose=True,
+            autoping=True,
+            proxy=self.proxy,
+            headers={
+                "Pragma": "no-cache",
+                "Cache-Control": "no-cache",
+                "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+                "Accept-Encoding": "gzip, deflate, br",
+                "Accept-Language": "en-US,en;q=0.9",
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
+            },
+            ssl=ssl_ctx,
+        ) as websocket:
+            # audio_was_received indicates whether we have received audio data
+            # from the websocket. This is so we can raise an exception if we
+            # don't receive any audio data.
+            audio_was_received = False
+
+            # Send the request to the service.
+            await send_request(websocket)
+
+            async for received in websocket:
+                if received.type == aiohttp.WSMsgType.TEXT:
+                    parameters, data = get_headers_and_data(received.data)
+                    path = parameters.get(b"Path")
+                    if path == b"audio.metadata":
+                        for meta_obj in json.loads(data)["Metadata"]:
+                            meta_type = meta_obj["Type"]
+                            if meta_type == "WordBoundary":
+                                yield {
+                                    "type": meta_type,
+                                    "offset": meta_obj["Data"]["Offset"],
+                                    "duration": meta_obj["Data"]["Duration"],
+                                    "text": meta_obj["Data"]["text"]["Text"],
+                                }
+                            elif meta_type in ("SessionEnd",):
+                                continue
+                            else:
+                                raise UnknownResponse(
+                                    f"Unknown metadata type: {meta_type}"
+                                )
+                    elif path in (b"response", b"turn.start", b"turn.end"):
+                        pass
+                    else:
+                        raise UnknownResponse(
+                            "The response from the service is not recognized.\n"
+                            + received.data
+                        )
+                elif received.type == aiohttp.WSMsgType.BINARY:
+                    if len(received.data) < 2:
+                        raise UnexpectedResponse(
+                            "We received a binary message, but it is missing the header length."
+                        )
+
+                    # See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
+                    header_length = int.from_bytes(received.data[:2], "big")
+                    if len(received.data) < header_length + 2:
+                        raise UnexpectedResponse(
+                            "We received a binary message, but it is missing the audio data."
                         )
 
-                if not audio_was_received:
-                    raise NoAudioReceived(
-                        "No audio was received. Please verify that your parameters are correct."
+                    audio_was_received = header_length > 0
+                    yield {
+                        "type": "audio",
+                        "data": received.data[header_length + 2 :],
+                    }
+                elif received.type == aiohttp.WSMsgType.ERROR:
+                    raise WebSocketError(
+                        received.data if received.data else "Unknown error"
                     )
 
+            if not audio_was_received:
+                raise NoAudioReceived(
+                    "No audio was received. Please verify that your parameters are correct."
+                )
+
     async def save(
         self,
         audio_fname: Union[str, bytes],