Skip to content

Commit 755e543

Browse files
committed
Refactor communicate for better readability
Also improve performance on larger documents.

Signed-off-by: rany2 <[email protected]>
1 parent df6bac8 commit 755e543

File tree

1 file changed

+111
-136
lines changed

1 file changed

+111
-136
lines changed

src/edge_tts/communicate.py

+111-136
Original file line numberDiff line numberDiff line change
@@ -304,68 +304,40 @@ def __init__(
304304
async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
305305
"""Streams audio and metadata from the service."""
306306

307-
texts = split_text_by_byte_length(
308-
escape(remove_incompatible_characters(self.text)),
309-
calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
310-
)
311-
final_utterance: Dict[int, int] = {}
312-
prev_idx = -1
313-
shift_time = -1
307+
async def send_request(websocket: aiohttp.ClientWebSocketResponse) -> None:
308+
"""Sends the request to the service."""
309+
310+
# Each message needs to have the proper date.
311+
date = date_to_string()
312+
313+
# Prepare the request to be sent to the service.
314+
#
315+
# Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
316+
# to be booleans, but Edge Browser seems to send them as strings.
317+
#
318+
# This is a bug in Edge as Azure Cognitive Services actually sends them as
319+
# bool and not string. For now I will send them as bool unless it causes
320+
# any problems.
321+
#
322+
# Also pay close attention to double { } in request (escape for f-string).
323+
await websocket.send_str(
324+
f"X-Timestamp:{date}\r\n"
325+
"Content-Type:application/json; charset=utf-8\r\n"
326+
"Path:speech.config\r\n\r\n"
327+
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
328+
'"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
329+
'"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
330+
"}}}}\r\n"
331+
)
314332

315-
ssl_ctx = ssl.create_default_context(cafile=certifi.where())
316-
for idx, text in enumerate(texts):
317-
async with aiohttp.ClientSession(
318-
trust_env=True,
319-
) as session, session.ws_connect(
320-
f"{WSS_URL}&ConnectionId={connect_id()}",
321-
compress=15,
322-
autoclose=True,
323-
autoping=True,
324-
proxy=self.proxy,
325-
headers={
326-
"Pragma": "no-cache",
327-
"Cache-Control": "no-cache",
328-
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
329-
"Accept-Encoding": "gzip, deflate, br",
330-
"Accept-Language": "en-US,en;q=0.9",
331-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
332-
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
333-
},
334-
ssl=ssl_ctx,
335-
) as websocket:
336-
# download indicates whether we should be expecting audio data,
337-
# this is so what we avoid getting binary data from the websocket
338-
# and falsely thinking it's audio data.
339-
download_audio = False
340-
341-
# audio_was_received indicates whether we have received audio data
342-
# from the websocket. This is so we can raise an exception if we
343-
# don't receive any audio data.
344-
audio_was_received = False
345-
346-
# Each message needs to have the proper date.
347-
date = date_to_string()
348-
349-
# Prepare the request to be sent to the service.
350-
#
351-
# Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
352-
# to be booleans, but Edge Browser seems to send them as strings.
353-
#
354-
# This is a bug in Edge as Azure Cognitive Services actually sends them as
355-
# bool and not string. For now I will send them as bool unless it causes
356-
# any problems.
357-
#
358-
# Also pay close attention to double { } in request (escape for f-string).
359-
await websocket.send_str(
360-
f"X-Timestamp:{date}\r\n"
361-
"Content-Type:application/json; charset=utf-8\r\n"
362-
"Path:speech.config\r\n\r\n"
363-
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
364-
'"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
365-
'"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
366-
"}}}}\r\n"
367-
)
333+
# Split the text into multiple strings if it is too long for the service.
334+
texts = split_text_by_byte_length(
335+
escape(remove_incompatible_characters(self.text)),
336+
calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
337+
)
368338

339+
# Send the request to the service.
340+
for text in texts:
369341
await websocket.send_str(
370342
ssml_headers_plus_data(
371343
connect_id(),
@@ -374,86 +346,89 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
374346
)
375347
)
376348

377-
async for received in websocket:
378-
if received.type == aiohttp.WSMsgType.TEXT:
379-
parameters, data = get_headers_and_data(received.data)
380-
path = parameters.get(b"Path")
381-
if path == b"turn.start":
382-
download_audio = True
383-
elif path == b"turn.end":
384-
download_audio = False
385-
break # End of audio data
386-
elif path == b"audio.metadata":
387-
for meta_obj in json.loads(data)["Metadata"]:
388-
meta_type = meta_obj["Type"]
389-
if idx != prev_idx:
390-
shift_time = sum(
391-
final_utterance[i] for i in range(idx)
392-
)
393-
prev_idx = idx
394-
if meta_type == "WordBoundary":
395-
final_utterance[idx] = (
396-
meta_obj["Data"]["Offset"]
397-
+ meta_obj["Data"]["Duration"]
398-
# Average padding added by the service
399-
# Alternatively we could use ffmpeg to get value properly
400-
# but I don't want to add an additional dependency
401-
# if this is found to work well enough.
402-
+ 8_750_000
403-
)
404-
yield {
405-
"type": meta_type,
406-
"offset": meta_obj["Data"]["Offset"]
407-
+ shift_time,
408-
"duration": meta_obj["Data"]["Duration"],
409-
"text": meta_obj["Data"]["text"]["Text"],
410-
}
411-
elif meta_type == "SessionEnd":
412-
continue
413-
else:
414-
raise UnknownResponse(
415-
f"Unknown metadata type: {meta_type}"
416-
)
417-
elif path == b"response":
418-
pass
419-
else:
420-
raise UnknownResponse(
421-
"The response from the service is not recognized.\n"
422-
+ received.data
423-
)
424-
elif received.type == aiohttp.WSMsgType.BINARY:
425-
if not download_audio:
426-
raise UnexpectedResponse(
427-
"We received a binary message, but we are not expecting one."
428-
)
429-
430-
if len(received.data) < 2:
431-
raise UnexpectedResponse(
432-
"We received a binary message, but it is missing the header length."
433-
)
434-
435-
# See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
436-
header_length = int.from_bytes(received.data[:2], "big")
437-
if len(received.data) < header_length + 2:
438-
raise UnexpectedResponse(
439-
"We received a binary message, but it is missing the audio data."
440-
)
441-
442-
yield {
443-
"type": "audio",
444-
"data": received.data[header_length + 2 :],
445-
}
446-
audio_was_received = True
447-
elif received.type == aiohttp.WSMsgType.ERROR:
448-
raise WebSocketError(
449-
received.data if received.data else "Unknown error"
349+
ssl_ctx = ssl.create_default_context(cafile=certifi.where())
350+
async with aiohttp.ClientSession(
351+
trust_env=True,
352+
) as session, session.ws_connect(
353+
f"{WSS_URL}&ConnectionId={connect_id()}",
354+
compress=15,
355+
autoclose=True,
356+
autoping=True,
357+
proxy=self.proxy,
358+
headers={
359+
"Pragma": "no-cache",
360+
"Cache-Control": "no-cache",
361+
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
362+
"Accept-Encoding": "gzip, deflate, br",
363+
"Accept-Language": "en-US,en;q=0.9",
364+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
365+
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
366+
},
367+
ssl=ssl_ctx,
368+
) as websocket:
369+
# audio_was_received indicates whether we have received audio data
370+
# from the websocket. This is so we can raise an exception if we
371+
# don't receive any audio data.
372+
audio_was_received = False
373+
374+
# Send the request to the service.
375+
await send_request(websocket)
376+
377+
async for received in websocket:
378+
if received.type == aiohttp.WSMsgType.TEXT:
379+
parameters, data = get_headers_and_data(received.data)
380+
path = parameters.get(b"Path")
381+
if path == b"audio.metadata":
382+
for meta_obj in json.loads(data)["Metadata"]:
383+
meta_type = meta_obj["Type"]
384+
if meta_type == "WordBoundary":
385+
yield {
386+
"type": meta_type,
387+
"offset": meta_obj["Data"]["Offset"],
388+
"duration": meta_obj["Data"]["Duration"],
389+
"text": meta_obj["Data"]["text"]["Text"],
390+
}
391+
elif meta_type in ("SessionEnd",):
392+
continue
393+
else:
394+
raise UnknownResponse(
395+
f"Unknown metadata type: {meta_type}"
396+
)
397+
elif path in (b"response", b"turn.start", b"turn.end"):
398+
pass
399+
else:
400+
raise UnknownResponse(
401+
"The response from the service is not recognized.\n"
402+
+ received.data
403+
)
404+
elif received.type == aiohttp.WSMsgType.BINARY:
405+
if len(received.data) < 2:
406+
raise UnexpectedResponse(
407+
"We received a binary message, but it is missing the header length."
408+
)
409+
410+
# See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
411+
header_length = int.from_bytes(received.data[:2], "big")
412+
if len(received.data) < header_length + 2:
413+
raise UnexpectedResponse(
414+
"We received a binary message, but it is missing the audio data."
450415
)
451416

452-
if not audio_was_received:
453-
raise NoAudioReceived(
454-
"No audio was received. Please verify that your parameters are correct."
417+
audio_was_received = header_length > 0
418+
yield {
419+
"type": "audio",
420+
"data": received.data[header_length + 2 :],
421+
}
422+
elif received.type == aiohttp.WSMsgType.ERROR:
423+
raise WebSocketError(
424+
received.data if received.data else "Unknown error"
455425
)
456426

427+
if not audio_was_received:
428+
raise NoAudioReceived(
429+
"No audio was received. Please verify that your parameters are correct."
430+
)
431+
457432
async def save(
458433
self,
459434
audio_fname: Union[str, bytes],

0 commit comments

Comments (0)