@@ -304,68 +304,40 @@ def __init__(
304
304
async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
    """Streams audio and metadata from the service.

    Opens a single websocket connection, sends the speech configuration
    followed by one SSML request per text chunk, and then yields one dict
    per item received from the service:

    * ``{"type": "audio", "data": <bytes>}`` for every binary frame
      (the payload after the 2-byte length prefix and the header).
    * ``{"type": "WordBoundary", "offset": ..., "duration": ...,
      "text": ...}`` for every word-boundary entry in an
      ``audio.metadata`` message.

    Raises:
        UnknownResponse: A text message has an unrecognized ``Path`` or a
            metadata entry has an unrecognized ``Type``.
        UnexpectedResponse: A binary message is too short to contain the
            header length prefix or the header it announces.
        WebSocketError: The websocket reported an error message.
        NoAudioReceived: The receive loop finished without any binary
            frame that carried a header.
    """

    async def send_request(websocket: aiohttp.ClientWebSocketResponse) -> None:
        """Sends the speech config and every SSML chunk to the service."""

        # Each message needs to have the proper date.
        date = date_to_string()

        # Prepare the request to be sent to the service.
        #
        # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually
        # supposed to be booleans, but Edge Browser seems to send them as
        # strings. They are sent as booleans here unless that turns out to
        # cause problems.
        #
        # Only the first fragment is an f-string; the JSON fragments are
        # plain literals, so their braces must NOT be doubled.
        await websocket.send_str(
            f"X-Timestamp:{date}\r\n"
            "Content-Type:application/json; charset=utf-8\r\n"
            "Path:speech.config\r\n\r\n"
            '{"context":{"synthesis":{"audio":{"metadataoptions":{'
            '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
            '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
            "}}}}\r\n"
        )

        # Split the text into multiple strings if it is too long for the service.
        texts = split_text_by_byte_length(
            escape(remove_incompatible_characters(self.text)),
            calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
        )

        # Send one SSML request per chunk over the same connection.
        for text in texts:
            await websocket.send_str(
                ssml_headers_plus_data(
                    connect_id(),
                    # NOTE(review): the two arguments below sit in lines that
                    # are elided between the diff hunks; they are
                    # reconstructed here - confirm against the real file.
                    date,
                    mkssml(text, self.voice, self.rate, self.volume, self.pitch),
                )
            )

    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
    async with aiohttp.ClientSession(
        trust_env=True,
    ) as session, session.ws_connect(
        f"{WSS_URL}&ConnectionId={connect_id()}",
        compress=15,
        autoclose=True,
        autoping=True,
        proxy=self.proxy,
        headers={
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
        },
        ssl=ssl_ctx,
    ) as websocket:
        # audio_was_received indicates whether we have received audio data
        # from the websocket. This is so we can raise an exception if we
        # don't receive any audio data. It is sticky: once set it is never
        # cleared by a later frame.
        audio_was_received = False

        # Send the request to the service.
        await send_request(websocket)

        # NOTE(review): "turn.end" no longer breaks out of this loop, so it
        # only finishes when iteration over the websocket ends - confirm the
        # service closes the connection after the last turn.
        async for received in websocket:
            if received.type == aiohttp.WSMsgType.TEXT:
                parameters, data = get_headers_and_data(received.data)
                path = parameters.get(b"Path")
                if path == b"audio.metadata":
                    for meta_obj in json.loads(data)["Metadata"]:
                        meta_type = meta_obj["Type"]
                        if meta_type == "WordBoundary":
                            # NOTE(review): offsets are forwarded exactly as
                            # received; no per-chunk shift is applied here.
                            yield {
                                "type": meta_type,
                                "offset": meta_obj["Data"]["Offset"],
                                "duration": meta_obj["Data"]["Duration"],
                                "text": meta_obj["Data"]["text"]["Text"],
                            }
                        elif meta_type == "SessionEnd":
                            continue
                        else:
                            raise UnknownResponse(
                                f"Unknown metadata type: {meta_type}"
                            )
                elif path in (b"response", b"turn.start", b"turn.end"):
                    # Control messages that need no action.
                    pass
                else:
                    raise UnknownResponse(
                        "The response from the service is not recognized.\n"
                        + received.data
                    )
            elif received.type == aiohttp.WSMsgType.BINARY:
                if len(received.data) < 2:
                    raise UnexpectedResponse(
                        "We received a binary message, but it is missing the header length."
                    )

                # The first two bytes are the big-endian length of the header.
                # See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
                header_length = int.from_bytes(received.data[:2], "big")
                if len(received.data) < header_length + 2:
                    raise UnexpectedResponse(
                        "We received a binary message, but it is missing the audio data."
                    )

                # Only ever flip the flag to True; assigning the comparison
                # result directly would let a later header-less frame erase
                # the fact that audio was already delivered.
                if header_length > 0:
                    audio_was_received = True
                yield {
                    "type": "audio",
                    "data": received.data[header_length + 2 :],
                }
            elif received.type == aiohttp.WSMsgType.ERROR:
                raise WebSocketError(
                    received.data if received.data else "Unknown error"
                )

        if not audio_was_received:
            raise NoAudioReceived(
                "No audio was received. Please verify that your parameters are correct."
            )
431
+
457
432
async def save (
458
433
self ,
459
434
audio_fname : Union [str , bytes ],
0 commit comments