removed built-in audio players, split for websocket and rtc
eavanvalkenburg committed Jan 31, 2025
1 parent 6d117be commit 9268d10
Showing 15 changed files with 689 additions and 743 deletions.
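At a glance, the migration for sample code is an import swap: the built-in players are removed from the connector utils, and per-protocol helpers now live next to the samples. A before/after sketch, using only names that appear in the diffs below:

# before (removed in this commit)
from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack

# after: per-protocol sample utilities
from samples.concepts.realtime.utils import (
    AudioPlayerWebRTC,       # WebRTC playback
    AudioRecorderWebRTC,     # WebRTC microphone track
    AudioPlayerWebsocket,    # websocket playback
    AudioRecorderWebsocket,  # websocket microphone
)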
346 changes: 0 additions & 346 deletions docs/decisions/00XX-realtime-api-clients.md

This file was deleted.

11 changes: 0 additions & 11 deletions python/samples/concepts/audio/utils.py

This file was deleted.

@@ -3,16 +3,17 @@
import asyncio
import logging

-from samples.concepts.audio.utils import check_audio_devices
+from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
    ListenEvents,
    OpenAIRealtime,
    OpenAIRealtimeExecutionSettings,
    TurnDetection,
)
-from semantic_kernel.connectors.ai.utils import SKAudioPlayer

logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
@@ -43,7 +44,12 @@ async def main() -> None:
    # create the realtime client and optionally add the audio output function
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they behave the same way, even though the underlying protocols are quite different
-    realtime_client = OpenAIRealtime("webrtc")
+    audio_player = AudioPlayerWebRTC()
+    realtime_client = OpenAIRealtime(
+        "webrtc",
+        audio_output_callback=audio_player.client_callback,
+        audio_track=AudioRecorderWebRTC(),
+    )
    # Create the settings for the session
    settings = OpenAIRealtimeExecutionSettings(
        instructions="""
@@ -58,15 +64,15 @@
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
    )
    # the context manager calls the create_session method on the client and starts listening to the audio stream
-    audio_player = SKAudioPlayer()

    print("Mosscap (transcript): ", end="")
    async with realtime_client, audio_player:
        await realtime_client.update_session(settings=settings, create_response=True)

        async for event in realtime_client.receive():
            match event.event_type:
-                case "audio":
-                    await audio_player.add_audio(event.audio)
+                # case "audio":
+                #     await audio_player.add_audio(event.audio)
                case "text":
                    print(event.text.text, end="")
                case "service":
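Worth contrasting before the next file: the new clients support two output paths, both visible in the diff above. This sketch is assembled only from names shown in the diffs (AudioPlayerWebRTC, client_callback, add_audio); treat it as a sketch of the pattern, not the exact sample code.

# option 1: push audio straight to the player as it arrives (low latency)
audio_player = AudioPlayerWebRTC()
realtime_client = OpenAIRealtime(
    "webrtc",
    audio_output_callback=audio_player.client_callback,
    audio_track=AudioRecorderWebRTC(),
)

# option 2: pull audio out of the event stream instead; simpler to follow,
# but each chunk takes an extra hop through the receive() loop
async with realtime_client, audio_player:
    async for event in realtime_client.receive():
        if event.event_type == "audio":
            await audio_player.add_audio(event.audio)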
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device IDs are set in the AudioRecorderWebsocket and AudioPlayerWebsocket classes,
# so you may need to adjust them for your system.
# You can check the available devices by running the function below.
check_audio_devices()


async def main() -> None:
    # create the realtime client and optionally add the audio output function
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they behave the same way, even though the underlying protocols are quite different
    audio_player = AudioPlayerWebsocket()
    realtime_client = OpenAIRealtime(
        "websocket",
        audio_output_callback=audio_player.client_callback,
    )
    audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
    # Create the settings for the session
    settings = OpenAIRealtimeExecutionSettings(
        instructions="""
        You are a chat bot. Your name is Mosscap and
        you have one goal: figure out what people need.
        Your full name, should you need to know it, is
        Splendid Speckled Mosscap. You communicate
        effectively, but you tend to answer with long
        flowery prose.
        """,
        voice="shimmer",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
    )
    # the context manager calls the create_session method on the client and starts listening to the audio stream
    print("Mosscap (transcript): ", end="")

    async with realtime_client, audio_player, audio_recorder:
        await realtime_client.update_session(settings=settings, create_response=True)

        async for event in realtime_client.receive():
            match event.event_type:
                # this can be used as an alternative to the callback function used above,
                # but the callback is faster and smoother
                # case "audio":
                #     await audio_player.add_audio(event.audio)
                case "text":
                    print(event.text.text, end="")
                case "service":
                    # OpenAI-specific events
                    if event.service_type == ListenEvents.SESSION_UPDATED:
                        print("Session updated")
                    if event.service_type == ListenEvents.RESPONSE_CREATED:
                        print("")
                    if event.service_type == ListenEvents.ERROR:
                        logger.error(event.event)


if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
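The device-related comments above reference a device_id parameter on the recorder and player helpers. check_audio_devices() is not shown in this diff; presumably it wraps sounddevice's device listing, along the lines of this minimal sketch (the explicit indices are hypothetical):

import sounddevice as sd

def list_audio_devices() -> None:
    # print an indexed table of the input/output devices PortAudio can see
    print(sd.query_devices())

# pick indices from the table and pass them to the helpers, e.g.:
# audio_player = AudioPlayerWebsocket(device_id=3)  # hypothetical index
# audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client, device_id=1)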
@@ -5,7 +5,7 @@
from datetime import datetime
from random import randint

-from samples.concepts.audio.utils import check_audio_devices
+from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
@@ -14,11 +14,12 @@
    OpenAIRealtimeExecutionSettings,
    TurnDetection,
)
-from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack
from semantic_kernel.contents import ChatHistory
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
@@ -78,15 +79,15 @@ async def main() -> None:

    # create the audio player and audio track
    # both take a device_id parameter, which is the index of the device to use; if None, the default device is used
-    audio_player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1)
-    audio_track = SKAudioTrack()
+    audio_player = AudioPlayerWebRTC()
+    audio_track = AudioRecorderWebRTC()
    # create the realtime client and optionally add the audio output function
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they behave the same way, even though the underlying protocols are quite different
    realtime_client = OpenAIRealtime(
-        protocol="websocket",
+        protocol="webrtc",
        audio_output_callback=audio_player.client_callback,
-        # audio_track=audio_track,
+        audio_track=audio_track,
    )

    # Create the settings for the session
@@ -116,7 +117,7 @@ async def main() -> None:
    chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # the context manager calls the create_session method on the client and starts listening to the audio stream
-    async with realtime_client, audio_player, audio_track.stream_to_realtime_client(realtime_client):
+    async with realtime_client, audio_player:
        await realtime_client.update_session(
            settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
        )
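The collapsed portion of this sample is where the kernel gets its plugins; the datetime and randint imports at the top hint at simple tool functions. A hedged sketch of the shape such a function takes (the weather specifics are illustrative, not the sample's actual code):

from semantic_kernel import Kernel
from semantic_kernel.functions import kernel_function

class HelperPlugin:
    @kernel_function(name="get_weather", description="Get the weather for a city.")
    def get_weather(self, city: str) -> str:
        # illustrative stub; a real plugin would query a weather service
        return f"The weather in {city} is sunny."

kernel = Kernel()
kernel.add_plugin(HelperPlugin(), plugin_name="helpers")
# with FunctionChoiceBehavior.Auto() in the settings, the realtime model can
# call get_weather mid-conversation and speak the result back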
