removed built-in audio players, split for websocket and rtc
eavanvalkenburg committed Jan 31, 2025
1 parent 6d117be commit 9268d10
Showing 15 changed files with 689 additions and 743 deletions.
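At a glance, the migration for sample code is an import swap: the built-in players are removed from the connector utils, and per-protocol helpers now live next to the samples. A before/after sketch, using only names that appear in the diffs below:

# before (removed in this commit)
from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack

# after: per-protocol sample utilities
from samples.concepts.realtime.utils import (
    AudioPlayerWebRTC,       # WebRTC playback
    AudioRecorderWebRTC,     # WebRTC microphone track
    AudioPlayerWebsocket,    # websocket playback
    AudioRecorderWebsocket,  # websocket microphone
)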
346 changes: 0 additions & 346 deletions docs/decisions/00XX-realtime-api-clients.md

This file was deleted.

11 changes: 0 additions & 11 deletions python/samples/concepts/audio/utils.py

This file was deleted.

@@ -3,16 +3,17 @@
import asyncio
import logging

-from samples.concepts.audio.utils import check_audio_devices
+from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
    ListenEvents,
    OpenAIRealtime,
    OpenAIRealtimeExecutionSettings,
    TurnDetection,
)
-from semantic_kernel.connectors.ai.utils import SKAudioPlayer

logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
@@ -43,7 +44,12 @@ async def main() -> None:
    # create the realtime client and optionally add the audio output function
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they behave the same way, even though the underlying protocols are quite different
-    realtime_client = OpenAIRealtime("webrtc")
+    audio_player = AudioPlayerWebRTC()
+    realtime_client = OpenAIRealtime(
+        "webrtc",
+        audio_output_callback=audio_player.client_callback,
+        audio_track=AudioRecorderWebRTC(),
+    )
    # Create the settings for the session
    settings = OpenAIRealtimeExecutionSettings(
        instructions="""
@@ -58,15 +64,15 @@
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
    )
    # the context manager calls the create_session method on the client and starts listening to the audio stream
-    audio_player = SKAudioPlayer()

    print("Mosscap (transcript): ", end="")
    async with realtime_client, audio_player:
        await realtime_client.update_session(settings=settings, create_response=True)

        async for event in realtime_client.receive():
            match event.event_type:
-                case "audio":
-                    await audio_player.add_audio(event.audio)
+                # case "audio":
+                #     await audio_player.add_audio(event.audio)
                case "text":
                    print(event.text.text, end="")
                case "service":
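Worth contrasting before the next file: the new clients support two output paths, both visible in the diff above. This sketch is assembled only from names shown in the diffs (AudioPlayerWebRTC, client_callback, add_audio); treat it as a sketch of the pattern, not the exact sample code.

# option 1: push audio straight to the player as it arrives (low latency)
audio_player = AudioPlayerWebRTC()
realtime_client = OpenAIRealtime(
    "webrtc",
    audio_output_callback=audio_player.client_callback,
    audio_track=AudioRecorderWebRTC(),
)

# option 2: pull audio out of the event stream instead; simpler to follow,
# but each chunk takes an extra hop through the receive() loop
async with realtime_client, audio_player:
    async for event in realtime_client.receive():
        if event.event_type == "audio":
            await audio_player.add_audio(event.audio)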
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device IDs are set in the AudioRecorderWebsocket and AudioPlayerWebsocket classes,
# so you may need to adjust them for your system.
# You can check the available devices by running the function below.
check_audio_devices()


async def main() -> None:
    # create the realtime client and optionally add the audio output function
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they behave the same way, even though the underlying protocols are quite different
    audio_player = AudioPlayerWebsocket()
    realtime_client = OpenAIRealtime(
        "websocket",
        audio_output_callback=audio_player.client_callback,
    )
    audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
    # Create the settings for the session
    settings = OpenAIRealtimeExecutionSettings(
        instructions="""
        You are a chat bot. Your name is Mosscap and
        you have one goal: figure out what people need.
        Your full name, should you need to know it, is
        Splendid Speckled Mosscap. You communicate
        effectively, but you tend to answer with long
        flowery prose.
        """,
        voice="shimmer",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
    )
    # the context manager calls the create_session method on the client and starts listening to the audio stream
    print("Mosscap (transcript): ", end="")

    async with realtime_client, audio_player, audio_recorder:
        await realtime_client.update_session(settings=settings, create_response=True)

        async for event in realtime_client.receive():
            match event.event_type:
                # this can be used as an alternative to the callback function used above,
                # but the callback is faster and smoother
                # case "audio":
                #     await audio_player.add_audio(event.audio)
                case "text":
                    print(event.text.text, end="")
                case "service":
                    # OpenAI-specific events
                    if event.service_type == ListenEvents.SESSION_UPDATED:
                        print("Session updated")
                    if event.service_type == ListenEvents.RESPONSE_CREATED:
                        print("")
                    if event.service_type == ListenEvents.ERROR:
                        logger.error(event.event)


if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
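The device-related comments above reference a device_id parameter on the recorder and player helpers. check_audio_devices() is not shown in this diff; presumably it wraps sounddevice's device listing, along the lines of this minimal sketch (the explicit indices are hypothetical):

import sounddevice as sd

def list_audio_devices() -> None:
    # print an indexed table of the input/output devices PortAudio can see
    print(sd.query_devices())

# pick indices from the table and pass them to the helpers, e.g.:
# audio_player = AudioPlayerWebsocket(device_id=3)  # hypothetical index
# audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client, device_id=1)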
@@ -5,7 +5,7 @@
from datetime import datetime
from random import randint

-from samples.concepts.audio.utils import check_audio_devices
+from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
@@ -14,11 +14,12 @@
    OpenAIRealtimeExecutionSettings,
    TurnDetection,
)
-from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack
from semantic_kernel.contents import ChatHistory
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
@@ -78,15 +79,15 @@ async def main() -> None:

    # create the audio player and audio track
    # both take a device_id parameter, which is the index of the device to use; if None, the default device is used
-    audio_player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1)
-    audio_track = SKAudioTrack()
+    audio_player = AudioPlayerWebRTC()
+    audio_track = AudioRecorderWebRTC()
    # create the realtime client and optionally add the audio output function
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they behave the same way, even though the underlying protocols are quite different
    realtime_client = OpenAIRealtime(
-        protocol="websocket",
+        protocol="webrtc",
        audio_output_callback=audio_player.client_callback,
-        # audio_track=audio_track,
+        audio_track=audio_track,
    )

    # Create the settings for the session
@@ -116,7 +117,7 @@ async def main() -> None:
    chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # the context manager calls the create_session method on the client and starts listening to the audio stream
-    async with realtime_client, audio_player, audio_track.stream_to_realtime_client(realtime_client):
+    async with realtime_client, audio_player:
        await realtime_client.update_session(
            settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
        )
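The collapsed portion of this sample is where the kernel gets its plugins; the datetime and randint imports at the top hint at simple tool functions. A hedged sketch of the shape such a function takes (the weather specifics are illustrative, not the sample's actual code):

from semantic_kernel import Kernel
from semantic_kernel.functions import kernel_function

class HelperPlugin:
    @kernel_function(name="get_weather", description="Get the weather for a city.")
    def get_weather(self, city: str) -> str:
        # illustrative stub; a real plugin would query a weather service
        return f"The weather in {city} is sunny."

kernel = Kernel()
kernel.add_plugin(HelperPlugin(), plugin_name="helpers")
# with FunctionChoiceBehavior.Auto() in the settings, the realtime model can
# call get_weather mid-conversation and speak the result back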
