Merge pull request #280 from pipecat-ai/aleix/library-updates-070224

library updates 070224 and pipecat 0.0.36
pipecat-ai · Jul 2, 2024 · 065cfb2
2 parents 8f6db5e + 3147534
commit 065cfb2
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to **pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [0.0.36] - 2024-07-02
 
 ### Added
 
@@ -61,6 +61,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Other
 
+- Added Fly.io deployment example in `examples/deployment/flyio-example`.
+
 - Added new `17-detect-user-idle.py` example that shows how to use the new
   `UserIdleProcessor`.
 

diff --git a/examples/foundational/06a-image-sync.py b/examples/foundational/06a-image-sync.py
@@ -67,11 +67,12 @@ async def main(room_url: str, token):
             "Respond bot",
             DailyParams(
                 audio_out_enabled=True,
+                camera_out_enabled=True,
                 camera_out_width=1024,
                 camera_out_height=1024,
                 transcription_enabled=True,
                 vad_enabled=True,
-                vad_analyzer=SileroVADAnalyzer()
+                vad_analyzer=SileroVADAnalyzer(),
             )
         )
 
@@ -116,7 +117,7 @@ async def main(room_url: str, token):
         async def on_first_participant_joined(transport, participant):
             participant_name = participant["info"]["userName"] or ''
             transport.capture_participant_transcription(participant["id"])
-            await task.queue_frames([TextFrame(f"Hi, this is {participant_name}.")])
+            await task.queue_frames([TextFrame(f"Hi there {participant_name}!")])
 
         runner = PipelineRunner()
 

diff --git a/linux-py3.10-requirements.txt b/linux-py3.10-requirements.txt
@@ -17,7 +17,7 @@ aiosignal==1.3.1
     # via aiohttp
 annotated-types==0.7.0
     # via pydantic
-anthropic==0.25.9
+anthropic==0.28.1
     # via
     #   openpipe
     #   pipecat-ai (pyproject.toml)
@@ -38,7 +38,7 @@ attrs==23.2.0
     #   openpipe
 av==12.2.0
     # via faster-whisper
-azure-cognitiveservices-speech==1.37.0
+azure-cognitiveservices-speech==1.38.0
     # via pipecat-ai (pyproject.toml)
 blinker==1.8.2
     # via flask
@@ -117,7 +117,7 @@ fsspec==2024.6.1
     #   torch
 future==1.0.0
     # via pyloudnorm
-google-ai-generativelanguage==0.6.4
+google-ai-generativelanguage==0.6.6
     # via google-generativeai
 google-api-core[grpc]==2.19.1
     # via
@@ -135,7 +135,7 @@ google-auth==2.31.0
     #   google-generativeai
 google-auth-httplib2==0.2.0
     # via google-api-python-client
-google-generativeai==0.5.4
+google-generativeai==0.7.1
     # via pipecat-ai (pyproject.toml)
 googleapis-common-protos==1.63.2
     # via
@@ -197,6 +197,8 @@ jinja2==3.1.4
     #   fastapi
     #   flask
     #   torch
+jiter==0.5.0
+    # via anthropic
 jsonpatch==1.33
     # via langchain-core
 jsonpointer==3.0.0
@@ -217,7 +219,7 @@ langchain-openai==0.1.10
     # via pipecat-ai (pyproject.toml)
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.82
+langsmith==0.1.83
     # via
     #   langchain
     #   langchain-community
@@ -294,12 +296,12 @@ nvidia-nvtx-cu12==12.1.105
     # via torch
 onnxruntime==1.18.1
     # via faster-whisper
-openai==1.26.0
+openai==1.27.0
     # via
     #   langchain-openai
     #   openpipe
     #   pipecat-ai (pyproject.toml)
-openpipe==4.14.0
+openpipe==4.16.0
     # via pipecat-ai (pyproject.toml)
 orjson==3.10.5
     # via

diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt
@@ -17,7 +17,7 @@ aiosignal==1.3.1
     # via aiohttp
 annotated-types==0.7.0
     # via pydantic
-anthropic==0.25.9
+anthropic==0.28.1
     # via
     #   openpipe
     #   pipecat-ai (pyproject.toml)
@@ -38,7 +38,7 @@ attrs==23.2.0
     #   openpipe
 av==12.2.0
     # via faster-whisper
-azure-cognitiveservices-speech==1.37.0
+azure-cognitiveservices-speech==1.38.0
     # via pipecat-ai (pyproject.toml)
 blinker==1.8.2
     # via flask
@@ -116,7 +116,7 @@ fsspec==2024.6.1
     #   torch
 future==1.0.0
     # via pyloudnorm
-google-ai-generativelanguage==0.6.4
+google-ai-generativelanguage==0.6.6
     # via google-generativeai
 google-api-core[grpc]==2.19.1
     # via
@@ -134,7 +134,7 @@ google-auth==2.31.0
     #   google-generativeai
 google-auth-httplib2==0.2.0
     # via google-api-python-client
-google-generativeai==0.5.4
+google-generativeai==0.7.1
     # via pipecat-ai (pyproject.toml)
 googleapis-common-protos==1.63.2
     # via
@@ -194,6 +194,8 @@ jinja2==3.1.4
     #   fastapi
     #   flask
     #   torch
+jiter==0.5.0
+    # via anthropic
 jsonpatch==1.33
     # via langchain-core
 jsonpointer==3.0.0
@@ -214,7 +216,7 @@ langchain-openai==0.1.10
     # via pipecat-ai (pyproject.toml)
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.82
+langsmith==0.1.83
     # via
     #   langchain
     #   langchain-community
@@ -260,12 +262,12 @@ numpy==1.26.4
     #   transformers
 onnxruntime==1.18.1
     # via faster-whisper
-openai==1.26.0
+openai==1.27.0
     # via
     #   langchain-openai
     #   openpipe
     #   pipecat-ai (pyproject.toml)
-openpipe==4.14.0
+openpipe==4.16.0
     # via pipecat-ai (pyproject.toml)
 orjson==3.10.5
     # via

diff --git a/pyproject.toml b/pyproject.toml
@@ -34,25 +34,25 @@ Source = "https://github.com/pipecat-ai/pipecat"
 Website = "https://pipecat.ai"
 
 [project.optional-dependencies]
-anthropic = [ "anthropic~=0.25.7" ]
-azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
-cartesia = [ "cartesia~=1.0.0" ]
+anthropic = [ "anthropic~=0.28.1" ]
+azure = [ "azure-cognitiveservices-speech~=1.38.0" ]
+cartesia = [ "cartesia~=1.0.3" ]
 daily = [ "daily-python~=0.10.1" ]
 deepgram = [ "deepgram-sdk~=3.2.7" ]
 examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
-fal = [ "fal-client~=0.4.0" ]
+fal = [ "fal-client~=0.4.1" ]
 gladia = [ "websockets~=12.0" ]
-google = [ "google-generativeai~=0.5.3" ]
-fireworks = [ "openai~=1.26.0" ]
-langchain = [ "langchain~=0.2.1", "langchain-community~=0.2.1", "langchain-openai~=0.1.8" ]
+google = [ "google-generativeai~=0.7.1" ]
+fireworks = [ "openai~=1.27.0" ]
+langchain = [ "langchain~=0.2.6", "langchain-community~=0.2.6", "langchain-openai~=0.1.10" ]
 local = [ "pyaudio~=0.2.0" ]
 moondream = [ "einops~=0.8.0", "timm~=0.9.16", "transformers~=4.40.2" ]
-openai = [ "openai~=1.26.0" ]
-openpipe = [ "openpipe~=4.14.0" ]
+openai = [ "openai~=1.27.0" ]
+openpipe = [ "openpipe~=4.16.0" ]
 playht = [ "pyht~=0.0.28" ]
-silero = [ "torch~=2.3.0", "torchaudio~=2.3.0" ]
+silero = [ "torch~=2.3.1", "torchaudio~=2.3.1" ]
 websocket = [ "websockets~=12.0", "fastapi~=0.111.0" ]
-whisper = [ "faster-whisper~=1.0.2" ]
+whisper = [ "faster-whisper~=1.0.3" ]
 xtts = [ "resampy~=0.4.3" ]
 
 [tool.setuptools.packages.find]

diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
@@ -19,12 +19,11 @@
     ErrorFrame,
     Frame,
     StartFrame,
-    StartInterruptionFrame,
     SystemFrame,
     TranscriptionFrame,
     URLImageRawFrame)
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.ai_services import AIService, AsyncAIService, TTSService, ImageGenService
+from pipecat.services.ai_services import AsyncAIService, TTSService, ImageGenService
 from pipecat.services.openai import BaseOpenAILLMService
 
 from loguru import logger
@@ -83,7 +82,7 @@ def can_generate_metrics(self) -> bool:
         return True
 
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        logger.debug(f"Generating TTS: {text}")
+        logger.debug(f"Generating TTS: [{text}]")
 
         await self.start_ttfb_metrics()
 
@@ -148,9 +147,11 @@ async def start(self, frame: StartFrame):
 
     async def stop(self, frame: EndFrame):
         self._speech_recognizer.stop_continuous_recognition_async()
+        self._audio_stream.close()
 
     async def cancel(self, frame: CancelFrame):
         self._speech_recognizer.stop_continuous_recognition_async()
+        self._audio_stream.close()
 
     def _on_handle_recognized(self, event):
         if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0:

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
@@ -8,7 +8,7 @@
 
 from typing import AsyncGenerator
 
-from pipecat.frames.frames import AudioRawFrame, Frame
+from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, Frame, StartFrame
 from pipecat.services.ai_services import TTSService
 
 from loguru import logger
@@ -28,21 +28,32 @@ def __init__(
         super().__init__(**kwargs)
 
         self._api_key = api_key
+        self._voice_id = voice_id
         self._model_id = model_id
         self._output_format = {
             "container": "raw",
             "encoding": encoding,
             "sample_rate": sample_rate,
         }
+        self._client = None
 
+    def can_generate_metrics(self) -> bool:
+        return True
+
+    async def start(self, frame: StartFrame):
         try:
             self._client = AsyncCartesia(api_key=self._api_key)
-            self._voice = self._client.voices.get(id=voice_id)
+            self._voice = self._client.voices.get(id=self._voice_id)
         except Exception as e:
             logger.exception(f"{self} initialization error: {e}")
 
-    def can_generate_metrics(self) -> bool:
-        return True
+    async def stop(self, frame: EndFrame):
+        if self._client:
+            await self._client.close()
+
+    async def cancel(self, frame: CancelFrame):
+        if self._client:
+            await self._client.close()
 
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")