Created
October 28, 2025 21:30
-
-
Save amosgyamfi/e6e8273c2c352648bff6d6c7f8d510f8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import asyncio | |
| import logging | |
| from uuid import uuid4 | |
| from dotenv import load_dotenv | |
| from vision_agents.core.edge.types import User | |
| from vision_agents.core.agents import Agent | |
| from vision_agents.plugins import cartesia, getstream, deepgram, smart_turn, gemini | |
# Load environment configuration via python-dotenv before any plugin is built.
load_dotenv()

# Configure the root logger at INFO level. The format string is split across
# lines purely for readability; the concatenated value is unchanged.
# NOTE(review): %(call_id)s is not a standard LogRecord attribute -- presumably
# vision_agents injects it into every record; confirm, otherwise records that
# lack the field will fail to format.
logging.basicConfig(
    level=logging.INFO,
    format=(
        "%(asctime)s %(levelname)s "
        "[call_id=%(call_id)s] "
        "%(name)s: %(message)s"
    ),
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
async def start_agent() -> None:
    """Run a demo voice agent that speaks through Cartesia TTS.

    The agent is assembled from:
    - Cartesia for text-to-speech (TTS)
    - Deepgram for speech-to-text (STT)
    - GetStream for edge/real-time communication
    - Smart Turn for turn detection
    - Gemini ("gemini-2.0-flash") as the language model

    Flow: create the agent's user, create a call of type "default" with a
    random UUID as its id, open the demo for that call, join it, wait five
    seconds, send a greeting through the LLM, then await ``agent.finish()``.

    Requirements:
    - CARTESIA_API_KEY environment variable
    - DEEPGRAM_API_KEY environment variable
    - STREAM_API_KEY and STREAM_API_SECRET environment variables
    - Credentials for the Gemini plugin (presumably GOOGLE_API_KEY --
      confirm against the plugin documentation)
    """
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Friendly AI", id="agent"),
        instructions="You're a voice AI assistant. Short replies only, no special characters. Respond only in English using Sarah's voice.",
        # Cartesia TTS with the plugin's default settings. The greeting below
        # says "Sonic 3"; NOTE(review): confirm that is the default model.
        tts=cartesia.TTS(),
        stt=deepgram.STT(),
        llm=gemini.LLM("gemini-2.0-flash"),
        turn_detection=smart_turn.TurnDetection(buffer_duration=2.0, confidence_threshold=0.5),
    )

    await agent.create_user()

    # A fresh UUID per run gives every launch its own call id.
    call = agent.edge.client.video.call("default", str(uuid4()))
    await agent.edge.open_demo(call)

    # NOTE(review): the original indentation was lost; the statements below
    # are assumed to all belong inside the ``with`` body -- confirm.
    with await agent.join(call):
        # Pause before greeting; presumably gives a participant time to
        # connect -- verify the intended delay.
        await asyncio.sleep(5)
        # The agent greets the user; the reply is voiced by Cartesia TTS.
        await agent.llm.simple_response(text="Hello! I'm using Cartesia Sonic 3 for text-to-speech. How can I help you today?")
        await agent.finish()
# Script entry point: drive the async demo to completion on a new event loop
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(start_agent())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment