
@philschmid
Last active April 10, 2025 15:30
import asyncio
import base64
import json
import os

import pyaudio
from websockets.asyncio.client import connect


class SimpleGeminiVoice:
    def __init__(self):
        self.audio_queue = asyncio.Queue()
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512
        self.RATE = 16000

    async def start(self):
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.capture_audio())
            tg.create_task(self.stream_audio())
            tg.create_task(self.play_response())

    async def capture_audio(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def stream_audio(self):
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                self.audio_queue.put_nowait(base64.b64decode(audio_data))
            except KeyError:
                pass
            try:
                turn_complete = response["serverContent"]["turnComplete"]
            except KeyError:
                pass
            else:
                if turn_complete:
                    # If you interrupt the model, it sends an end_of_turn. For interruptions
                    # to work, we need to empty out the audio queue.
                    print("\nEnd of turn")
                    while not self.audio_queue.empty():
                        self.audio_queue.get_nowait()

    async def play_response(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        while True:
            data = await self.audio_queue.get()
            await asyncio.to_thread(stream.write, data)


if __name__ == "__main__":
    client = SimpleGeminiVoice()
    asyncio.run(client.start())

# Simplified variant: no interruption handling; model audio is played back
# directly as it arrives instead of going through a queue.
import asyncio
import base64
import json
import os

import pyaudio
from websockets.asyncio.client import connect


class SimpleGeminiVoice:
    def __init__(self):
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512
        self.RATE = 16000

    async def start(self):
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.send_user_audio())
            tg.create_task(self.recv_model_audio())

    async def send_user_audio(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,  # must be an int, not the string "16000"
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def recv_model_audio(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                await asyncio.to_thread(stream.write, base64.b64decode(audio_data))
            except KeyError:
                pass


if __name__ == "__main__":
    client = SimpleGeminiVoice()
    asyncio.run(client.start())
@2187Nick

Here, for the system instruction, shouldn't the role be system rather than user? @2187Nick

I'm just winging it. I based it off of python-genai/google/genai/tests/live/live_test.py.
But now the responses have slowed way down. Basically unusable until they get it back running faster.
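
For reference, a minimal sketch of how a system instruction could be attached to the setup message, assuming the raw BidiGenerateContent setup payload accepts a system_instruction Content as the python-genai live tests suggest; the exact field names and the role value are assumptions, not a documented contract. This would replace the plain setup send inside start():

    # Hedged sketch: setup message carrying a system instruction. Whether the
    # "role" field matters here is unclear; the API appears to accept the
    # instruction either way.
    setup_msg = {
        "setup": {
            "model": f"models/{self.model}",
            "system_instruction": {
                "parts": [{"text": "You are a friendly voice assistant."}],
            },
        }
    }
    await self.ws.send(json.dumps(setup_msg))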

@paimonian

I would like to use it in a voice channel through discord.py, but I can't seem to get it to work.

@saharmor

I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground
You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk

@avinashgawali

I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk

Thank you

@KorowaLisa

And it doesn't speak Russian. It understands Russian, but answers in Chinese or something like that instead of Russian.

I think Russian will be added a bit later, as was the case with LearnLM. At first it also pretended not to know Russian.

It's just that everywhere they write it has already been added, so I thought maybe this is Google's policy.

Not yet. We're waiting for 2025, mid-January or early February.

@jlia0

jlia0 commented Dec 16, 2024

Perhaps we can use VAD to filter the noise from the audio and only send voice? @philschmid
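
A minimal sketch of that idea, assuming the webrtcvad package (pip install webrtcvad) and 30 ms frames of 16 kHz, 16-bit mono PCM; frames the detector classifies as silence are dropped before sending. The integration point shown is hypothetical, not part of the gist above:

    import webrtcvad

    vad = webrtcvad.Vad(2)  # aggressiveness 0-3; higher drops more borderline audio

    # webrtcvad only accepts 10/20/30 ms frames, so capture 480 samples
    # (30 ms at 16 kHz) instead of the 512-sample CHUNK used above.
    VAD_CHUNK = 480

    def is_speech(frame: bytes, sample_rate: int = 16000) -> bool:
        """Return True if webrtcvad thinks this 30 ms PCM frame contains voice."""
        return vad.is_speech(frame, sample_rate)

    # Inside the capture/send loop (sketch):
    #     data = await asyncio.to_thread(stream.read, VAD_CHUNK)
    #     if not is_speech(data):
    #         continue  # skip silent frames instead of sending them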

@saharmor

@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground

@jlia0

jlia0 commented Dec 16, 2024

@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground

thanks so much 🫶
