-
Star
(122)
You must be signed in to star a gist -
Fork
(40)
You must be signed in to fork a gist
-
-
Save philschmid/cb8c98f0781e4e52e5d364ff39e2ccd2 to your computer and use it in GitHub Desktop.
import asyncio | |
import base64 | |
import json | |
import os | |
import pyaudio | |
from websockets.asyncio.client import connect | |
class SimpleGeminiVoice:
    """Minimal full-duplex voice client for the Gemini Live websocket API.

    Runs three concurrent tasks: capture microphone audio, stream it to the
    model, and play back the model's audio replies. Interruptions are handled
    by draining the playback queue when the server signals end of turn.
    """

    def __init__(self):
        # Buffers decoded model audio between the receive task and playback.
        self.audio_queue = asyncio.Queue()
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio capture settings: 16 kHz mono 16-bit PCM in 512-frame chunks.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512
        self.RATE = 16000

    async def start(self):
        """Connect, perform the setup handshake, then run all audio tasks."""
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        # Consume the server's setup response before streaming audio.
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming; TaskGroup cancels siblings if one task fails.
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.capture_audio())
            tg.create_task(self.stream_audio())
            tg.create_task(self.play_response())

    async def capture_audio(self):
        """Read microphone chunks and forward them to Gemini as base64 PCM."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            # stream.read blocks, so run it off the event loop thread.
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def stream_audio(self):
        """Receive server messages; enqueue decoded model audio for playback."""
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                self.audio_queue.put_nowait(base64.b64decode(audio_data))
            except KeyError:
                # Not every message carries audio (e.g. setup/turn metadata).
                pass
            try:
                turn_complete = response["serverContent"]["turnComplete"]
            except KeyError:
                pass
            else:
                if turn_complete:
                    # If you interrupt the model, it sends an end_of_turn. For
                    # interruptions to work, we need to empty out the audio queue
                    print("\nEnd of turn")
                    while not self.audio_queue.empty():
                        self.audio_queue.get_nowait()

    async def play_response(self):
        """Play queued model audio; Gemini replies with 24 kHz PCM."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        while True:
            data = await self.audio_queue.get()
            # Blocking write runs in a worker thread to keep the loop responsive.
            await asyncio.to_thread(stream.write, data)
if __name__ == "__main__":
    # Script entry point: run the voice client until interrupted.
    client = SimpleGeminiVoice()
    asyncio.run(client.start())
import asyncio | |
import base64 | |
import json | |
import os | |
import pyaudio | |
from websockets.asyncio.client import connect | |
class SimpleGeminiVoice:
    """Two-task variant of the Gemini Live voice client.

    One task sends microphone audio to the model; the other receives the
    model's audio and plays it directly (no intermediate queue, so this
    version has no interruption handling).
    """

    def __init__(self):
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings: mono 16-bit PCM captured in 512-frame chunks.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512

    async def start(self):
        """Connect, perform the setup handshake, then run both audio tasks."""
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        # Consume the server's setup response before streaming audio.
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.send_user_audio())
            tg.create_task(self.recv_model_audio())

    async def send_user_audio(self):
        """Read microphone chunks and forward them to Gemini as base64 PCM."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            # BUG FIX: PyAudio requires an int sample rate; the string "16000"
            # raised a TypeError when opening the stream.
            rate=16000,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            # stream.read blocks, so run it off the event loop thread.
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def recv_model_audio(self):
        """Receive server messages and play model audio (24 kHz PCM) directly."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                # Blocking write runs in a worker thread to keep the loop alive.
                await asyncio.to_thread(stream.write, base64.b64decode(audio_data))
            except KeyError:
                # Not every message carries audio (e.g. setup/turn metadata).
                pass
if __name__ == "__main__":
    # Script entry point: run the voice client until interrupted.
    client = SimpleGeminiVoice()
    asyncio.run(client.start())
I would like to use it in a voice channel through discord.py, but I can't seem to get it to work.
I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground
You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk
I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk
Thank you
And it doesn't speak Russian. It understands Russian, but answers in Chinese or something like that instead of Russian.
Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.
Просто везде пишут, что он добавлен уже. И я подумал может это политика гугла такая.
Пока нет. Ждем 2025. Середина января/начало февраля.
Perhaps we can use VAD to filter the noise from the audio and only send voice? @philschmid
@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground
@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground
thanks so much 🫶
I'm just winging it. Based it off of python-genai/google/genai/tests/live/live_test.py

But now the responses have slowed way down. Basically unusable until they get it back running faster.