@josiahbryan
Created March 13, 2025 13:49
Improved Audio Handler for LiveKit Background Audio
# Full disclosure:
# This was heavily based on https://github.com/ShayneP/meditation-assistant/blob/main/audio_handler.py
import asyncio
import numpy as np
import wave
from pathlib import Path
from livekit import rtc
import logging
import uuid # Built-in UUID generator
logger = logging.getLogger("voice-agent")
# async def before_llm_cb(agent: VoicePipelineAgent, chat_ctx: llm.ChatContext):
#     """Callback that runs before the LLM generates a response."""
#     await play_empty_audio(agent._room, duration_seconds=0.25)
#     return None
async def play_wav_file(room: rtc.Room, file_path: str, interrupt_event: asyncio.Event | None = None) -> None:
    """
    Play a WAV file through a LiveKit track. Can be interrupted if interrupt_event is provided.

    Args:
        room: The LiveKit room to publish to
        file_path: Path to the WAV file to play
        interrupt_event: Optional asyncio.Event that, when set, stops playback
    """
    file_path = Path(file_path)
    # logger.info(f"[play_wav_file] Playing WAV file: {file_path}")

    # Open and read WAV file (16-bit PCM is assumed)
    with wave.open(str(file_path), 'rb') as wav_file:
        # Get audio parameters
        channels = wav_file.getnchannels()
        sample_rate = wav_file.getframerate()
        sample_width = wav_file.getsampwidth()

        # Create audio source and track
        source = rtc.AudioSource(sample_rate, channels)
        track = rtc.LocalAudioTrack.create_audio_track("audio_playback", source)
        options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)

        # Publish track
        publication = await room.local_participant.publish_track(track, options)
        await publication.wait_for_subscription()

        # Calculate frame size for 20ms chunks
        frame_duration = 0.02  # 20ms
        frame_samples = int(sample_rate * frame_duration)
        bytes_per_sample = sample_width * channels
        frame_bytes = frame_samples * bytes_per_sample
        frames_played = 0

        try:
            # Read and play audio in chunks
            while True:
                # Check for interrupt signal if event provided
                if interrupt_event and interrupt_event.is_set():
                    logger.info("Audio playback interrupted")
                    break

                data = wav_file.readframes(frame_samples)
                if not data:
                    break

                # Pad last frame if needed
                if len(data) < frame_bytes:
                    data = data + b'\0' * (frame_bytes - len(data))

                # Create and capture frame. readframes(n) returns n frames,
                # each holding one sample per channel, so samples_per_channel
                # is frame_samples regardless of channel count.
                frame = rtc.AudioFrame(
                    data=data,
                    sample_rate=sample_rate,
                    num_channels=channels,
                    samples_per_channel=frame_samples
                )
                await source.capture_frame(frame)
                frames_played += 1
        finally:
            # Always wait for playout to finish, then clean up
            await source.wait_for_playout()
            await publication.stop()
            track.stop()
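
# Usage sketch (illustrative addition, not part of the original gist): assumes
# a connected rtc.Room and a 16-bit PCM WAV at a placeholder path. A second
# task can set the event at any point to cut playback short.
async def example_play_with_interrupt(room: rtc.Room) -> None:
    interrupt = asyncio.Event()
    playback = asyncio.create_task(
        play_wav_file(room, "assets/chime.wav", interrupt_event=interrupt)
    )
    await asyncio.sleep(2.0)  # let roughly two seconds play
    interrupt.set()           # request an early stop
    await playback            # wait for playout and track cleanup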
async def play_empty_audio(room: rtc.Room, duration_seconds: float = 1.5) -> None:
    """
    Play empty (silent) audio for a specified duration.

    Args:
        room: The LiveKit room to publish to
        duration_seconds: Duration of silence to play, in seconds
    """
    # Use standard audio parameters
    sample_rate = 48000  # Standard sample rate
    channels = 1  # Mono audio

    # Create audio source and track
    source = rtc.AudioSource(sample_rate, channels)
    track = rtc.LocalAudioTrack.create_audio_track("empty_audio", source)
    options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)
    publication = await room.local_participant.publish_track(track, options)
    await publication.wait_for_subscription()

    # Calculate frame size for 20ms chunks
    frame_duration = 0.02  # 20ms
    frame_samples = int(sample_rate * frame_duration)

    # Calculate total number of frames needed
    total_frames = int(duration_seconds / frame_duration)
    frames_played = 0

    # Play empty frames
    for _ in range(total_frames):
        # Create empty frame (silence)
        empty_data = bytes(frame_samples * 2)  # 2 bytes per sample for int16
        frame = rtc.AudioFrame(
            data=empty_data,
            sample_rate=sample_rate,
            num_channels=channels,
            samples_per_channel=frame_samples
        )
        await source.capture_frame(frame)
        frames_played += 1

    await source.wait_for_playout()
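
# Usage sketch (illustrative addition, not part of the original gist): this is
# the pattern the commented-out before_llm_cb above hints at, a short burst of
# silence to open the audio path before the agent starts speaking.
async def example_prime_audio_path(room: rtc.Room) -> None:
    await play_empty_audio(room, duration_seconds=0.25)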
class AudioHandler:
    def __init__(self, sample_rate=48000, channels=1, track_id=None):
        self.audio_source = rtc.AudioSource(sample_rate, channels)
        self.track_id = track_id or f"audio_{str(uuid.uuid4())[:8]}"  # Use first 8 chars of UUID
        self.audio_track = rtc.LocalAudioTrack.create_audio_track(self.track_id, self.audio_source)
        self.audio_task = None
        self.audio_running = asyncio.Event()
        self.fade_out = False
        self.fade_duration = 5.0  # Default fade duration

    async def start_audio(self, wav_path: Path | str, volume: float = 0.3):
        self.audio_running.set()
        self.fade_out = False
        self.fade_duration = 5.0  # Reset to default
        self.audio_task = asyncio.create_task(self._play_audio(wav_path, volume))

    async def stop_audio(self, fade_duration: float | None = None):
        """Stop audio playback with an optional fade-out.

        Args:
            fade_duration: Optional duration in seconds for the fade-out.
                If None, playback stops immediately.
                If > 0, playback fades out over that duration.
        """
        if fade_duration and fade_duration > 0:
            self.fade_duration = fade_duration  # Set the fade duration
            self.fade_out = True
            await asyncio.sleep(fade_duration)
        self.audio_running.clear()
        if self.audio_task:
            await self.audio_task
            self.audio_task = None
    async def _play_audio(self, wav_path: Path | str, volume: float):
        samples_per_channel = 9600  # 200ms chunks; the frames below assume 48kHz output
        wav_path = Path(wav_path)
        fade_start_time = None
        # logger.info(f"[AudioHandler] Playing WAV file: {wav_path}")

        # Loop the file until stop_audio() clears the event
        while self.audio_running.is_set():
            with wave.open(str(wav_path), 'rb') as wav_file:
                sample_rate = wav_file.getframerate()
                num_channels = wav_file.getnchannels()
                audio_data = wav_file.readframes(wav_file.getnframes())
                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)

                # Downmix stereo to mono
                if num_channels == 2:
                    audio_array = audio_array.reshape(-1, 2).mean(axis=1)

                for i in range(0, len(audio_array), samples_per_channel):
                    if not self.audio_running.is_set():
                        break

                    chunk = audio_array[i:i + samples_per_channel]
                    if len(chunk) < samples_per_channel:
                        chunk = np.pad(chunk, (0, samples_per_channel - len(chunk)))

                    # Compute a per-chunk volume so the fade factor is not
                    # compounded into `volume` across iterations
                    chunk_volume = volume
                    if self.fade_out:
                        if fade_start_time is None:
                            fade_start_time = asyncio.get_running_loop().time()
                        elapsed_fade_time = asyncio.get_running_loop().time() - fade_start_time
                        if elapsed_fade_time >= self.fade_duration:  # Use the shared fade_duration
                            break
                        fade_factor = max(0.0, 1.0 - (elapsed_fade_time / self.fade_duration))
                        chunk_volume = volume * fade_factor

                    chunk = np.tanh(chunk / 32768.0) * 32768.0  # Soft-clip to avoid wraparound
                    chunk = np.round(chunk * chunk_volume).astype(np.int16)

                    await self.audio_source.capture_frame(rtc.AudioFrame(
                        data=chunk.tobytes(),
                        sample_rate=48000,
                        samples_per_channel=samples_per_channel,
                        num_channels=1
                    ))
                    # Sleep slightly less than real time to keep the source buffer fed
                    await asyncio.sleep((samples_per_channel / 48000) * 0.98)
    async def publish_track(self, room):
        await room.local_participant.publish_track(
            self.audio_track,
            rtc.TrackPublishOptions(
                source=rtc.TrackSource.SOURCE_MICROPHONE,
                stream=self.track_id  # Use the same unique ID for the stream name
            )
        )
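
# Lifecycle sketch (illustrative addition, not part of the original gist):
# publish the track once, loop a background WAV at low volume, then fade it
# out. The file path and timings here are placeholders.
async def example_background_audio(room: rtc.Room) -> None:
    handler = AudioHandler(sample_rate=48000, channels=1)
    await handler.publish_track(room)
    await handler.start_audio("assets/ambient_loop.wav", volume=0.3)
    await asyncio.sleep(30)  # let it loop behind the conversation
    await handler.stop_audio(fade_duration=5.0)  # 5-second fade to silence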