Improved Audio Handler for LiveKit Background Audio
# Full disclosure:
# This was heavily based on https://github.com/ShayneP/meditation-assistant/blob/main/audio_handler.py
import asyncio
import logging
import uuid  # Built-in UUID generator
import wave
from pathlib import Path

import numpy as np
from livekit import rtc

logger = logging.getLogger("voice-agent")


# async def before_llm_cb(agent: VoicePipelineAgent, chat_ctx: llm.ChatContext):
#     """Callback that runs before the LLM generates a response."""
#     await play_empty_audio(agent._room, duration_seconds=0.25)
#     return None


async def play_wav_file(room: rtc.Room, file_path: str, interrupt_event: asyncio.Event | None = None) -> None:
    """
    Play a WAV file through a LiveKit track. Can be interrupted if interrupt_event is provided.

    Args:
        room: The LiveKit room to publish to
        file_path: Path to the WAV file to play
        interrupt_event: Optional asyncio.Event that, when set, stops playback
    """
    file_path = Path(file_path)
    # logger.info(f"[play_wav_file] Playing WAV file: {file_path}")

    # Open and read WAV file
    with wave.open(str(file_path), 'rb') as wav_file:
        # Get audio parameters
        channels = wav_file.getnchannels()
        sample_rate = wav_file.getframerate()
        sample_width = wav_file.getsampwidth()

        # Create audio source and track
        source = rtc.AudioSource(sample_rate, channels)
        track = rtc.LocalAudioTrack.create_audio_track("audio_playback", source)
        options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)

        # Publish track
        publication = await room.local_participant.publish_track(track, options)
        await publication.wait_for_subscription()

        # Calculate frame size for 20ms chunks
        frame_duration = 0.02  # 20ms
        frame_samples = int(sample_rate * frame_duration)  # per-channel samples per chunk
        bytes_per_frame = sample_width * channels
        frame_bytes = frame_samples * bytes_per_frame

        frames_played = 0
        try:
            # Read and play audio in chunks
            while True:
                # Check for interrupt signal if event provided
                if interrupt_event and interrupt_event.is_set():
                    logger.info("Audio playback interrupted")
                    break

                data = wav_file.readframes(frame_samples)
                if not data:
                    break

                # Pad last frame if needed
                if len(data) < frame_bytes:
                    data = data + b'\0' * (frame_bytes - len(data))

                # Create and capture frame. readframes() counts per-channel sample
                # frames, so samples_per_channel is frame_samples as-is (dividing
                # by the channel count would be wrong for stereo files).
                frame = rtc.AudioFrame(
                    data=data,
                    sample_rate=sample_rate,
                    num_channels=channels,
                    samples_per_channel=frame_samples
                )
                await source.capture_frame(frame)
                frames_played += 1
        finally:
            # Always wait for playout to finish, then unpublish the track
            await source.wait_for_playout()
            await room.local_participant.unpublish_track(publication.sid)
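
# Example usage (a sketch, not part of the original handler): assumes a
# connected rtc.Room — e.g. `ctx.room` inside an agent entrypoint — and a
# local "chime.wav"; both names are illustrative.
#
# stop_signal = asyncio.Event()
# playback = asyncio.create_task(play_wav_file(room, "chime.wav", stop_signal))
# # ... later, from another task, cut playback short:
# stop_signal.set()
# await playback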


async def play_empty_audio(room: rtc.Room, duration_seconds: float = 1.5) -> None:
    """
    Play empty (silent) audio for a specified duration.

    Args:
        room: The LiveKit room to publish to
        duration_seconds: Duration of silence to play in seconds
    """
    # Use standard audio parameters
    sample_rate = 48000  # Standard sample rate
    channels = 1  # Mono audio

    # Create audio source and track
    source = rtc.AudioSource(sample_rate, channels)
    track = rtc.LocalAudioTrack.create_audio_track("empty_audio", source)
    options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)

    publication = await room.local_participant.publish_track(track, options)
    await publication.wait_for_subscription()

    # Calculate frame size for 20ms chunks
    frame_duration = 0.02  # 20ms
    frame_samples = int(sample_rate * frame_duration)

    # Calculate total number of frames needed
    total_frames = int(duration_seconds / frame_duration)
    frames_played = 0

    # Play empty frames
    for _ in range(total_frames):
        # Create empty frame (silence)
        empty_data = bytes(frame_samples * 2)  # 2 bytes per sample for int16 mono
        frame = rtc.AudioFrame(
            data=empty_data,
            sample_rate=sample_rate,
            num_channels=channels,
            samples_per_channel=frame_samples
        )
        await source.capture_frame(frame)
        frames_played += 1

    await source.wait_for_playout()
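
# Frame math for the defaults above: at 48 kHz, one 20 ms frame is
# 48000 * 0.02 = 960 samples (1920 bytes of int16 mono), so 1.5 s of
# silence is 1.5 / 0.02 = 75 frames.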


class AudioHandler:
    def __init__(self, sample_rate=48000, channels=1, track_id=None):
        self.audio_source = rtc.AudioSource(sample_rate, channels)
        self.track_id = track_id or f"audio_{str(uuid.uuid4())[:8]}"  # Use first 8 chars of a UUID
        self.audio_track = rtc.LocalAudioTrack.create_audio_track(self.track_id, self.audio_source)
        self.audio_task = None
        self.audio_running = asyncio.Event()
        self.fade_out = False
        self.fade_duration = 5.0  # Default fade duration

    async def start_audio(self, wav_path: Path | str, volume: float = 0.3):
        self.audio_running.set()
        self.fade_out = False
        self.fade_duration = 5.0  # Reset to default
        self.audio_task = asyncio.create_task(self._play_audio(wav_path, volume))

    async def stop_audio(self, fade_duration: float | None = None):
        """Stop audio playback with an optional fade out.

        Args:
            fade_duration: Optional duration in seconds for the fade out.
                If None, stops immediately.
                If > 0, fades out over that duration.
        """
        if fade_duration and fade_duration > 0:
            self.fade_duration = fade_duration  # Set the fade duration
            self.fade_out = True
            await asyncio.sleep(fade_duration)
        self.audio_running.clear()
        if self.audio_task:
            await self.audio_task
            self.audio_task = None

    async def _play_audio(self, wav_path: Path | str, volume: float):
        samples_per_channel = 9600  # 200 ms chunks at 48 kHz
        wav_path = Path(wav_path)
        fade_start_time = None
        # logger.info(f"[AudioHandler] Playing WAV file: {wav_path}")

        # Loop the file until stop_audio() clears the event. NOTE: frames are
        # always emitted as 48 kHz mono, so the source WAV is assumed to already
        # be 48 kHz (stereo is downmixed below, but nothing is resampled).
        while self.audio_running.is_set():
            with wave.open(str(wav_path), 'rb') as wav_file:
                sample_rate = wav_file.getframerate()
                num_channels = wav_file.getnchannels()
                audio_data = wav_file.readframes(wav_file.getnframes())

                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
                if num_channels == 2:
                    # Downmix stereo to mono by averaging the two channels
                    audio_array = audio_array.reshape(-1, 2).mean(axis=1)

                for i in range(0, len(audio_array), samples_per_channel):
                    if not self.audio_running.is_set():
                        break

                    chunk = audio_array[i:i + samples_per_channel]
                    if len(chunk) < samples_per_channel:
                        chunk = np.pad(chunk, (0, samples_per_channel - len(chunk)))

                    # Apply a linear fade without mutating the base volume —
                    # rescaling `volume` itself every chunk would compound the
                    # fade and finish far sooner than fade_duration.
                    effective_volume = volume
                    if self.fade_out:
                        if fade_start_time is None:
                            fade_start_time = asyncio.get_event_loop().time()
                        elapsed_fade_time = asyncio.get_event_loop().time() - fade_start_time
                        if elapsed_fade_time >= self.fade_duration:  # Use the shared fade_duration
                            break
                        fade_factor = max(0.0, 1.0 - (elapsed_fade_time / self.fade_duration))
                        effective_volume = volume * fade_factor

                    # Soft-clip with tanh, then scale to the effective volume
                    chunk = np.tanh(chunk / 32768.0) * 32768.0
                    chunk = np.round(chunk * effective_volume).astype(np.int16)

                    await self.audio_source.capture_frame(rtc.AudioFrame(
                        data=chunk.tobytes(),
                        sample_rate=48000,
                        samples_per_channel=samples_per_channel,
                        num_channels=1
                    ))

                    # Sleep slightly less than real time (98%) so the source's
                    # buffer stays just ahead of playout
                    await asyncio.sleep((samples_per_channel / 48000) * 0.98)

    async def publish_track(self, room):
        await room.local_participant.publish_track(
            self.audio_track,
            rtc.TrackPublishOptions(
                source=rtc.TrackSource.SOURCE_MICROPHONE,
                stream=self.track_id  # Use the same unique ID for the stream name
            )
        )
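

# Example usage (a sketch, not part of the original handler): assumes a
# connected rtc.Room — e.g. `ctx.room` inside an agent entrypoint — and a
# local "background.wav" that is already 48 kHz; names are illustrative.
#
# handler = AudioHandler(sample_rate=48000, channels=1)
# await handler.publish_track(room)
# await handler.start_audio("background.wav", volume=0.3)
# # ... later, fade the loop out over 3 seconds and wait for it to stop:
# await handler.stop_audio(fade_duration=3.0)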