@josiahbryan
Created March 13, 2025 13:49
Improved Audio Handler for LiveKit Background Audio
# Full disclosure:
# This was heavily based on https://github.com/ShayneP/meditation-assistant/blob/main/audio_handler.py
import asyncio
import numpy as np
import wave
from pathlib import Path
from livekit import rtc
import logging
import uuid # Built-in UUID generator
logger = logging.getLogger("voice-agent")
# async def before_llm_cb(agent: VoicePipelineAgent, chat_ctx: llm.ChatContext):
#     """Callback that runs before the LLM generates a response."""
#     await play_empty_audio(agent._room, duration_seconds=0.25)
#     return None
async def play_wav_file(room: rtc.Room, file_path: str, interrupt_event: asyncio.Event | None = None) -> None:
    """
    Play a WAV file through a LiveKit track. Can be interrupted if interrupt_event is provided.

    Args:
        room: The LiveKit room to publish to
        file_path: Path to the WAV file to play
        interrupt_event: Optional asyncio.Event that, when set, stops playback
    """
    file_path = Path(file_path)
    # logger.info(f"[play_wav_file] Playing WAV file: {file_path}")

    # Open and read WAV file (16-bit PCM is assumed)
    with wave.open(str(file_path), 'rb') as wav_file:
        # Get audio parameters
        channels = wav_file.getnchannels()
        sample_rate = wav_file.getframerate()
        sample_width = wav_file.getsampwidth()

        # Create audio source and track
        source = rtc.AudioSource(sample_rate, channels)
        track = rtc.LocalAudioTrack.create_audio_track("audio_playback", source)
        options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)

        # Publish track
        publication = await room.local_participant.publish_track(track, options)
        await publication.wait_for_subscription()

        # Calculate frame size for 20ms chunks
        frame_duration = 0.02  # 20ms
        frame_samples = int(sample_rate * frame_duration)
        bytes_per_sample = sample_width * channels
        frame_bytes = frame_samples * bytes_per_sample
        frames_played = 0

        try:
            # Read and play audio in chunks
            while True:
                # Check for interrupt signal if event provided
                if interrupt_event and interrupt_event.is_set():
                    logger.info("Audio playback interrupted")
                    break

                data = wav_file.readframes(frame_samples)
                if not data:
                    break

                # Pad last frame if needed
                if len(data) < frame_bytes:
                    data = data + b'\0' * (frame_bytes - len(data))

                # Create and capture frame. readframes(n) returns n frames,
                # each holding one sample per channel, so samples_per_channel
                # is frame_samples regardless of channel count.
                frame = rtc.AudioFrame(
                    data=data,
                    sample_rate=sample_rate,
                    num_channels=channels,
                    samples_per_channel=frame_samples
                )
                await source.capture_frame(frame)
                frames_played += 1
        finally:
            # Always wait for playout to finish, then clean up
            await source.wait_for_playout()
            await publication.stop()
            track.stop()
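
# Usage sketch (illustrative addition, not part of the original gist): assumes
# a connected rtc.Room and a 16-bit PCM WAV at a placeholder path. A second
# task can set the event at any point to cut playback short.
async def example_play_with_interrupt(room: rtc.Room) -> None:
    interrupt = asyncio.Event()
    playback = asyncio.create_task(
        play_wav_file(room, "assets/chime.wav", interrupt_event=interrupt)
    )
    await asyncio.sleep(2.0)  # let roughly two seconds play
    interrupt.set()           # request an early stop
    await playback            # wait for playout and track cleanup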
async def play_empty_audio(room: rtc.Room, duration_seconds: float = 1.5) -> None:
    """
    Play empty (silent) audio for a specified duration.

    Args:
        room: The LiveKit room to publish to
        duration_seconds: Duration of silence to play, in seconds
    """
    # Use standard audio parameters
    sample_rate = 48000  # Standard sample rate
    channels = 1  # Mono audio

    # Create audio source and track
    source = rtc.AudioSource(sample_rate, channels)
    track = rtc.LocalAudioTrack.create_audio_track("empty_audio", source)
    options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)
    publication = await room.local_participant.publish_track(track, options)
    await publication.wait_for_subscription()

    # Calculate frame size for 20ms chunks
    frame_duration = 0.02  # 20ms
    frame_samples = int(sample_rate * frame_duration)

    # Calculate total number of frames needed
    total_frames = int(duration_seconds / frame_duration)
    frames_played = 0

    # Play empty frames
    for _ in range(total_frames):
        # Create empty frame (silence)
        empty_data = bytes(frame_samples * 2)  # 2 bytes per sample for int16
        frame = rtc.AudioFrame(
            data=empty_data,
            sample_rate=sample_rate,
            num_channels=channels,
            samples_per_channel=frame_samples
        )
        await source.capture_frame(frame)
        frames_played += 1

    await source.wait_for_playout()
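
# Usage sketch (illustrative addition, not part of the original gist): this is
# the pattern the commented-out before_llm_cb above hints at, a short burst of
# silence to open the audio path before the agent starts speaking.
async def example_prime_audio_path(room: rtc.Room) -> None:
    await play_empty_audio(room, duration_seconds=0.25)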
class AudioHandler:
    def __init__(self, sample_rate=48000, channels=1, track_id=None):
        self.audio_source = rtc.AudioSource(sample_rate, channels)
        self.track_id = track_id or f"audio_{str(uuid.uuid4())[:8]}"  # Use first 8 chars of UUID
        self.audio_track = rtc.LocalAudioTrack.create_audio_track(self.track_id, self.audio_source)
        self.audio_task = None
        self.audio_running = asyncio.Event()
        self.fade_out = False
        self.fade_duration = 5.0  # Default fade duration

    async def start_audio(self, wav_path: Path | str, volume: float = 0.3):
        self.audio_running.set()
        self.fade_out = False
        self.fade_duration = 5.0  # Reset to default
        self.audio_task = asyncio.create_task(self._play_audio(wav_path, volume))

    async def stop_audio(self, fade_duration: float | None = None):
        """Stop audio playback with an optional fade-out.

        Args:
            fade_duration: Optional duration in seconds for the fade-out.
                If None, playback stops immediately.
                If > 0, playback fades out over that duration.
        """
        if fade_duration and fade_duration > 0:
            self.fade_duration = fade_duration  # Set the fade duration
            self.fade_out = True
            await asyncio.sleep(fade_duration)
        self.audio_running.clear()
        if self.audio_task:
            await self.audio_task
            self.audio_task = None
    async def _play_audio(self, wav_path: Path | str, volume: float):
        samples_per_channel = 9600  # 200ms chunks; the frames below assume 48kHz output
        wav_path = Path(wav_path)
        fade_start_time = None
        # logger.info(f"[AudioHandler] Playing WAV file: {wav_path}")

        # Loop the file until stop_audio() clears the event
        while self.audio_running.is_set():
            with wave.open(str(wav_path), 'rb') as wav_file:
                sample_rate = wav_file.getframerate()
                num_channels = wav_file.getnchannels()
                audio_data = wav_file.readframes(wav_file.getnframes())
                audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)

                # Downmix stereo to mono
                if num_channels == 2:
                    audio_array = audio_array.reshape(-1, 2).mean(axis=1)

                for i in range(0, len(audio_array), samples_per_channel):
                    if not self.audio_running.is_set():
                        break

                    chunk = audio_array[i:i + samples_per_channel]
                    if len(chunk) < samples_per_channel:
                        chunk = np.pad(chunk, (0, samples_per_channel - len(chunk)))

                    # Compute a per-chunk volume so the fade factor is not
                    # compounded into `volume` across iterations
                    chunk_volume = volume
                    if self.fade_out:
                        if fade_start_time is None:
                            fade_start_time = asyncio.get_running_loop().time()
                        elapsed_fade_time = asyncio.get_running_loop().time() - fade_start_time
                        if elapsed_fade_time >= self.fade_duration:  # Use the shared fade_duration
                            break
                        fade_factor = max(0.0, 1.0 - (elapsed_fade_time / self.fade_duration))
                        chunk_volume = volume * fade_factor

                    chunk = np.tanh(chunk / 32768.0) * 32768.0  # Soft-clip to avoid wraparound
                    chunk = np.round(chunk * chunk_volume).astype(np.int16)

                    await self.audio_source.capture_frame(rtc.AudioFrame(
                        data=chunk.tobytes(),
                        sample_rate=48000,
                        samples_per_channel=samples_per_channel,
                        num_channels=1
                    ))
                    # Sleep slightly less than real time to keep the source buffer fed
                    await asyncio.sleep((samples_per_channel / 48000) * 0.98)
    async def publish_track(self, room):
        await room.local_participant.publish_track(
            self.audio_track,
            rtc.TrackPublishOptions(
                source=rtc.TrackSource.SOURCE_MICROPHONE,
                stream=self.track_id  # Use the same unique ID for the stream name
            )
        )
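
# Lifecycle sketch (illustrative addition, not part of the original gist):
# publish the track once, loop a background WAV at low volume, then fade it
# out. The file path and timings here are placeholders.
async def example_background_audio(room: rtc.Room) -> None:
    handler = AudioHandler(sample_rate=48000, channels=1)
    await handler.publish_track(room)
    await handler.start_audio("assets/ambient_loop.wav", volume=0.3)
    await asyncio.sleep(30)  # let it loop behind the conversation
    await handler.stop_audio(fade_duration=5.0)  # 5-second fade to silence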