An example of using @ricky0123/vad-react with buffered audio/transcription
// Note: this isn't complete, but is an extraction of most of the relevant bits of a working module
import { useMicVAD, utils } from "@ricky0123/vad-react";
// Likely sources for the other identifiers used below; app-local pieces
// (Button, Markdown, transcribe, TranscriptionType, todos) are omitted, per the note above.
import { useCallback, useMemo, useRef, useState } from "react";
import { atom, getDefaultStore, useAtom } from "jotai";
import { twMerge } from "tailwind-merge";
import {
  ChatBubbleOvalLeftEllipsisIcon,
  ChatBubbleOvalLeftIcon,
  ClockIcon,
  CloudArrowUpIcon,
  MicrophoneIcon,
  StopCircleIcon,
} from "@heroicons/react/24/outline";

const FLUSH_AFTER_SPEECH_LENGTH = 20000; // transcribe at the first break after 20 seconds of speech
const SILENCE_LENGTH = 2000; // transcribe after 2 seconds of non-speech
// Records from the microphone with VAD and hands completed speech segments to AudioTranscriber.
export function LiveVoiceWidget({
  onTranscript,
  onAudio,
  onStop,
  className,
}: {
  onTranscript?: (transcript: TranscriptionType) => Promise<void>;
  onAudio?: (audio: Float32Array) => Promise<void>;
  onStop?: () => void;
  className?: string;
}) {
  const [isTranscribing, setIsTranscribing] = useState(false);
  const transcriber = useRef(
    new AudioTranscriber({
      onTranscript,
      onAudio,
      onStartTranscribing: () => {
        setIsTranscribing(true);
      },
      onStopTranscribing: () => {
        setIsTranscribing(false);
      },
    })
  );
  const lastStart = useRef(0);
  const { listening, errored, loading, userSpeaking, pause, start } = useMicVAD(
    {
      startOnLoad: true,
      minSpeechFrames: 2,
      submitUserSpeechOnPause: true,
      preSpeechPadFrames: 2,
      onSpeechStart: () => {
        console.log(`Speech started${lastStart.current ? " again" : ""}`);
        if (lastStart.current === 0) {
          lastStart.current = Date.now();
        }
        transcriber.current.onSpeechStart();
      },
      onSpeechEnd: async (audio) => {
        const startTime = lastStart.current;
        const time = Date.now() - lastStart.current;
        lastStart.current = 0;
        console.log("length of audio", audio.length, time);
        transcriber.current.queueAudio(audio, startTime);
      },
    }
  );
  // If the VAD has stopped listening, flush any audio that's still queued.
  if (!listening) {
    transcriber.current.maybeFlush();
  }
  if (loading) {
    return (
      <div
        className={twMerge(
          "rounded-md border bg-red-800 text-center",
          className
        )}
      >
        <ClockIcon className="h-4 w-4 text-white inline-block" />
      </div>
    );
  }
  if (errored) {
    return (
      <div className={twMerge(className, "bg-red-800 text-white")}>
        Error: {errored}
      </div>
    );
  }
  if (!listening) {
    return (
      <Button
        className={twMerge("bg-red-900 text-white hover:bg-red-700", className)}
        onClick={start}
        Icon={MicrophoneIcon}
      />
    );
  }
  return (
    <div>
      <Button
        onClick={() => {
          pause();
          if (onStop) {
            onStop();
          }
        }}
        className={twMerge(
          "bg-red-600 text-red-300 hover:bg-red-400 hover:text-white",
          className
        )}
      >
        {isTranscribing ? (
          <CloudArrowUpIcon className="h-4 w-4 text-white inline-block" />
        ) : (
          <StopCircleIcon className="h-4 w-4 text-red-300 hover:text-white inline-block" />
        )}
        {userSpeaking ? (
          <ChatBubbleOvalLeftEllipsisIcon className="h-4 w-4 text-white inline-block" />
        ) : (
          <ChatBubbleOvalLeftIcon className="h-4 w-4 text-red-300 inline-block" />
        )}
      </Button>
    </div>
  );
}
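
// --- Usage sketch (not part of the original module) ---
// A minimal example of wiring LiveVoiceWidget to a transcript-only handler.
// TranscriptionType is app-local; this sketch only assumes it carries a `text`
// field, which is also what AudioTranscriber.flushTranscript() reads below.
export function LiveVoiceNotesExample() {
  const [lines, setLines] = useState<string[]>([]);
  return (
    <div>
      <ul>
        {lines.map((line, i) => (
          <li key={i}>{line}</li>
        ))}
      </ul>
      <LiveVoiceWidget
        onTranscript={async (t) => {
          setLines((prev) => [...prev, t.text]);
        }}
        onStop={() => console.log("recording stopped")}
      />
    </div>
  );
}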
// Buffers speech segments and flushes them, either after a long enough run of
// speech or after a pause, to the transcription and/or raw-audio callbacks.
export class AudioTranscriber {
  queue: Float32Array[] = [];
  isTranscribing = false;
  history: string[] = [];
  continueFlush = false;
  timerId = 0;
  queueStartTime = 0;
  onTranscript?: (t: TranscriptionType) => Promise<void>;
  onAudio?: (audio: Float32Array) => Promise<void>;
  onStartTranscribing: () => void;
  onStopTranscribing: () => void;

  constructor({
    onTranscript,
    onAudio,
    onStartTranscribing,
    onStopTranscribing,
  }: {
    onTranscript?: (t: TranscriptionType) => Promise<void>;
    onAudio?: (audio: Float32Array) => Promise<void>;
    onStartTranscribing?: () => void;
    onStopTranscribing?: () => void;
  }) {
    this.onTranscript = onTranscript;
    this.onAudio = onAudio;
    this.onStartTranscribing = onStartTranscribing || (() => {});
    this.onStopTranscribing = onStopTranscribing || (() => {});
  }

  public async queueAudio(audio: Float32Array, startTime: number) {
    if (this.queueStartTime === 0) {
      this.queueStartTime = startTime;
    }
    this.queue.push(audio);
    console.log("Time of audio", Date.now() - this.queueStartTime);
    if (
      Date.now() - this.queueStartTime >= FLUSH_AFTER_SPEECH_LENGTH &&
      !this.isTranscribing
    ) {
      console.log("Flushing due to total length");
      await this.flush();
    }
    if (this.timerId) {
      console.log("canceling flush");
      clearTimeout(this.timerId);
    }
    this.timerId = setTimeout(() => {
      console.log("flushing after silence");
      this.timerId = 0;
      this.maybeFlush();
    }, SILENCE_LENGTH) as unknown as number;
  }

  public onSpeechStart() {
    if (this.timerId) {
      console.log("resetting flush timer");
      clearTimeout(this.timerId);
      this.timerId = 0;
    }
  }

  maybeFlush() {
    if (this.queue.length > 0) {
      if (this.isTranscribing) {
        this.continueFlush = true;
      } else {
        this.flush();
      }
    }
  }

  // Combine queued audio, run transcription, clear the queue, and repeat if there's leftover audio.
  private async flush() {
    if (this.onTranscript) {
      await this.flushTranscript();
    }
    if (this.onAudio) {
      await this.flushAudio();
    }
  }

  async flushTranscript() {
    this.isTranscribing = true;
    try {
      this.onStartTranscribing();
      const combined = this.combineQueue();
      this.queueStartTime = 0;
      const prompt = this.history.join(" ").slice(-100); // adjust slice as needed
      // Convert to WAV (or whatever format) and transcribe
      const array = utils.encodeWAV(combined, 1, 16000);
      const blob = new Blob([array], { type: "audio/wav" });
      const transcript = await transcribe(blob, {
        prompt,
        mimetype: "audio/wav",
      });
      // Push new text to history, notify caller, then see if new audio arrived mid-transcription
      this.history.push(transcript.text);
      await this.onTranscript!(transcript);
    } finally {
      this.isTranscribing = false;
      this.onStopTranscribing();
    }
    // If there's new audio in the queue (arrived during transcription), keep flushing
    if (
      this.queue.length > 0 &&
      Date.now() - this.queueStartTime >= FLUSH_AFTER_SPEECH_LENGTH &&
      !this.isTranscribing
    ) {
      await this.flush();
    } else if (this.continueFlush) {
      this.continueFlush = false;
      if (this.queue.length > 0) {
        this.flush();
      }
    }
  }

  async flushAudio() {
    this.isTranscribing = true;
    try {
      this.onStartTranscribing();
      const combined = this.combineQueue();
      this.queueStartTime = 0;
      await this.onAudio!(combined);
    } finally {
      this.isTranscribing = false;
      this.onStopTranscribing();
    }
  }

  // Simple combination of the queued Float32Array segments into one
  private combineQueue(): Float32Array {
    const totalLen = this.queue.reduce((sum, a) => sum + a.length, 0);
    const combined = new Float32Array(totalLen);
    let offset = 0;
    for (const chunk of this.queue) {
      combined.set(chunk, offset);
      offset += chunk.length;
    }
    // Reset the queue
    this.queue = [];
    return combined;
  }
}
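
// The `transcribe()` helper and `TranscriptionType` used above are app-local and
// not included in this extraction. Based on how they're used here (an encodeWAV
// result wrapped in a Blob, a prompt/mimetype options bag, and a `.text` field
// on the result), the assumed shape is roughly:
//
//   interface TranscriptionType {
//     text: string;
//     // ...whatever else the transcription backend returns
//   }
//
//   declare function transcribe(
//     blob: Blob,
//     options: { prompt?: string; mimetype?: string }
//   ): Promise<TranscriptionType>;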
export function LiveVoiceTranscript({
  onComplete,
  onStop,
  projectId,
}: {
  onComplete: (transcript: string) => Promise<void>;
  onStop?: () => void;
  projectId: number;
}) {
  const transcriptAtom = useMemo(() => atom(""), []);
  const [transcript, setTranscript] = useAtom(transcriptAtom);
  const [submitting, setSubmitting] = useState(false);
  const onAudio = useCallback(
    async (audio: Float32Array) => {
      const store = getDefaultStore();
      // Not sure why I can't use the external-to-the-closure transcript here, but apparently I can't
      const transcript = store.get(transcriptAtom);
      const array = utils.encodeWAV(audio, 1, 16000);
      const audioBlob = new Blob([array], { type: "audio/wav" });
      const audioUrlBody = btoa(
        new Uint8Array(await audioBlob.arrayBuffer()).reduce(
          (data, byte) => data + String.fromCharCode(byte),
          ""
        )
      );
      const audioUrl = `data:audio/wav;base64,${audioUrlBody}`;
      const { newTranscript, explanationText, audioTranscript, complete } =
        await todos.completeTranscript({
          audioUrl,
          previousTranscript: transcript,
          projectId,
        });
      console.log("set transcript to", newTranscript);
      setTranscript(newTranscript);
      if (complete) {
        setSubmitting(true);
        await onComplete(newTranscript);
        store.set(transcriptAtom, "");
        setSubmitting(false);
      }
    },
    [transcript, setTranscript, projectId]
  );
  return (
    <div className="w-full">
      {transcript ? (
        <Markdown
          className={twMerge(
            "border-2 p-2 rounded-md bg-white",
            submitting ? "text-gray-400 bg-gray-200" : "text-gray-900"
          )}
        >
          {transcript}
        </Markdown>
      ) : (
        <div className="border-2 p-2 text-center rounded-md bg-white text-gray-400 pb-8">
          Waiting...
        </div>
      )}
      <div>
        <LiveVoiceWidget onAudio={onAudio} onStop={onStop} className="w-full" />
      </div>
    </div>
  );
}
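
// --- Usage sketch (not part of the original module) ---
// How LiveVoiceTranscript might be mounted. The projectId prop comes from the
// surrounding app; here the completed transcript is simply logged.
export function VoiceTodoEntryExample({ projectId }: { projectId: number }) {
  return (
    <LiveVoiceTranscript
      projectId={projectId}
      onComplete={async (transcript) => {
        console.log("Completed transcript:", transcript);
      }}
      onStop={() => console.log("Recording stopped")}
    />
  );
}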