Skip to content

Instantly share code, notes, and snippets.

@ianb
Created January 9, 2025 19:42
Show Gist options
  • Save ianb/39983a2aa4203296eecd1d3c6a153aa0 to your computer and use it in GitHub Desktop.
Save ianb/39983a2aa4203296eecd1d3c6a153aa0 to your computer and use it in GitHub Desktop.
An example of using @ricky0123/vad-react with buffered audio/transcription
// Note: this isn't complete, but is an extraction of most of the relevant bits of a working module
import { useMicVAD, utils } from "@ricky0123/vad-react";
const FLUSH_AFTER_SPEECH_LENGTH = 20000; // ms: transcribe at the first speech break once ~20s of audio has accumulated
const SILENCE_LENGTH = 2000; // ms: transcribe after 2 seconds of non-speech
/**
 * Microphone capture widget.
 *
 * Runs browser voice-activity detection (via useMicVAD) and feeds each
 * detected speech segment into an AudioTranscriber, which batches segments
 * and reports results through `onTranscript` / `onAudio`.
 *
 * Renders, in order of precedence: a loading indicator, an error banner,
 * a "start listening" button, or (while listening) a stop button whose
 * icons reflect the current transcribing / speaking state.
 */
export function LiveVoiceWidget({
  onTranscript,
  onAudio,
  onStop,
  className,
}: {
  onTranscript?: (transcript: TranscriptionType) => Promise<void>;
  onAudio?: (audio: Float32Array) => Promise<void>;
  onStop?: () => void;
  className?: string;
}) {
  const [isTranscribing, setIsTranscribing] = useState(false);
  // Lazy one-time construction. The previous `useRef(new AudioTranscriber(...))`
  // form built (and discarded) a fresh AudioTranscriber on every render; only
  // the first instance was ever kept, so construct exactly once instead.
  // NOTE(review): the transcriber captures the first render's onTranscript /
  // onAudio; later prop values are ignored (same behavior as the original) —
  // confirm callers pass stable callbacks.
  const transcriberRef = useRef<AudioTranscriber | null>(null);
  if (transcriberRef.current === null) {
    transcriberRef.current = new AudioTranscriber({
      onTranscript,
      onAudio,
      onStartTranscribing: () => {
        setIsTranscribing(true);
      },
      onStopTranscribing: () => {
        setIsTranscribing(false);
      },
    });
  }
  const transcriber = transcriberRef.current;
  // Date.now() when the current stretch of speech began; 0 = not speaking.
  const lastStart = useRef(0);
  const { listening, errored, loading, userSpeaking, pause, start } = useMicVAD(
    {
      startOnLoad: true,
      minSpeechFrames: 2,
      submitUserSpeechOnPause: true,
      preSpeechPadFrames: 2,
      onSpeechStart: () => {
        console.log(`Speech started${lastStart.current ? " again" : ""}`);
        // Only stamp the start of the *first* segment in a stretch of speech;
        // queueAudio uses it to decide when enough speech has accumulated.
        if (lastStart.current === 0) {
          lastStart.current = Date.now();
        }
        transcriber.onSpeechStart();
      },
      onSpeechEnd: async (audio) => {
        const startTime = lastStart.current;
        const time = Date.now() - lastStart.current;
        lastStart.current = 0;
        console.log("length of audio", audio.length, time);
        transcriber.queueAudio(audio, startTime);
      },
    }
  );
  // Flush any buffered audio once the VAD stops listening.
  // NOTE(review): this is a side effect during render; moving it into a
  // useEffect keyed on `listening` would be the idiomatic fix, but it is kept
  // in place here because maybeFlush() is a no-op on an empty queue.
  if (!listening) {
    transcriber.maybeFlush();
  }
  if (loading) {
    return (
      <div
        className={twMerge(
          "rounded-md border bg-red-800 text-center",
          className
        )}
      >
        <ClockIcon className="h-4 w-4 text-white inline-block" />
      </div>
    );
  }
  if (errored) {
    return (
      <div className={twMerge(className, "bg-red-800 text-white")}>
        Error: {errored}
      </div>
    );
  }
  if (!listening) {
    return (
      <Button
        className={twMerge("bg-red-900 text-white hover:bg-red-700", className)}
        onClick={start}
        Icon={MicrophoneIcon}
      />
    );
  }
  return (
    <div>
      <Button
        onClick={() => {
          pause();
          if (onStop) {
            onStop();
          }
        }}
        className={twMerge(
          "bg-red-600 text-red-300 hover:bg-red-400 hover:text-white",
          className
        )}
      >
        {/* Upload-in-progress icon vs. stop icon */}
        {isTranscribing ? (
          <CloudArrowUpIcon className="h-4 w-4 text-white inline-block" />
        ) : (
          <StopCircleIcon className="h-4 w-4 text-red-300 hover:text-white inline-block" />
        )}
        {/* Filled bubble while the user is actively speaking */}
        {userSpeaking ? (
          <ChatBubbleOvalLeftEllipsisIcon className="h-4 w-4 text-white inline-block" />
        ) : (
          <ChatBubbleOvalLeftIcon className="h-4 w-4 text-red-300 inline-block" />
        )}
      </Button>
    </div>
  );
}
/**
 * Buffers VAD speech segments and flushes them — as transcribed text (via
 * `transcribe`) and/or as one combined raw-audio buffer — once enough speech
 * has accumulated or after a pause in speech.
 *
 * A flush is triggered when:
 *   - the oldest queued segment is at least FLUSH_AFTER_SPEECH_LENGTH ms old
 *     when a new segment arrives, or
 *   - SILENCE_LENGTH ms pass after a segment with no new speech starting, or
 *   - the owner calls maybeFlush() (e.g. when the microphone stops).
 */
export class AudioTranscriber {
  // Speech segments awaiting transcription/delivery, oldest first.
  queue: Float32Array[] = [];
  // True while a flush (transcription or audio delivery) is in progress.
  isTranscribing = false;
  // Previously transcribed text; its tail seeds the transcription prompt.
  history: string[] = [];
  // Set when a flush was requested while another flush was already running.
  continueFlush = false;
  // Pending silence-flush timer handle; 0 = no timer armed.
  timerId = 0;
  // Date.now() timestamp of the first segment in `queue`; 0 = unset.
  queueStartTime = 0;
  onTranscript?: (t: TranscriptionType) => Promise<void>;
  onAudio?: (audio: Float32Array) => Promise<void>;
  onStartTranscribing: () => void;
  onStopTranscribing: () => void;
  constructor({
    onTranscript,
    onAudio,
    onStartTranscribing,
    onStopTranscribing,
  }: {
    onTranscript?: (t: TranscriptionType) => Promise<void>;
    onAudio?: (audio: Float32Array) => Promise<void>;
    onStartTranscribing?: () => void;
    onStopTranscribing?: () => void;
  }) {
    this.onTranscript = onTranscript;
    this.onAudio = onAudio;
    this.onStartTranscribing = onStartTranscribing || (() => {});
    this.onStopTranscribing = onStopTranscribing || (() => {});
  }
  /**
   * Queue one speech segment. `startTime` is Date.now() at the start of the
   * segment. Flushes immediately if buffered speech spans more than
   * FLUSH_AFTER_SPEECH_LENGTH ms; otherwise (re)arms the silence timer.
   */
  public async queueAudio(audio: Float32Array, startTime: number) {
    if (this.queueStartTime === 0) {
      this.queueStartTime = startTime;
    }
    this.queue.push(audio);
    console.log("Time of audio", Date.now() - this.queueStartTime);
    if (
      Date.now() - this.queueStartTime >= FLUSH_AFTER_SPEECH_LENGTH &&
      !this.isTranscribing
    ) {
      console.log("Flushing due to total length");
      await this.flush();
    }
    if (this.timerId) {
      console.log("canceling flush");
      clearTimeout(this.timerId);
    }
    // Arm (or re-arm) the silence-based flush timer.
    this.timerId = setTimeout(() => {
      console.log("flushing after silence");
      this.timerId = 0;
      this.maybeFlush();
    }, SILENCE_LENGTH) as unknown as number; // cast: Node typings return Timeout, DOM returns number
  }
  /** Speech resumed: cancel any pending silence flush. */
  public onSpeechStart() {
    if (this.timerId) {
      console.log("reseting flush timer");
      clearTimeout(this.timerId);
      this.timerId = 0;
    }
  }
  /** Flush now if there is queued audio; defer if a flush is in progress. */
  maybeFlush() {
    if (this.queue.length > 0) {
      if (this.isTranscribing) {
        // A flush is running; flushTranscript will pick this up afterwards.
        this.continueFlush = true;
      } else {
        // Fire-and-forget: callers of maybeFlush don't await the flush.
        void this.flush();
      }
    }
  }
  // Combine queued audio, run transcription, clear the queue, and repeat if there's leftover audio.
  private async flush() {
    if (this.onTranscript) {
      await this.flushTranscript();
    }
    if (this.onAudio) {
      await this.flushAudio();
    }
  }
  /** Transcribe the combined queue, append to history, notify the caller. */
  async flushTranscript() {
    this.isTranscribing = true;
    try {
      this.onStartTranscribing();
      const combined = this.combineQueue();
      this.queueStartTime = 0;
      const prompt = this.history.join(" ").slice(-100); // adjust slice as needed
      // Convert to WAV (or whatever format) and transcribe
      const array = utils.encodeWAV(combined, 1, 16000);
      const blob = new Blob([array], { type: "audio/wav" });
      const transcript = await transcribe(blob, {
        prompt,
        mimetype: "audio/wav",
      });
      // Push new text to history, notify caller, then see if new audio arrived mid-transcription
      this.history.push(transcript.text);
      await this.onTranscript!(transcript);
    } finally {
      this.isTranscribing = false;
      this.onStopTranscribing();
    }
    // If there's new audio in the queue (arrived during transcription) that is
    // already past the length cap, keep flushing immediately.
    if (
      this.queue.length > 0 &&
      Date.now() - this.queueStartTime >= FLUSH_AFTER_SPEECH_LENGTH &&
      !this.isTranscribing
    ) {
      await this.flush();
    } else if (this.continueFlush) {
      this.continueFlush = false;
      if (this.queue.length > 0) {
        // Fire-and-forget continuation of a deferred maybeFlush() request.
        void this.flush();
      }
    }
  }
  /** Deliver the combined queued audio to the onAudio callback. */
  async flushAudio() {
    // Nothing to deliver — e.g. flushTranscript() already drained the queue
    // when both onTranscript and onAudio are configured. Without this guard
    // the callback would receive a zero-length Float32Array.
    if (this.queue.length === 0) {
      return;
    }
    this.isTranscribing = true;
    try {
      this.onStartTranscribing();
      const combined = this.combineQueue();
      this.queueStartTime = 0;
      await this.onAudio!(combined);
    } finally {
      this.isTranscribing = false;
      this.onStopTranscribing();
    }
  }
  // Simple combination of the queued Float32Array segments into one
  private combineQueue(): Float32Array {
    const totalLen = this.queue.reduce((sum, a) => sum + a.length, 0);
    const combined = new Float32Array(totalLen);
    let offset = 0;
    for (const chunk of this.queue) {
      combined.set(chunk, offset);
      offset += chunk.length;
    }
    // Reset the queue
    this.queue = [];
    return combined;
  }
}
/**
 * Live-transcription panel.
 *
 * Shows the transcript accumulated so far (rendered as Markdown) above a
 * LiveVoiceWidget. Each audio chunk is base64-encoded as a WAV data: URL and
 * sent to `todos.completeTranscript` together with the previous transcript;
 * when the server reports the transcript complete, `onComplete` is invoked
 * and the transcript is reset.
 */
export function LiveVoiceTranscript({
  onComplete,
  onStop,
  projectId,
}: {
  onComplete: (transcript: string) => Promise<void>;
  onStop?: () => void;
  projectId: number;
}) {
  // Per-component-instance atom holding the running transcript text.
  const transcriptAtom = useMemo(() => atom(""), []);
  const [transcript, setTranscript] = useAtom(transcriptAtom);
  const [submitting, setSubmitting] = useState(false);
  const onAudio = useCallback(
    async (audio: Float32Array) => {
      // Read the latest transcript straight from the store so this callback
      // doesn't close over (and go stale with) the render-time `transcript`.
      const store = getDefaultStore();
      const previousTranscript = store.get(transcriptAtom);
      // Encode the raw PCM as a WAV data: URL for the API call.
      const array = utils.encodeWAV(audio, 1, 16000);
      const audioBlob = new Blob([array], { type: "audio/wav" });
      const audioUrlBody = btoa(
        new Uint8Array(await audioBlob.arrayBuffer()).reduce(
          (data, byte) => data + String.fromCharCode(byte),
          ""
        )
      );
      const audioUrl = `data:audio/wav;base64,${audioUrlBody}`;
      const { newTranscript, complete } = await todos.completeTranscript({
        audioUrl,
        previousTranscript,
        projectId,
      });
      console.log("set transcript to", newTranscript);
      setTranscript(newTranscript);
      if (complete) {
        setSubmitting(true);
        try {
          await onComplete(newTranscript);
          store.set(transcriptAtom, "");
        } finally {
          // Always clear the submitting flag, even if onComplete throws.
          setSubmitting(false);
        }
      }
    },
    // `transcript` intentionally omitted: it is read via the store above.
    [transcriptAtom, setTranscript, projectId, onComplete]
  );
  return (
    <div className="w-full">
      {transcript ? (
        <Markdown
          // "rounded-md" fixed (was "rounded-mn", not a valid Tailwind class)
          className={twMerge(
            "border-2 p-2 rounded-md bg-white",
            submitting ? "text-gray-400 bg-gray-200" : "text-gray-900"
          )}
        >
          {transcript}
        </Markdown>
      ) : (
        <div className="border-2 p-2 text-center rounded-md bg-white text-gray-400 pb-8">
          Waiting...
        </div>
      )}
      <div>
        <LiveVoiceWidget onAudio={onAudio} onStop={onStop} className="w-full" />
      </div>
    </div>
  );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment