An example of using @ricky0123/vad-react with buffered audio/transcription
// Note: this isn't complete, but is an extraction of most of the relevant bits of a working module
import { useMicVAD, utils } from "@ricky0123/vad-react";
// Likely sources for the other identifiers used below; app-local pieces
// (Button, Markdown, transcribe, TranscriptionType, todos) are omitted, per the note above.
import { useCallback, useMemo, useRef, useState } from "react";
import { atom, getDefaultStore, useAtom } from "jotai";
import { twMerge } from "tailwind-merge";
import {
  ChatBubbleOvalLeftEllipsisIcon,
  ChatBubbleOvalLeftIcon,
  ClockIcon,
  CloudArrowUpIcon,
  MicrophoneIcon,
  StopCircleIcon,
} from "@heroicons/react/24/outline";

const FLUSH_AFTER_SPEECH_LENGTH = 20000; // transcribe at the first break after 20 seconds of speech
const SILENCE_LENGTH = 2000; // transcribe after 2 seconds of non-speech
// Records from the microphone with VAD and hands completed speech segments to AudioTranscriber.
export function LiveVoiceWidget({
  onTranscript,
  onAudio,
  onStop,
  className,
}: {
  onTranscript?: (transcript: TranscriptionType) => Promise<void>;
  onAudio?: (audio: Float32Array) => Promise<void>;
  onStop?: () => void;
  className?: string;
}) {
  const [isTranscribing, setIsTranscribing] = useState(false);
  const transcriber = useRef(
    new AudioTranscriber({
      onTranscript,
      onAudio,
      onStartTranscribing: () => {
        setIsTranscribing(true);
      },
      onStopTranscribing: () => {
        setIsTranscribing(false);
      },
    })
  );
  const lastStart = useRef(0);
  const { listening, errored, loading, userSpeaking, pause, start } = useMicVAD(
    {
      startOnLoad: true,
      minSpeechFrames: 2,
      submitUserSpeechOnPause: true,
      preSpeechPadFrames: 2,
      onSpeechStart: () => {
        console.log(`Speech started${lastStart.current ? " again" : ""}`);
        if (lastStart.current === 0) {
          lastStart.current = Date.now();
        }
        transcriber.current.onSpeechStart();
      },
      onSpeechEnd: async (audio) => {
        const startTime = lastStart.current;
        const time = Date.now() - lastStart.current;
        lastStart.current = 0;
        console.log("length of audio", audio.length, time);
        transcriber.current.queueAudio(audio, startTime);
      },
    }
  );
  // If the VAD has stopped listening, flush any audio that's still queued.
  if (!listening) {
    transcriber.current.maybeFlush();
  }
  if (loading) {
    return (
      <div
        className={twMerge(
          "rounded-md border bg-red-800 text-center",
          className
        )}
      >
        <ClockIcon className="h-4 w-4 text-white inline-block" />
      </div>
    );
  }
  if (errored) {
    return (
      <div className={twMerge(className, "bg-red-800 text-white")}>
        Error: {errored}
      </div>
    );
  }
  if (!listening) {
    return (
      <Button
        className={twMerge("bg-red-900 text-white hover:bg-red-700", className)}
        onClick={start}
        Icon={MicrophoneIcon}
      />
    );
  }
  return (
    <div>
      <Button
        onClick={() => {
          pause();
          if (onStop) {
            onStop();
          }
        }}
        className={twMerge(
          "bg-red-600 text-red-300 hover:bg-red-400 hover:text-white",
          className
        )}
      >
        {isTranscribing ? (
          <CloudArrowUpIcon className="h-4 w-4 text-white inline-block" />
        ) : (
          <StopCircleIcon className="h-4 w-4 text-red-300 hover:text-white inline-block" />
        )}
        {userSpeaking ? (
          <ChatBubbleOvalLeftEllipsisIcon className="h-4 w-4 text-white inline-block" />
        ) : (
          <ChatBubbleOvalLeftIcon className="h-4 w-4 text-red-300 inline-block" />
        )}
      </Button>
    </div>
  );
}
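
// --- Usage sketch (not part of the original module) ---
// A minimal example of wiring LiveVoiceWidget to a transcript-only handler.
// TranscriptionType is app-local; this sketch only assumes it carries a `text`
// field, which is also what AudioTranscriber.flushTranscript() reads below.
export function LiveVoiceNotesExample() {
  const [lines, setLines] = useState<string[]>([]);
  return (
    <div>
      <ul>
        {lines.map((line, i) => (
          <li key={i}>{line}</li>
        ))}
      </ul>
      <LiveVoiceWidget
        onTranscript={async (t) => {
          setLines((prev) => [...prev, t.text]);
        }}
        onStop={() => console.log("recording stopped")}
      />
    </div>
  );
}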
// Buffers speech segments and flushes them, either after a long enough run of
// speech or after a pause, to the transcription and/or raw-audio callbacks.
export class AudioTranscriber {
  queue: Float32Array[] = [];
  isTranscribing = false;
  history: string[] = [];
  continueFlush = false;
  timerId = 0;
  queueStartTime = 0;
  onTranscript?: (t: TranscriptionType) => Promise<void>;
  onAudio?: (audio: Float32Array) => Promise<void>;
  onStartTranscribing: () => void;
  onStopTranscribing: () => void;

  constructor({
    onTranscript,
    onAudio,
    onStartTranscribing,
    onStopTranscribing,
  }: {
    onTranscript?: (t: TranscriptionType) => Promise<void>;
    onAudio?: (audio: Float32Array) => Promise<void>;
    onStartTranscribing?: () => void;
    onStopTranscribing?: () => void;
  }) {
    this.onTranscript = onTranscript;
    this.onAudio = onAudio;
    this.onStartTranscribing = onStartTranscribing || (() => {});
    this.onStopTranscribing = onStopTranscribing || (() => {});
  }

  public async queueAudio(audio: Float32Array, startTime: number) {
    if (this.queueStartTime === 0) {
      this.queueStartTime = startTime;
    }
    this.queue.push(audio);
    console.log("Time of audio", Date.now() - this.queueStartTime);
    if (
      Date.now() - this.queueStartTime >= FLUSH_AFTER_SPEECH_LENGTH &&
      !this.isTranscribing
    ) {
      console.log("Flushing due to total length");
      await this.flush();
    }
    if (this.timerId) {
      console.log("canceling flush");
      clearTimeout(this.timerId);
    }
    this.timerId = setTimeout(() => {
      console.log("flushing after silence");
      this.timerId = 0;
      this.maybeFlush();
    }, SILENCE_LENGTH) as unknown as number;
  }

  public onSpeechStart() {
    if (this.timerId) {
      console.log("resetting flush timer");
      clearTimeout(this.timerId);
      this.timerId = 0;
    }
  }

  maybeFlush() {
    if (this.queue.length > 0) {
      if (this.isTranscribing) {
        this.continueFlush = true;
      } else {
        this.flush();
      }
    }
  }

  // Combine queued audio, run transcription, clear the queue, and repeat if there's leftover audio.
  private async flush() {
    if (this.onTranscript) {
      await this.flushTranscript();
    }
    if (this.onAudio) {
      await this.flushAudio();
    }
  }

  async flushTranscript() {
    this.isTranscribing = true;
    try {
      this.onStartTranscribing();
      const combined = this.combineQueue();
      this.queueStartTime = 0;
      const prompt = this.history.join(" ").slice(-100); // adjust slice as needed
      // Convert to WAV (or whatever format) and transcribe
      const array = utils.encodeWAV(combined, 1, 16000);
      const blob = new Blob([array], { type: "audio/wav" });
      const transcript = await transcribe(blob, {
        prompt,
        mimetype: "audio/wav",
      });
      // Push new text to history, notify caller, then see if new audio arrived mid-transcription
      this.history.push(transcript.text);
      await this.onTranscript!(transcript);
    } finally {
      this.isTranscribing = false;
      this.onStopTranscribing();
    }
    // If there's new audio in the queue (arrived during transcription), keep flushing
    if (
      this.queue.length > 0 &&
      Date.now() - this.queueStartTime >= FLUSH_AFTER_SPEECH_LENGTH &&
      !this.isTranscribing
    ) {
      await this.flush();
    } else if (this.continueFlush) {
      this.continueFlush = false;
      if (this.queue.length > 0) {
        this.flush();
      }
    }
  }

  async flushAudio() {
    this.isTranscribing = true;
    try {
      this.onStartTranscribing();
      const combined = this.combineQueue();
      this.queueStartTime = 0;
      await this.onAudio!(combined);
    } finally {
      this.isTranscribing = false;
      this.onStopTranscribing();
    }
  }

  // Simple combination of the queued Float32Array segments into one
  private combineQueue(): Float32Array {
    const totalLen = this.queue.reduce((sum, a) => sum + a.length, 0);
    const combined = new Float32Array(totalLen);
    let offset = 0;
    for (const chunk of this.queue) {
      combined.set(chunk, offset);
      offset += chunk.length;
    }
    // Reset the queue
    this.queue = [];
    return combined;
  }
}
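
// The `transcribe()` helper and `TranscriptionType` used above are app-local and
// not included in this extraction. Based on how they're used here (an encodeWAV
// result wrapped in a Blob, a prompt/mimetype options bag, and a `.text` field
// on the result), the assumed shape is roughly:
//
//   interface TranscriptionType {
//     text: string;
//     // ...whatever else the transcription backend returns
//   }
//
//   declare function transcribe(
//     blob: Blob,
//     options: { prompt?: string; mimetype?: string }
//   ): Promise<TranscriptionType>;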
export function LiveVoiceTranscript({
  onComplete,
  onStop,
  projectId,
}: {
  onComplete: (transcript: string) => Promise<void>;
  onStop?: () => void;
  projectId: number;
}) {
  const transcriptAtom = useMemo(() => atom(""), []);
  const [transcript, setTranscript] = useAtom(transcriptAtom);
  const [submitting, setSubmitting] = useState(false);
  const onAudio = useCallback(
    async (audio: Float32Array) => {
      const store = getDefaultStore();
      // Not sure why I can't use the external-to-the-closure transcript here, but apparently I can't
      const transcript = store.get(transcriptAtom);
      const array = utils.encodeWAV(audio, 1, 16000);
      const audioBlob = new Blob([array], { type: "audio/wav" });
      const audioUrlBody = btoa(
        new Uint8Array(await audioBlob.arrayBuffer()).reduce(
          (data, byte) => data + String.fromCharCode(byte),
          ""
        )
      );
      const audioUrl = `data:audio/wav;base64,${audioUrlBody}`;
      const { newTranscript, explanationText, audioTranscript, complete } =
        await todos.completeTranscript({
          audioUrl,
          previousTranscript: transcript,
          projectId,
        });
      console.log("set transcript to", newTranscript);
      setTranscript(newTranscript);
      if (complete) {
        setSubmitting(true);
        await onComplete(newTranscript);
        store.set(transcriptAtom, "");
        setSubmitting(false);
      }
    },
    [transcript, setTranscript, projectId]
  );
  return (
    <div className="w-full">
      {transcript ? (
        <Markdown
          className={twMerge(
            "border-2 p-2 rounded-md bg-white",
            submitting ? "text-gray-400 bg-gray-200" : "text-gray-900"
          )}
        >
          {transcript}
        </Markdown>
      ) : (
        <div className="border-2 p-2 text-center rounded-md bg-white text-gray-400 pb-8">
          Waiting...
        </div>
      )}
      <div>
        <LiveVoiceWidget onAudio={onAudio} onStop={onStop} className="w-full" />
      </div>
    </div>
  );
}
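
// --- Usage sketch (not part of the original module) ---
// How LiveVoiceTranscript might be mounted. The projectId prop comes from the
// surrounding app; here the completed transcript is simply logged.
export function VoiceTodoEntryExample({ projectId }: { projectId: number }) {
  return (
    <LiveVoiceTranscript
      projectId={projectId}
      onComplete={async (transcript) => {
        console.log("Completed transcript:", transcript);
      }}
      onStop={() => console.log("Recording stopped")}
    />
  );
}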