@sandys
Created July 3, 2025 06:57
semantic vad
# install dependencies
npm i commander onnxruntime-node @xenova/transformers \
node-record-lpcm16 webrtcvad @xenova/whispercpp \
ws @google/generative-ai node-fetch
# export keys
export GEMINI_API_KEY="AIza..." # required
export OPENAI_API_KEY="sk-..." # only if provider=openai
# ▶ default (hybrid)
node voice_fsm.js
# ▶ onnx only (offline decision)
node voice_fsm.js --disable-tool
# ▶ tool-call only (cloud), with enhancement
node voice_fsm.js --disable-onnx --enhance
# ▶ enhancement via local Ollama with custom prompt
node voice_fsm.js --enhance --provider ollama --prompt ./myPrompt.md
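# note: the first run with the ONNX classifier enabled downloads the Turnsense
# model (~180 MB) into ./models/turnsense next to the script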
#!/usr/bin/env node
/*───────────────────────────────────────────────────────────────────────────┐
│ voice_fsm.js – Mic ► FSM ► (ONNX ∥ stay_silent) with VoiceInk prompts │
│ │
│ ❶ Finite-state machine & dual-cue segmentation (Rohan / J.A.R.V.I.S.) │
│ ❷ Local ONNX “Turnsense” end-of-utterance classifier (--disable-onnx) │
│ ❸ Gemini stay_silent() tool-call fallback (--disable-tool)│
│ ❹ **VoiceInk transcript-enhancement** (--enhance [gemini|openai|ollama]) │
│ ❺ Gemini real-time WebSocket activityStart / activityEnd markers │
│ │
│ npm i commander onnxruntime-node @xenova/transformers │
│ node-record-lpcm16 webrtcvad @xenova/whispercpp ws │
│ @google/generative-ai node-fetch │
│ │
│ Required env vars: │
│ GEMINI_API_KEY – for Gemini requests │
│ OPENAI_API_KEY (opt) – if --provider openai │
└───────────────────────────────────────────────────────────────────────────*/
/*────────────────────── 0. CLI flags ─────────────────────────────────────*/
import { Command } from 'commander';
const cli = new Command();
cli
  .option('--disable-onnx', 'skip the local ONNX classifier')
  .option('--disable-tool', 'skip the stay_silent fallback')
  .option('--enhance', 'run VoiceInk-style transcript enhancement')
  .option('--provider <p>', 'gemini|openai|ollama', 'gemini')
  .option('--prompt <file>', 'file with custom enhancement guidelines (used with --enhance)');
cli.parse();
const FLAGS = cli.opts();
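/* commander exposes the flags in camelCase: FLAGS.disableOnnx, FLAGS.disableTool,
   FLAGS.enhance, FLAGS.provider, FLAGS.prompt */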
/*────────────────────── 1. Imports & globals ─────────────────────────────*/
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import record from 'node-record-lpcm16';
import Vad from 'webrtcvad';
import ort from 'onnxruntime-node';
import { AutoTokenizer } from '@xenova/transformers';
import { pipeline as whisperPipe } from '@xenova/whispercpp';
import { GoogleGenerativeAI } from '@google/generative-ai';
import WebSocket from 'ws';
import fetch from 'node-fetch';
/*──────── 1-A. **VoiceInk prompt constants** ────────────────────────────*/
/* These are lifted verbatim from VoiceInk/Models/AIPrompts.swift † */
/* (angle-bracket tags removed there by Swift’s string interpolation) */
/* ---------------------------------------------------------------------- */
const VOICEINK_PROMPTS = {
customTemplate: `Your task is to reformat and enhance the text provided within <source> tags according to the following guidelines:
%s
IMPORTANT:
• The input will be wrapped in <source> tags to identify what needs enhancement.
• Your response MUST be **only** the enhanced text – **NO** tags.
• DO NOT output <source> tags in your response.`,
assistantMode: `You are a powerful AI assistant.
Your primary goal is to provide a direct, clean, and unadorned response to the user's <user_request>.
YOUR RESPONSE MUST BE *PURE*:
– NO commentary.
– NO “Here is the result:” prefixes.
– NO sign-offs.
– NO markdown unless essential.
– ONLY the direct answer or modified text requested.`,
contextInstructions: `Your task is to work ONLY with content inside <source> tags.
IMPORTANT: Any <context> section you receive is **just for reference**.
• If <context> and <source> contain similar names or terms, trust the spelling in <context>, since <source> may hold transcription errors.
• Use <context> only to understand intent; do NOT repeat it.`
};
/* The enhancement step below fills %s with the guideline */
/* “Fix grammar, remove filler words, keep the meaning.” */
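/* Illustrative example of what enhance() (section 5) sends to the provider:
     system : customTemplate with %s → "Fix grammar, remove filler words, keep original meaning."
              followed by contextInstructions
     user   : <source>uh so i think we should umm ship it on friday</source>
   The provider is expected to return only the cleaned sentence, with no tags. */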
/*──────── 1-B. Audio / FSM tunables ──────────────────────────────────────*/
const SR = 16_000; // sample rate
const CHUNK_MS = 20;
const ASR_WINDOW_MS = 400;
const SILENCE_CLOSE = 300;
const MAX_TURN_MS = 5_000;
const ONNX_THRESH = 0.90;
const VAD = new Vad(2); // 0-3 aggressiveness
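/* CHUNK_MS is the VAD frame size, ASR_WINDOW_MS how much audio is buffered before
   each Whisper pass, SILENCE_CLOSE and MAX_TURN_MS the two timing cues that close a
   turn, and ONNX_THRESH the end-of-utterance probability cut-off. */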
/*────────────────────── 2. Turnsense ONNX classifier ────────────────────*/
let onnxClassifier = null;
if (!FLAGS.disableOnnx) {
const __dir = path.dirname(fileURLToPath(import.meta.url));
const MODEL_DIR = path.join(__dir, 'models', 'turnsense');
const MODEL_FILE = path.join(MODEL_DIR, 'model_quantized.onnx');
const FILES = ['model_quantized.onnx','tokenizer.json',
'tokenizer_config.json','special_tokens_map.json',
'added_tokens.json','config.json'];
/* fetch the quantised model once */
const ensure = async () => {
try { await fs.access(MODEL_FILE); }
catch {
console.log('⬇ downloading Turnsense (~180 MB)…');
const { downloadFile } = await import('@huggingface/hub');
await fs.mkdir(MODEL_DIR,{recursive:true});
await Promise.all(FILES.map(f=>
downloadFile({
repo:{type:'model',name:'latishab/turnsense'},
path:f,
destination:path.join(MODEL_DIR,f)
})));
console.log('✔ model ready');
}
};
await ensure();
const tok  = await AutoTokenizer.from_pretrained(MODEL_DIR, { local_files_only: true });
const sess = await ort.InferenceSession.create(
  MODEL_FILE, { executionProviders: ['cpu'] });
const softmax = a => {
  const m  = Math.max(...a);
  const ex = a.map(x => Math.exp(x - m));
  const s  = ex.reduce((p, c) => p + c, 0);
  return ex.map(e => e / s);
};
/** returns true if the text is a likely end-of-utterance */
onnxClassifier = async txt => {
  const prompt = `<|user|> ${txt.trim()} <|im_end|>`;
  // tokenize with transformers.js; the returned tensors already hold int64 (BigInt64Array) data
  const enc = await tok(prompt, { padding: 'max_length', truncation: true, max_length: 256 });
  const ids = enc.input_ids.data;
  const msk = enc.attention_mask.data;
  const { logits } = await sess.run({
    input_ids:      new ort.Tensor('int64', ids, [1, ids.length]),
    attention_mask: new ort.Tensor('int64', msk, [1, msk.length])
  });
  return softmax(Array.from(logits.data))[1] >= ONNX_THRESH;
};
}
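/* NOTE: the classifier assumes Turnsense orders its two logits as
   [not-end-of-utterance, end-of-utterance]; if the exported model uses the
   opposite label order, compare index 0 against ONNX_THRESH instead. */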
/*────────────────────── 3. Local Whisper-cpp ASR worker ────────────────*/
const whisper = await whisperPipe(
'automatic-speech-recognition','Xenova/whisper-small.en',{quantized:true});
let bufFloat = []; // audio buffer (float32)
let bufMs = 0;
const validClause = t => /^[A-Z].*[.!?]$/.test(t.trim());
/** returns { text, clause } every 400 ms window */
async function transcribeWindow() {
if (bufMs < ASR_WINDOW_MS) return {text:'',clause:false};
const audio = Float32Array.from(bufFloat);
bufFloat = []; bufMs = 0;
const { text } = await whisper(audio,{sample_rate:SR});
return { text:text.trim(), clause:validClause(text) };
}
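/* Each ~400 ms window is transcribed independently and the buffer is cleared,
   so only the latest window's text reaches the FSM (it overwrites rather than
   appends; see S.CAPTURING below). */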
/*────────────────────── 4. Gemini helpers (chat + WebSocket) ───────────*/
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY);
const geminiChat = genAI.getGenerativeModel({model:'gemini-1.5-flash'});
const staySilentTool = [{
  functionDeclarations: [{
    name: 'stay_silent',
    description: 'Indicate the user is still speaking—return no answer.'
    // no parameters: the call itself is the signal
  }]
}];
async function staySilentDecision(text){
  const result = await geminiChat.generateContent({
    tools: staySilentTool,
    contents: [{ role: 'user', parts: [{ text }] }],
    systemInstruction: 'If the user seems mid-utterance, CALL stay_silent.'
  });
  const resp = result.response;
  // a stay_silent function call means the user is still talking, so return no answer
  if (resp.functionCalls()?.length) return { silent: true };
  let answer = '';
  try { answer = resp.text().trim(); } catch { /* no text candidate */ }
  return { silent: false, answer };
}
/* WebSocket (we only send activity markers) */
let ws;
const openWs = ()=>{
const url = `wss://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:streamGenerateContent?key=${process.env.GEMINI_API_KEY}`;
ws = new WebSocket(url);
ws.on('open',()=>console.log('🔌 Gemini RT socket connected'));
};
openWs();
const emit = (event,meta={})=>{
if (ws?.readyState===1) ws.send(JSON.stringify({[event]:meta}));
};
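/* The {activityStart:{…}} / {activityEnd:{…}} frames are this script's own
   lightweight markers; no audio or text is streamed over this socket. */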
/*────────────────────── 5. VoiceInk enhancement layer ──────────────────*/
async function enhance(text){
if (!FLAGS.enhance) return text; // feature off
const guidelines = FLAGS.prompt
  ? (await fs.readFile(FLAGS.prompt, 'utf8')).trim()   // custom guidelines via --prompt
  : 'Fix grammar, remove filler words, keep original meaning.';
const system = VOICEINK_PROMPTS.customTemplate.replace('%s',guidelines)
+ '\n\n' + VOICEINK_PROMPTS.contextInstructions;
const body = `<source>${text}</source>`;
let attempt = 0;
while (attempt < 3){
try{
switch(FLAGS.provider){
case 'gemini':{
const res = await geminiChat.generateContent({
  contents: [{ role: 'user', parts: [{ text: body }] }],
  systemInstruction: system,
  generationConfig: { temperature: 0.3 }
});
return res.response.text().trim();
}
case 'openai':{
const r = await fetch('https://api.openai.com/v1/chat/completions',{
method:'POST',
headers:{
'Content-Type':'application/json',
'Authorization':`Bearer ${process.env.OPENAI_API_KEY}`
},
body:JSON.stringify({
model:'gpt-3.5-turbo',
temperature:0.3,
messages:[
{role:'system',content:system},
{role:'user',content:body}
]
})
});
const j = await r.json();
return j.choices[0].message.content.trim();
}
case 'ollama':{
const r = await fetch('http://localhost:11434/api/chat',{
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({
model:'llama3:8b-instruct-q4_K_M',
stream:false,               // Ollama streams by default; request a single JSON reply
options:{temperature:0.3},
messages:[
{role:'system',content:system},
{role:'user',content:body}
]
})
});
const j = await r.json();
return j.message.content.trim();
}
}
}catch{ attempt++; await new Promise(r=>setTimeout(r,1000*attempt)); }
}
return text; // graceful degrade
}
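/* enhance() makes up to three attempts with a growing back-off (1 s, 2 s, 3 s);
   if every attempt fails it returns the raw transcript unchanged. */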
/*────────────────────── 6. FSM states ─────────────────────────────────*/
const S = {IDLE:0,CAPTURING:1,FLUSHING:2};
let state = S.IDLE;
let lastSpeech = 0;
let turnStart = 0;
let transcript = '';
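/* Transitions (driven by the mic loop below):
     IDLE ──voiced frame──▶ CAPTURING
     CAPTURING ──silence ≥ SILENCE_CLOSE / complete clause / MAX_TURN_MS──▶ FLUSHING
     FLUSHING ──enhance → ONNX EOU → stay_silent → echo──▶ reset() back to IDLE */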
/*────────────────────── 7. Mic capture loop ───────────────────────────*/
const mic = record.record({sampleRate:SR,channels:1,threshold:0}).stream();
console.log('🎤 Speak… (Ctrl-C to quit)');
mic.on('data',async chunk=>{
/* 7-A. feed VAD + Whisper buffer */
const voiced = VAD.processAudio(chunk,SR);
const now = Date.now();
if (voiced) lastSpeech = now;
const pcm = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / 2);
// NOTE: Int16Array#map would coerce the result back to int16 (all zeros), so convert explicitly
bufFloat.push(...Array.from(pcm, s => s / 32768));
bufMs += CHUNK_MS;
/* 7-B. FSM */
switch(state){
case S.IDLE:
if (voiced){ state=S.CAPTURING; turnStart=now; emit('activityStart',{type:'AUDIO'}); }
break;
case S.CAPTURING:{
const {text,clause}=await transcribeWindow();
if (text) transcript = text;
const silent = now - lastSpeech >= SILENCE_CLOSE;
const timeout = now - turnStart >= MAX_TURN_MS;
if (silent || (clause && transcript) || timeout) state=S.FLUSHING;
}break;
case S.FLUSHING:{
if (!transcript){ reset(); break; }
/* ① VoiceInk enhancement (optional) */
const finalText = FLAGS.enhance ? await enhance(transcript) : transcript;
if (FLAGS.enhance) console.log('📝 enhanced:',finalText);
/* ② ONNX decision */
let handled=false;
if (!FLAGS.disableOnnx){
const eou = await onnxClassifier?.(finalText);
if (eou){ console.log('>>>',finalText); handled=true; }
}
/* ③ Gemini stay_silent */
if (!handled && !FLAGS.disableTool){
const {silent,answer}=await staySilentDecision(finalText);
if (!silent){ console.log('🤖',answer); handled=true; }
}
/* ④ Fallback echo */
if (!handled) console.log('>>>',finalText,'[no decision]');
reset();
}break;
}
});
/* helper */
function reset(){ transcript=''; state=S.IDLE; emit('activityEnd'); }