semantic vad
# install dependencies
npm i commander onnxruntime-node @xenova/transformers \
      node-record-lpcm16 webrtcvad @xenova/whispercpp \
      ws @google/generative-ai node-fetch

# export keys
export GEMINI_API_KEY="AIza..."   # required
export OPENAI_API_KEY="sk-..."    # only if provider=openai

# ▶ default (hybrid)
node voice_fsm.js

# ▶ onnx only (offline decision)
node voice_fsm.js --disable-tool

# ▶ tool-call only (cloud), with enhancement
node voice_fsm.js --disable-onnx --enhance

# ▶ enhancement via local Ollama with custom prompt
node voice_fsm.js --enhance --provider ollama --prompt ./myPrompt.md
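
# note: voice_fsm.js uses ESM imports and top-level await, so Node must load
# it as an ES module — rename it to voice_fsm.mjs, or run it from a package
# whose package.json contains:
#   { "type": "module" }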
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node
/*───────────────────────────────────────────────────────────────────────────┐
│ voice_fsm.js – Mic ► FSM ► (ONNX ∥ stay_silent) with VoiceInk prompts      │
│                                                                            │
│ ❶ Finite-state machine & dual-cue segmentation (Rohan / J.A.R.V.I.S.)      │
│ ❷ Local ONNX “Turnsense” end-of-utterance classifier  (--disable-onnx)     │
│ ❸ Gemini stay_silent() tool-call fallback             (--disable-tool)     │
│ ❹ **VoiceInk transcript-enhancement** (--enhance [gemini|openai|ollama])   │
│ ❺ Gemini real-time WebSocket activityStart / activityEnd markers           │
│                                                                            │
│ npm i commander onnxruntime-node @xenova/transformers                      │
│       node-record-lpcm16 webrtcvad @xenova/whispercpp ws                   │
│       @google/generative-ai node-fetch                                     │
│                                                                            │
│ Required env vars:                                                         │
│   GEMINI_API_KEY        – for Gemini requests                              │
│   OPENAI_API_KEY (opt)  – if --provider openai                             │
└───────────────────────────────────────────────────────────────────────────*/
/*────────────────────── 0. CLI flags ─────────────────────────────────────*/
import { Command } from 'commander';
const cli = new Command();
cli
  .option('--disable-onnx', 'skip the local ONNX classifier')
  .option('--disable-tool', 'skip the stay_silent fallback')
  .option('--enhance', 'run VoiceInk-style transcript enhancement')
  .option('--provider <p>', 'gemini|openai|ollama', 'gemini')
  .option('--prompt <file>', 'file with custom enhancement guidelines');
cli.parse();
const FLAGS = cli.opts();
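/* Commander camel-cases the flag names, so e.g.
     node voice_fsm.js --disable-onnx --enhance
   yields FLAGS = { disableOnnx: true, enhance: true, provider: 'gemini' }. */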
/*────────────────────── 1. Imports & globals ─────────────────────────────*/
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import record from 'node-record-lpcm16';
import Vad from 'webrtcvad';
import ort from 'onnxruntime-node';
import { AutoTokenizer } from '@xenova/transformers';
import { pipeline as whisperPipe } from '@xenova/whispercpp';
import { GoogleGenerativeAI } from '@google/generative-ai';
import WebSocket from 'ws';
import fetch from 'node-fetch';
/*──────── 1-A. **VoiceInk prompt constants** ────────────────────────────*/
/* These are lifted verbatim from VoiceInk/Models/AIPrompts.swift         */
/* (angle-bracket tags removed there by Swift's string interpolation)     */
/* ---------------------------------------------------------------------- */
const VOICEINK_PROMPTS = {
  customTemplate: `Your task is to reformat and enhance the text provided within <source> tags according to the following guidelines:
%s
IMPORTANT:
• The input will be wrapped in <source> tags to identify what needs enhancement.
• Your response MUST be **only** the enhanced text – **NO** tags.
• DO NOT output <source> tags in your response.`,
  assistantMode: `You are a powerful AI assistant.
Your primary goal is to provide a direct, clean, and unadorned response to the user's <user_request>.
YOUR RESPONSE MUST BE *PURE*:
– NO commentary.
– NO “Here is the result:” prefixes.
– NO sign-offs.
– NO markdown unless essential.
– ONLY the direct answer or modified text requested.`,
  contextInstructions: `Your task is to work ONLY with content inside <source> tags.
IMPORTANT: Any <context> section you receive is **just for reference**.
• If <context> and <source> contain similar names or terms, trust the spelling in <context>, since <source> may hold transcription errors.
• Use <context> only to understand intent; do NOT repeat it.`
};
/* The enhancement step below fills %s with the guideline                 */
/* “Fix grammar, remove filler words, keep original meaning.”             */
/* (or with the contents of --prompt <file>, when given).                 */
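/* With the default guideline, the system prompt sent to the provider
   therefore begins: "Your task is to reformat and enhance the text provided
   within <source> tags according to the following guidelines:
   Fix grammar, remove filler words, keep original meaning. …"
   followed by contextInstructions (see enhance() below). */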
/*──────── 1-B. Audio / FSM tunables ──────────────────────────────────────*/
const SR            = 16_000; // sample rate (Hz)
const CHUNK_MS      = 20;     // mic chunk duration (ms)
const ASR_WINDOW_MS = 400;    // transcription window (ms)
const SILENCE_CLOSE = 300;    // ms of silence that closes a turn
const MAX_TURN_MS   = 5_000;  // hard cap on a single turn (ms)
const ONNX_THRESH   = 0.90;   // end-of-utterance probability threshold
const VAD           = new Vad(2); // aggressiveness 0–3
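/* At 16 kHz mono 16-bit PCM, one 20 ms chunk is 16 000 × 0.02 = 320 samples
   (640 bytes), so a 400 ms ASR window accumulates 20 chunks ≈ 6 400 samples
   before Whisper runs. */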
/*────────────────────── 2. Turnsense ONNX classifier ────────────────────*/
let onnxClassifier = null;
if (!FLAGS.disableOnnx) {
  const __dir      = path.dirname(fileURLToPath(import.meta.url));
  const MODEL_DIR  = path.join(__dir, 'models', 'turnsense');
  const MODEL_FILE = path.join(MODEL_DIR, 'model_quantized.onnx');
  const FILES = ['model_quantized.onnx', 'tokenizer.json',
                 'tokenizer_config.json', 'special_tokens_map.json',
                 'added_tokens.json', 'config.json'];
  /* fetch the quantised model once */
  const ensure = async () => {
    try { await fs.access(MODEL_FILE); }
    catch {
      console.log('⬇ downloading Turnsense (~180 MB)…');
      const { downloadFile } = await import('@huggingface/hub');
      await fs.mkdir(MODEL_DIR, { recursive: true });
      /* downloadFile resolves to a Response; write its bytes to disk */
      await Promise.all(FILES.map(async f => {
        const res = await downloadFile({ repo: 'latishab/turnsense', path: f });
        if (!res) throw new Error(`'${f}' not found in repo`);
        await fs.writeFile(path.join(MODEL_DIR, f),
                           Buffer.from(await res.arrayBuffer()));
      }));
      console.log('✔ model ready');
    }
  };
  await ensure();
  const tok  = await AutoTokenizer.from_pretrained(MODEL_DIR, { local_files_only: true });
  const sess = await ort.InferenceSession.create(
    MODEL_FILE, { executionProviders: ['CPUExecutionProvider'] });
  const softmax = a => {
    const m  = Math.max(...a);
    const ex = [...a].map(x => Math.exp(x - m));
    const s  = ex.reduce((p, c) => p + c, 0);
    return ex.map(e => e / s);
  };
  /** returns true if the text is a likely end-of-utterance */
  onnxClassifier = async txt => {
    const prompt = `<|user|> ${txt.trim()} <|im_end|>`;
    /* calling the tokenizer yields int64 tensors ready for ORT */
    const enc = await tok(prompt,
      { padding: 'max_length', truncation: true, max_length: 256 });
    const { logits } = await sess.run({
      input_ids:      new ort.Tensor('int64', enc.input_ids.data,      [1, 256]),
      attention_mask: new ort.Tensor('int64', enc.attention_mask.data, [1, 256])
    });
    return softmax(logits.data)[1] >= ONNX_THRESH;
  };
}
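/* Example (hypothetical transcripts — actual scores depend on the model):
     await onnxClassifier('so what I was thinking is')  // → false, trailing clause
     await onnxClassifier('Turn off the lights.')       // → true,  complete request
*/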
/*────────────────────── 3. Local Whisper-cpp ASR worker ────────────────*/
const whisper = await whisperPipe(
  'automatic-speech-recognition', 'Xenova/whisper-small.en', { quantized: true });
let bufFloat = []; // audio buffer (float32 samples in −1..1)
let bufMs    = 0;  // milliseconds currently buffered
const validClause = t => /^[A-Z].*[.!?]$/.test(t.trim()); // capitalised + terminated
/** returns { text, clause } every 400 ms window */
async function transcribeWindow() {
  if (bufMs < ASR_WINDOW_MS) return { text: '', clause: false };
  const audio = Float32Array.from(bufFloat);
  bufFloat = []; bufMs = 0;
  const { text } = await whisper(audio, { sample_rate: SR });
  return { text: text.trim(), clause: validClause(text) };
}
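/* The buffer is cleared on every transcription, so windows never overlap;
   `transcript` in the FSM below simply keeps the most recent window's text. */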
/*────────────────────── 4. Gemini helpers (chat + WebSocket) ───────────*/
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY);
const geminiChat = genAI.getGenerativeModel({ model: 'gemini-1.5-flash' });
const staySilentTool = [{
  functionDeclarations: [{
    name: 'stay_silent',
    description: 'Indicate the user is still speaking—return no answer.',
    parameters: { type: 'object', properties: {}, required: [] }
  }]
}];
async function staySilentDecision(text) {
  const res = await geminiChat.generateContent({
    tools: staySilentTool,
    contents: [{ role: 'user', parts: [{ text }] }],
    systemInstruction: 'If the user seems mid-utterance, CALL stay_silent.'
  });
  const calls = res.response.functionCalls();
  if (calls?.some(c => c.name === 'stay_silent')) return { silent: true };
  return { silent: false, answer: res.response.text() ?? '' };
}
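/* Expected contract: a mid-utterance fragment such as "so what I want is"
   should come back { silent: true }; a complete question returns
   { silent: false, answer: '…' }. */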
/* WebSocket (we only send activity markers) */
let ws;
const openWs = () => {
  const url = `wss://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:streamGenerateContent?key=${process.env.GEMINI_API_KEY}`;
  ws = new WebSocket(url);
  ws.on('open', () => console.log('🔌 Gemini RT socket connected'));
};
openWs();
const emit = (event, meta = {}) => {
  if (ws?.readyState === 1) ws.send(JSON.stringify({ [event]: meta })); // 1 = OPEN
};
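/* e.g. emit('activityStart', { type: 'AUDIO' }) sends the frame
   {"activityStart":{"type":"AUDIO"}} — markers only; no audio is
   streamed over this socket. */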
/*────────────────────── 5. VoiceInk enhancement layer ──────────────────*/
async function enhance(text) {
  if (!FLAGS.enhance) return text; // feature off
  const guidelines = FLAGS.prompt
    ? (await fs.readFile(FLAGS.prompt, 'utf8')).trim()
    : 'Fix grammar, remove filler words, keep original meaning.';
  const system = VOICEINK_PROMPTS.customTemplate.replace('%s', guidelines)
               + '\n\n' + VOICEINK_PROMPTS.contextInstructions;
  const body = `<source>${text}</source>`;
  let attempt = 0;
  while (attempt < 3) {
    try {
      switch (FLAGS.provider) {
        case 'gemini': {
          const res = await geminiChat.generateContent({
            contents: [{ role: 'user', parts: [{ text: body }] }],
            systemInstruction: system,
            generationConfig: { temperature: 0.3 }
          });
          return res.response.text().trim();
        }
        case 'openai': {
          const r = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: {
              'Content-Type': 'application/json',
              'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`
            },
            body: JSON.stringify({
              model: 'gpt-3.5-turbo',
              temperature: 0.3,
              messages: [
                { role: 'system', content: system },
                { role: 'user',   content: body }
              ]
            })
          });
          const j = await r.json();
          return j.choices[0].message.content.trim();
        }
        case 'ollama': {
          const r = await fetch('http://localhost:11434/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: 'llama3:8b-instruct-q4_K_M',
              stream: false, // /api/chat streams NDJSON by default
              options: { temperature: 0.3 },
              messages: [
                { role: 'system', content: system },
                { role: 'user',   content: body }
              ]
            })
          });
          const j = await r.json();
          return j.message.content.trim();
        }
        default:
          return text; // unknown provider → no-op (avoids an infinite loop)
      }
    } catch { attempt++; await new Promise(r => setTimeout(r, 1000 * attempt)); }
  }
  return text; // graceful degrade
}
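/* Hypothetical example:
     await enhance('um so uh turn off the the lights')
   would be expected to yield something like 'Turn off the lights.' */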
/*────────────────────── 6. FSM states ─────────────────────────────────*/
const S = { IDLE: 0, CAPTURING: 1, FLUSHING: 2 };
let state      = S.IDLE;
let lastSpeech = 0;  // timestamp of last voiced frame
let turnStart  = 0;  // timestamp when the current turn began
let transcript = ''; // latest ASR window text
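/* Transitions:
     IDLE ──voiced frame──► CAPTURING
     CAPTURING ──silence ≥ 300 ms | complete clause | 5 s timeout──► FLUSHING
     FLUSHING ──decision emitted──► IDLE (via reset()) */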
/*────────────────────── 7. Mic capture loop ───────────────────────────*/
const mic = record.record({ sampleRate: SR, channels: 1, threshold: 0 }).stream();
console.log('🎤 Speak… (Ctrl-C to quit)');
mic.on('data', async chunk => {
  /* 7-A. feed VAD + Whisper buffer */
  const voiced = VAD.processAudio(chunk, SR);
  const now = Date.now();
  if (voiced) lastSpeech = now;
  const pcm = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / 2);
  /* note: pcm.map() would stay an Int16Array and truncate s/32768 to 0,
     so convert sample-by-sample instead */
  for (const s of pcm) bufFloat.push(s / 32768);
  bufMs += CHUNK_MS;
  /* 7-B. FSM */
  switch (state) {
    case S.IDLE:
      if (voiced) { state = S.CAPTURING; turnStart = now; emit('activityStart', { type: 'AUDIO' }); }
      break;
    case S.CAPTURING: {
      const { text, clause } = await transcribeWindow();
      if (text) transcript = text;
      const silent  = now - lastSpeech >= SILENCE_CLOSE;
      const timeout = now - turnStart  >= MAX_TURN_MS;
      if (silent || (clause && transcript) || timeout) state = S.FLUSHING;
    } break;
    case S.FLUSHING: {
      if (!transcript) { reset(); break; }
      /* ① VoiceInk enhancement (optional) */
      const finalText = FLAGS.enhance ? await enhance(transcript) : transcript;
      if (FLAGS.enhance) console.log('📝 enhanced:', finalText);
      /* ② ONNX decision */
      let handled = false;
      if (!FLAGS.disableOnnx) {
        const eou = await onnxClassifier?.(finalText);
        if (eou) { console.log('>>>', finalText); handled = true; }
      }
      /* ③ Gemini stay_silent */
      if (!handled && !FLAGS.disableTool) {
        const { silent, answer } = await staySilentDecision(finalText);
        if (!silent) { console.log('🤖', answer); handled = true; }
      }
      /* ④ Fallback echo */
      if (!handled) console.log('>>>', finalText, '[no decision]');
      reset();
    } break;
  }
});
/* helper */
function reset() { transcript = ''; state = S.IDLE; emit('activityEnd'); }