Skip to content

Instantly share code, notes, and snippets.

@Boorj
Last active January 21, 2025 12:05
Show Gist options
  • Save Boorj/7487017634afb0a98e467ed099b43604 to your computer and use it in GitHub Desktop.
Save Boorj/7487017634afb0a98e467ed099b43604 to your computer and use it in GitHub Desktop.
Vosk speech-to-text (STT) model wrapper for Node.js using the ffi-rs library

Updated version of vosk for node

Nothing difficult: just changed the `define` signatures and wrapped each call's parameter group into a tuple.

// @ts-check
'use strict';
/**
* @module vosk
*/
const os = require('os');
const path = require('path');
const {DataType, open, close, define} = require('ffi-rs');
const fs = require('fs');
// Resolve the platform-specific path to the bundled libvosk shared library.
// On Windows the DLL directory is also prepended to PATH so that dependent
// DLLs next to libvosk.dll can be resolved by the loader.
const soname = (() => {
  const platform = os.platform();
  if (platform === 'win32') {
    const dllDirectory = path.resolve(path.join(__dirname, 'lib', 'win-x86_64'));
    process.env.Path = dllDirectory + path.delimiter + process.env.Path;
    return path.join(__dirname, 'lib', 'win-x86_64', 'libvosk.dll');
  }
  if (platform === 'darwin') {
    return path.join(__dirname, 'lib', 'osx-universal', 'libvosk.dylib');
  }
  // Anything else falls back to the Linux builds (x86_64 unless on arm64).
  const linuxDir = platform === 'linux' && os.arch() === 'arm64' ? 'linux-arm64' : 'linux-x86_64';
  return path.join(__dirname, 'lib', linuxDir, 'libvosk.so');
})();
// Fail fast with a clear message if the native library is not bundled.
if (!fs.existsSync(soname)) {
  throw new Error(`File doesn't exist: ${soname}`);
}

// Register the shared library with ffi-rs under the id 'libvosk';
// every define() entry below references it by this library name.
open({ library: 'libvosk', path: soname });
// FFI bindings for the Vosk C API. Each defined function is invoked with its
// C arguments packed into a single tuple (array) — that is the ffi-rs calling
// convention. DataType.External marks opaque C pointers (model/recognizer
// handles); DataType.U8Array passes raw PCM audio buffers.
/** @type {LibVosk} */
const libvosk = define({
vosk_set_log_level : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.I32 ]},
vosk_model_new : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.String ]},
vosk_model_free : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External ]},
vosk_spk_model_new : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.String ]},
vosk_spk_model_free : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External ]},
vosk_recognizer_new : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float ]},
vosk_recognizer_new_spk : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float, DataType.External]},
vosk_recognizer_new_grm : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float, DataType.String ]},
vosk_recognizer_free : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External ]},
vosk_recognizer_set_max_alternatives: {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External, DataType.I32 ]},
vosk_recognizer_set_words : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External, DataType.Boolean ]},
vosk_recognizer_set_partial_words : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External, DataType.Boolean ]},
vosk_recognizer_set_spk_model : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External, DataType.External ]},
vosk_recognizer_accept_waveform : {library: 'libvosk', retType: DataType.Boolean , paramsType: [DataType.External, DataType.U8Array, DataType.I32 ]},
vosk_recognizer_result : {library: 'libvosk', retType: DataType.String , paramsType: [DataType.External ]},
vosk_recognizer_final_result : {library: 'libvosk', retType: DataType.String , paramsType: [DataType.External ]},
vosk_recognizer_partial_result : {library: 'libvosk', retType: DataType.String , paramsType: [DataType.External ]},
vosk_recognizer_reset : {library: 'libvosk', retType: DataType.Void , paramsType: [DataType.External ]}
});
/**
 * Set log level for Kaldi messages.
 * @param {number} level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
 */
function setLogLevel(level) {
  // ffi-rs calls take their C arguments packed into one tuple.
  const args = [level];
  libvosk.vosk_set_log_level(args);
}
/**
 * Build a Model from a model file.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class Model {
  /**
   * Build a Model to be used with the voice recognition. Each language should have its own Model
   * for the speech recognition to work.
   * @param {string} modelPath The abstract pathname to the model
   * @see models [models](https://alphacephei.com/vosk/models)
   */
  constructor(modelPath) {
    /**
     * Opaque pointer to the native VoskModel.
     * For internal use only.
     * @type {unknown}
     */
    this.handle = libvosk.vosk_model_new([modelPath]);
    // Note: the previous version logged "model is created" here; that debug
    // output polluted consumers' stdout and has been removed.
  }

  /**
   * Releases the model memory.
   *
   * The model object is reference-counted so if some recognizer
   * depends on this model, model might still stay alive. When
   * last recognizer is released, model will be released too.
   */
  free() {
    libvosk.vosk_model_free([this.handle]);
  }
}
/**
 * Build a Speaker Model from a speaker model file.
 * The Speaker Model enables speaker identification.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class SpeakerModel {
  /**
   * Loads speaker model data from the file and returns the model object.
   *
   * @param {string} modelPath the path of the model on the filesystem
   * @see models [models](https://alphacephei.com/vosk/models)
   */
  constructor(modelPath) {
    /**
     * Opaque pointer to the native VoskSpkModel.
     * For internal use only.
     * @type {unknown}
     */
    this.handle = libvosk.vosk_spk_model_new([modelPath]);
  }

  /**
   * Releases the model memory.
   *
   * The model object is reference-counted so if some recognizer
   * depends on this model, model might still stay alive. When
   * last recognizer is released, model will be released too.
   */
  free() {
    libvosk.vosk_spk_model_free([this.handle]);
  }
}
/**
 * Helper to narrow down type while using `hasOwnProperty`.
 * The check goes through `Object.prototype` so it also works for objects
 * that shadow `hasOwnProperty` and for null-prototype objects, where
 * `obj.hasOwnProperty(...)` would throw.
 * @see hasOwnProperty [typescript issue](https://fettblog.eu/typescript-hasownproperty/)
 * @template {Object} Obj
 * @template {PropertyKey} Key
 * @param {Obj} obj
 * @param {Key} prop
 * @returns {obj is Obj & Record<Key, unknown>}
 */
function hasOwnProperty(obj, prop) {
  return Object.prototype.hasOwnProperty.call(obj, prop);
}
/**
* @template T
* @template U
* @typedef {{ [P in Exclude<keyof T, keyof U>]?: never }} Without
*/
/**
* @template T
* @template U
* @typedef {(T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U} XOR
*/
/**
 * Create a Recognizer that will be able to transform audio streams into text using a Model.
 * @template {XOR<SpeakerRecognizerParam, Partial<GrammarRecognizerParam>>} T extra parameter
 * @see Model
 */
class Recognizer {
  /**
   * Opaque pointer to the native VoskRecognizer.
   * For internal use only.
   * @type {unknown}
   */
  handle;

  /**
   * Create a Recognizer that will handle speech to text recognition.
   * @constructor
   * @param {T & BaseRecognizerParam & Partial<SpeakerRecognizerParam>} param The Recognizer parameters
   *
   * Sometimes when you want to improve recognition accuracy and when you don't need
   * to recognize large vocabulary you can specify a list of phrases to recognize. This
   * will improve recognizer speed and accuracy but might return [unk] if user said
   * something different.
   *
   * Only recognizers with lookahead models support this type of quick configuration.
   * Precompiled HCLG graph models are not supported.
   */
  constructor(param) {
    const {model, sampleRate} = param;
    // Prevent the user to receive unpredictable results
    if (hasOwnProperty(param, 'speakerModel') && hasOwnProperty(param, 'grammar')) {
      throw new Error('grammar and speakerModel cannot be used together for now.');
    }
    this.handle = hasOwnProperty(param, 'speakerModel')
      ? libvosk.vosk_recognizer_new_spk([model.handle, sampleRate, param.speakerModel.handle])
      : hasOwnProperty(param, 'grammar')
        ? libvosk.vosk_recognizer_new_grm([model.handle, sampleRate, JSON.stringify(param.grammar)])
        : libvosk.vosk_recognizer_new([model.handle, sampleRate]);
  }

  /**
   * Releases the recognizer memory.
   * (The previous doc was a copy-paste from Model.free — this call frees the
   * recognizer; models it references are reference-counted and are released
   * when their last recognizer is freed.)
   */
  free() {
    libvosk.vosk_recognizer_free([this.handle]);
  }

  /** Configures recognizer to output n-best results
   *
   * <pre>
   * {
   *   "alternatives": [
   *     { "text": "one two three four five", "confidence": 0.97 },
   *     { "text": "one two three for five", "confidence": 0.03 },
   *   ]
   * }
   * </pre>
   *
   * @param max_alternatives - maximum alternatives to return from recognition results
   */
  setMaxAlternatives(max_alternatives) {
    libvosk.vosk_recognizer_set_max_alternatives([this.handle, max_alternatives]);
  }

  /** Configures recognizer to output words with times
   *
   * <pre>
   * "result" : [{
   *     "conf" : 1.000000,
   *     "end" : 1.110000,
   *     "start" : 0.870000,
   *     "word" : "what"
   *   }, {
   *     "conf" : 1.000000,
   *     "end" : 1.530000,
   *     "start" : 1.110000,
   *     "word" : "zero"
   *   }, {
   *     "conf" : 1.000000,
   *     "end" : 1.950000,
   *     "start" : 1.530000,
   *     "word" : "zero"
   *   }, {
   *     "conf" : 1.000000,
   *     "end" : 2.340000,
   *     "start" : 1.950000,
   *     "word" : "zero"
   *   }, {
   *     "conf" : 1.000000,
   *     "end" : 2.610000,
   *     "start" : 2.340000,
   *     "word" : "one"
   *   }],
   * </pre>
   *
   * @param words - boolean value
   */
  setWords(words) {
    libvosk.vosk_recognizer_set_words([this.handle, words]);
  }

  /** Same as above, but for partial results */
  setPartialWords(partial_words) {
    libvosk.vosk_recognizer_set_partial_words([this.handle, partial_words]);
  }

  /** Adds speaker recognition model to already created recognizer. Helps to initialize
   * speaker recognition for grammar-based recognizer.
   *
   * @param spk_model Speaker recognition model
   */
  setSpkModel(spk_model) {
    libvosk.vosk_recognizer_set_spk_model([this.handle, spk_model.handle]);
  }

  /**
   * Accept voice data
   *
   * accept and process new chunk of voice data
   *
   * @param {Buffer} data audio data in PCM 16-bit mono format
   * @returns true if silence is occured and you can retrieve a new utterance with result method
   */
  acceptWaveform(data) {
    return libvosk.vosk_recognizer_accept_waveform([this.handle, data, data.length]);
  }

  /**
   * Accept voice data, asynchronously
   *
   * accept and process new chunk of voice data
   *
   * @param {Buffer} data audio data in PCM 16-bit mono format
   * @returns true if silence is occured and you can retrieve a new utterance with result method
   */
  acceptWaveformAsync(data) {
    return new Promise((resolve, reject) => {
      // NOTE(review): this relies on an `.async` property of the ffi-rs
      // defined function (ffi-napi style) — confirm the installed ffi-rs
      // version actually exposes it; ffi-rs normally uses a `runInNewThread`
      // option instead.
      libvosk.vosk_recognizer_accept_waveform.async([this.handle, data, data.length], function (err, result) {
        if (err) {
          reject(err);
        }
        else {
          resolve(result);
        }
      });
    });
  }

  /** Returns speech recognition result in a string
   *
   * @returns the result in JSON format which contains decoded line, decoded
   * words, times in seconds and confidences. You can parse this result
   * with any json parser
   * <pre>
   * {
   *   "result" : [{
   *       "conf" : 1.000000,
   *       "end" : 1.110000,
   *       "start" : 0.870000,
   *       "word" : "what"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 1.530000,
   *       "start" : 1.110000,
   *       "word" : "zero"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 1.950000,
   *       "start" : 1.530000,
   *       "word" : "zero"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 2.340000,
   *       "start" : 1.950000,
   *       "word" : "zero"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 2.610000,
   *       "start" : 2.340000,
   *       "word" : "one"
   *     }],
   *   "text" : "what zero zero zero one"
   * }
   * </pre>
   */
  resultString() {
    return libvosk.vosk_recognizer_result([this.handle]);
  }

  /**
   * Returns speech recognition results
   * @returns {Result<T>} The results
   */
  result() {
    return JSON.parse(libvosk.vosk_recognizer_result([this.handle]));
  }

  /**
   * speech recognition text which is not yet finalized.
   * result may change as recognizer process more data.
   *
   * @returns {PartialResults} The partial results
   */
  partialResult() {
    return JSON.parse(libvosk.vosk_recognizer_partial_result([this.handle]));
  }

  /**
   * Returns speech recognition result. Same as result, but doesn't wait for silence
   * You usually call it in the end of the stream to get final bits of audio. It
   * flushes the feature pipeline, so all remaining audio chunks got processed.
   *
   * @returns {Result<T>} speech result.
   */
  finalResult() {
    return JSON.parse(libvosk.vosk_recognizer_final_result([this.handle]));
  }

  /**
   * Resets current results so the recognition can continue from scratch
   */
  reset() {
    libvosk.vosk_recognizer_reset([this.handle]);
  }
}
// Public API of the module.
Object.assign(exports, { setLogLevel, Model, SpeakerModel, Recognizer });
// Optional: Close library when done (call when appropriate)
// close('libvosk');
// vosk.d.ts
declare module 'vosk' {
  /**
   * Set log level for Kaldi messages
   * @param level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
   */
  export function setLogLevel(level: number): void;

  export class Model {
    /** @internal */
    handle: any;
    constructor(modelPath: string);
    free(): void;
  }

  export class SpeakerModel {
    /** @internal */
    handle: any;
    constructor(modelPath: string);
    free(): void;
  }

  type WordResult = {
    /** Confidence (0-1) */
    conf: number;
    /** Start time in seconds */
    start: number;
    /** End time in seconds */
    end: number;
    /** Recognized word */
    word: string;
  };
  type RecognitionResults = {
    result: WordResult[];
    text: string;
  };
  type SpeakerResults = {
    spk: number[];
    spk_frames: number;
  };
  type PartialResults = {
    partial: string;
  };
  type BaseRecognizerParam = {
    model: Model;
    sampleRate: number;
  };
  type SpeakerRecognizerParam = {
    speakerModel: SpeakerModel;
  };
  type GrammarRecognizerParam = {
    grammar: string[];
  };
  type RecognizerParams<T> = T & BaseRecognizerParam;
  // Falls back to plain RecognitionResults (not `never`) so that a recognizer
  // created without speaker model or grammar still has usable result types.
  type Result<T> = T extends SpeakerRecognizerParam
    ? RecognitionResults & SpeakerResults
    : RecognitionResults;

  // `Partial<GrammarRecognizerParam>` (matching the implementation's JSDoc)
  // also permits the plain recognizer with neither speaker model nor grammar.
  export class Recognizer<T extends SpeakerRecognizerParam | Partial<GrammarRecognizerParam>> {
    /** @internal */
    handle: any;
    constructor(params: RecognizerParams<T>);
    free(): void;
    setMaxAlternatives(max_alternatives: number): void;
    setWords(words: boolean): void;
    setPartialWords(partial_words: boolean): void;
    setSpkModel(spk_model: SpeakerModel): void;
    acceptWaveform(data: Buffer): boolean;
    /** Async variant of acceptWaveform — present in the implementation but previously missing here. */
    acceptWaveformAsync(data: Buffer): Promise<boolean>;
    resultString(): string;
    result(): Result<T>;
    partialResult(): PartialResults;
    finalResult(): Result<T>;
    reset(): void;
  }

  // Helper type for XOR (mutually exclusive) properties
  type Without<T, U> = { [P in Exclude<keyof T, keyof U>]?: never };
  type XOR<T, U> = (T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U;
  export type RecognizerConstructorParams = BaseRecognizerParam & XOR<
    SpeakerRecognizerParam,
    GrammarRecognizerParam
  >;
}
// Shape of the object returned by ffi-rs `define` for the Vosk C API.
// Every function takes its C arguments packed into a single tuple (array).
interface LibVosk {
/**
 * Set the log level for Vosk.
 * @param level - Log level (integer).
 */
vosk_set_log_level: (params: [number]) => void;
/**
 * Create a new Vosk model.
 * @param modelPath - Path to the model (string).
 * @returns Pointer to the model.
 */
vosk_model_new: (params: [string]) => any;
/**
 * Free a Vosk model.
 * @param modelHandle - Pointer to the model.
 */
vosk_model_free: (params: [any]) => void;
/**
 * Create a new Vosk speaker model.
 * @param modelPath - Path to the speaker model (string).
 * @returns Pointer to the speaker model.
 */
vosk_spk_model_new: (params: [string]) => any;
/**
 * Free a Vosk speaker model.
 * @param spkModelHandle - Pointer to the speaker model.
 */
vosk_spk_model_free: (params: [any]) => void;
/**
 * Create a new recognizer without speaker or grammar.
 * @param modelHandle - Pointer to the model.
 * @param sampleRate - Sample rate (float).
 * @returns Pointer to the recognizer.
 */
vosk_recognizer_new: (params: [any, number]) => any;
/**
 * Create a new recognizer with speaker model.
 * @param modelHandle - Pointer to the model.
 * @param sampleRate - Sample rate (float).
 * @param spkModelHandle - Pointer to the speaker model.
 * @returns Pointer to the recognizer.
 */
vosk_recognizer_new_spk: (params: [any, number, any]) => any;
/**
 * Create a new recognizer with grammar.
 * @param modelHandle - Pointer to the model.
 * @param sampleRate - Sample rate (float).
 * @param grammar - Grammar string (JSON).
 * @returns Pointer to the recognizer.
 */
vosk_recognizer_new_grm: (params: [any, number, string]) => any;
/**
 * Free a recognizer.
 * @param recognizerHandle - Pointer to the recognizer.
 */
vosk_recognizer_free: (params: [any]) => void;
/**
 * Set the maximum number of alternatives for recognition results.
 * @param recognizerHandle - Pointer to the recognizer.
 * @param maxAlternatives - Maximum number of alternatives (integer).
 */
vosk_recognizer_set_max_alternatives: (params: [any, number]) => void;
/**
 * Enable or disable word-level results.
 * @param recognizerHandle - Pointer to the recognizer.
 * @param words - Boolean to enable/disable word-level results.
 */
vosk_recognizer_set_words: (params: [any, boolean]) => void;
/**
 * Enable or disable partial word-level results.
 * @param recognizerHandle - Pointer to the recognizer.
 * @param partialWords - Boolean to enable/disable partial word-level results.
 */
vosk_recognizer_set_partial_words: (params: [any, boolean]) => void;
/**
 * Set the speaker model for an existing recognizer.
 * @param recognizerHandle - Pointer to the recognizer.
 * @param spkModelHandle - Pointer to the speaker model.
 */
vosk_recognizer_set_spk_model: (params: [any, any]) => void;
/**
 * Accept waveform data for recognition.
 * NOTE(review): the wrapper also calls `.async` on this function; that
 * property is not declared here — confirm it exists in the installed ffi-rs.
 * @param recognizerHandle - Pointer to the recognizer.
 * @param data - Audio data buffer (pointer).
 * @param length - Length of the audio data (integer).
 * @returns Boolean indicating if silence was detected.
 */
vosk_recognizer_accept_waveform: (params: [any, any, number]) => boolean;
/**
 * Get the current recognition result (after silence was detected).
 * (Previous comment wrongly said "final" — that is vosk_recognizer_final_result.)
 * @param recognizerHandle - Pointer to the recognizer.
 * @returns JSON string with recognition results.
 */
vosk_recognizer_result: (params: [any]) => string;
/**
 * Get the final recognition result without waiting for silence.
 * @param recognizerHandle - Pointer to the recognizer.
 * @returns JSON string with recognition results.
 */
vosk_recognizer_final_result: (params: [any]) => string;
/**
 * Get the partial recognition result.
 * @param recognizerHandle - Pointer to the recognizer.
 * @returns JSON string with partial recognition results.
 */
vosk_recognizer_partial_result: (params: [any]) => string;
/**
 * Reset the recognizer.
 * @param recognizerHandle - Pointer to the recognizer.
 */
vosk_recognizer_reset: (params: [any]) => void;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment