// Updated version of vosk for node.
// Nothing difficult: just changed the `define` signature and wrapped each
// call's parameter group into a tuple.
// @ts-check | |
'use strict'; | |
/** | |
* @module vosk | |
*/ | |
const os = require('os'); | |
const path = require('path'); | |
const {DataType, open, close, define} = require('ffi-rs'); | |
const fs = require('fs'); | |
const soname = (function () { | |
if (os.platform() === 'win32') { | |
let currentPath = process.env.Path; | |
let dllDirectory = path.resolve(path.join(__dirname, 'lib', 'win-x86_64')); | |
process.env.Path = dllDirectory + path.delimiter + currentPath; | |
return path.join(__dirname, 'lib', 'win-x86_64', 'libvosk.dll'); | |
} | |
if (os.platform() === 'darwin') { | |
return path.join(__dirname, 'lib', 'osx-universal', 'libvosk.dylib'); | |
} | |
if (os.platform() === 'linux' && os.arch() === 'arm64') { | |
return path.join(__dirname, 'lib', 'linux-arm64', 'libvosk.so'); | |
} | |
return path.join(__dirname, 'lib', 'linux-x86_64', 'libvosk.so'); | |
})(); | |
// Fail fast with a clear message if the native library is missing, then
// register it with ffi-rs under the 'libvosk' alias used by `define` below.
if (!fs.existsSync(soname)) {
  throw new Error(`File doesn't exist: ${soname}`);
}
open({library: 'libvosk', path: soname});
/** @type {LibVosk} */
// ffi-rs symbol table for the Vosk C API. Each entry declares the C
// function's return type and parameter types; DataType.External is an
// opaque pointer (model / recognizer handle). Note that with ffi-rs every
// call passes its arguments grouped into a single tuple (array), e.g.
// libvosk.vosk_model_new([modelPath]).
const libvosk = define({
  vosk_set_log_level                  : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.I32]},
  vosk_model_new                      : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.String]},
  vosk_model_free                     : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External]},
  vosk_spk_model_new                  : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.String]},
  vosk_spk_model_free                 : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External]},
  vosk_recognizer_new                 : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float]},
  vosk_recognizer_new_spk             : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float, DataType.External]},
  vosk_recognizer_new_grm             : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float, DataType.String]},
  vosk_recognizer_free                : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External]},
  vosk_recognizer_set_max_alternatives: {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External, DataType.I32]},
  vosk_recognizer_set_words           : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External, DataType.Boolean]},
  vosk_recognizer_set_partial_words   : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External, DataType.Boolean]},
  vosk_recognizer_set_spk_model       : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External, DataType.External]},
  // Audio is passed as a raw byte array plus its explicit length.
  vosk_recognizer_accept_waveform     : {library: 'libvosk', retType: DataType.Boolean,  paramsType: [DataType.External, DataType.U8Array, DataType.I32]},
  vosk_recognizer_result              : {library: 'libvosk', retType: DataType.String,   paramsType: [DataType.External]},
  vosk_recognizer_final_result        : {library: 'libvosk', retType: DataType.String,   paramsType: [DataType.External]},
  vosk_recognizer_partial_result      : {library: 'libvosk', retType: DataType.String,   paramsType: [DataType.External]},
  vosk_recognizer_reset               : {library: 'libvosk', retType: DataType.Void,     paramsType: [DataType.External]}
});
/**
 * Adjust the verbosity of Kaldi/Vosk logging.
 * @param {number} level Higher values are more verbose; 0 logs infos and
 *   errors; negative values silence logging entirely.
 */
function setLogLevel(level) {
  // ffi-rs calls take the parameter group as a tuple.
  libvosk.vosk_set_log_level([level]);
}
/**
 * Build a Model from a model file.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class Model {
  /**
   * Build a Model to be used with the voice recognition. Each language should
   * have its own Model for the speech recognition to work.
   * @param {string} modelPath The abstract pathname to the model
   * @see models [models](https://alphacephei.com/vosk/models)
   */
  constructor(modelPath) {
    /**
     * Native model pointer.
     * For internal use only.
     * @type {unknown}
     */
    this.handle = libvosk.vosk_model_new([modelPath]);
    // NOTE: removed leftover debug output ("model is created") — a library
    // must not write to stdout on every construction.
  }

  /**
   * Releases the model memory.
   *
   * The model object is reference-counted, so if some recognizer depends on
   * this model it might still stay alive. When the last recognizer is
   * released, the model will be released too.
   */
  free() {
    libvosk.vosk_model_free([this.handle]);
  }
}
/**
 * Build a Speaker Model from a speaker model file.
 * The Speaker Model enables speaker identification.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class SpeakerModel {
  /**
   * Loads speaker model data from the file and returns the model object.
   * @param {string} modelPath the path of the model on the filesystem
   * @see models [models](https://alphacephei.com/vosk/models)
   */
  constructor(modelPath) {
    /**
     * Native speaker-model pointer; for internal use only.
     * @type {unknown}
     */
    this.handle = libvosk.vosk_spk_model_new([modelPath]);
  }

  /**
   * Releases the speaker model memory.
   *
   * The model is reference-counted: recognizers that still depend on it keep
   * it alive, and it is released once the last of them is freed.
   */
  free() {
    libvosk.vosk_spk_model_free([this.handle]);
  }
}
/**
 * Helper to narrow down type while using `hasOwnProperty`.
 *
 * Goes through `Object.prototype.hasOwnProperty.call` instead of calling the
 * method on `obj` directly, so it also works for objects created with a null
 * prototype and is not fooled by an own key literally named `hasOwnProperty`.
 * @see hasOwnProperty [typescript issue](https://fettblog.eu/typescript-hasownproperty/)
 * @template {Object} Obj
 * @template {PropertyKey} Key
 * @param {Obj} obj
 * @param {Key} prop
 * @returns {obj is Obj & Record<Key, unknown>}
 */
function hasOwnProperty(obj, prop) {
  return Object.prototype.hasOwnProperty.call(obj, prop);
}
/** | |
* @template T | |
* @template U | |
* @typedef {{ [P in Exclude<keyof T, keyof U>]?: never }} Without | |
*/ | |
/** | |
* @template T | |
* @template U | |
* @typedef {(T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U} XOR | |
*/ | |
/**
 * Create a Recognizer that will be able to transform audio streams into text using a Model.
 * @template {XOR<SpeakerRecognizerParam, Partial<GrammarRecognizerParam>>} T extra parameter
 * @see Model
 */
class Recognizer {
  handle;

  /**
   * Create a Recognizer that will handle speech to text recognition.
   * @constructor
   * @param {T & BaseRecognizerParam & Partial<SpeakerRecognizerParam>} param The Recognizer parameters
   *
   * Sometimes, to improve accuracy when a large vocabulary is not needed, you
   * can pass a list of phrases to recognize (`grammar`). This speeds up the
   * recognizer and improves accuracy, but it may return [unk] when the user
   * says something outside the list.
   *
   * Only recognizers with lookahead models support this quick configuration;
   * precompiled HCLG graph models are not supported.
   */
  constructor(param) {
    const {model, sampleRate} = param;
    const withSpeaker = hasOwnProperty(param, 'speakerModel');
    const withGrammar = hasOwnProperty(param, 'grammar');
    // grammar + speakerModel together would give unpredictable results.
    if (withSpeaker && withGrammar) {
      throw new Error('grammar and speakerModel cannot be used together for now.');
    }
    /**
     * Native recognizer pointer; for internal use only.
     * @type {unknown}
     */
    if (withSpeaker) {
      this.handle = libvosk.vosk_recognizer_new_spk([model.handle, sampleRate, param.speakerModel.handle]);
    } else if (withGrammar) {
      this.handle = libvosk.vosk_recognizer_new_grm([model.handle, sampleRate, JSON.stringify(param.grammar)]);
    } else {
      this.handle = libvosk.vosk_recognizer_new([model.handle, sampleRate]);
    }
  }

  /**
   * Releases the recognizer memory.
   *
   * The underlying model is reference-counted, so it may outlive this
   * recognizer; it is released once the last recognizer using it is freed.
   */
  free() {
    libvosk.vosk_recognizer_free([this.handle]);
  }

  /**
   * Configures recognizer to output n-best results, e.g.:
   *
   * <pre>
   * {
   *   "alternatives": [
   *     { "text": "one two three four five", "confidence": 0.97 },
   *     { "text": "one two three for five",  "confidence": 0.03 }
   *   ]
   * }
   * </pre>
   *
   * @param max_alternatives - maximum alternatives to return from recognition results
   */
  setMaxAlternatives(max_alternatives) {
    libvosk.vosk_recognizer_set_max_alternatives([this.handle, max_alternatives]);
  }

  /**
   * Configures recognizer to output words with times, e.g.:
   *
   * <pre>
   * "result" : [
   *   { "conf": 1.0, "start": 0.87, "end": 1.11, "word": "what" },
   *   { "conf": 1.0, "start": 1.11, "end": 1.53, "word": "zero" }
   * ]
   * </pre>
   *
   * @param words - boolean value
   */
  setWords(words) {
    libvosk.vosk_recognizer_set_words([this.handle, words]);
  }

  /** Same as {@link setWords}, but for partial results. */
  setPartialWords(partial_words) {
    libvosk.vosk_recognizer_set_partial_words([this.handle, partial_words]);
  }

  /**
   * Adds a speaker recognition model to an already created recognizer. Helps
   * to initialize speaker recognition for a grammar-based recognizer.
   *
   * @param spk_model Speaker recognition model
   */
  setSpkModel(spk_model) {
    libvosk.vosk_recognizer_set_spk_model([this.handle, spk_model.handle]);
  }

  /**
   * Accept and process a new chunk of voice data (blocking).
   *
   * @param {Buffer} data audio data in PCM 16-bit mono format
   * @returns true if silence occurred and a new utterance can be retrieved
   *   with the result method
   */
  acceptWaveform(data) {
    return libvosk.vosk_recognizer_accept_waveform([this.handle, data, data.length]);
  }

  /**
   * Accept and process a new chunk of voice data without blocking the event
   * loop (uses the ffi-rs async call form).
   *
   * @param {Buffer} data audio data in PCM 16-bit mono format
   * @returns true if silence occurred and a new utterance can be retrieved
   *   with the result method
   */
  acceptWaveformAsync(data) {
    return new Promise((resolve, reject) => {
      libvosk.vosk_recognizer_accept_waveform.async(
        [this.handle, data, data.length],
        (error, accepted) => (error ? reject(error) : resolve(accepted))
      );
    });
  }

  /**
   * Returns the speech recognition result as a raw string.
   *
   * @returns the result in JSON format, containing the decoded line, decoded
   *   words, times in seconds and confidences — parse it with any JSON
   *   parser, e.g.:
   * <pre>
   * {
   *   "result": [
   *     { "conf": 1.0, "start": 0.87, "end": 1.11, "word": "what" },
   *     { "conf": 1.0, "start": 1.11, "end": 1.53, "word": "zero" }
   *   ],
   *   "text": "what zero"
   * }
   * </pre>
   */
  resultString() {
    return libvosk.vosk_recognizer_result([this.handle]);
  }

  /**
   * Returns speech recognition results, already parsed.
   * @returns {Result<T>} The results
   */
  result() {
    return JSON.parse(libvosk.vosk_recognizer_result([this.handle]));
  }

  /**
   * Speech recognition text which is not yet finalized; the result may
   * change as the recognizer processes more data.
   *
   * @returns {PartialResults} The partial results
   */
  partialResult() {
    return JSON.parse(libvosk.vosk_recognizer_partial_result([this.handle]));
  }

  /**
   * Returns the speech recognition result. Same as result, but doesn't wait
   * for silence. Usually called at the end of the stream to get the final
   * bits of audio; flushes the feature pipeline so all remaining audio
   * chunks get processed.
   *
   * @returns {Result<T>} speech result.
   */
  finalResult() {
    return JSON.parse(libvosk.vosk_recognizer_final_result([this.handle]));
  }

  /**
   * Resets current results so the recognition can continue from scratch.
   */
  reset() {
    libvosk.vosk_recognizer_reset([this.handle]);
  }
}
// Public API of the module.
Object.assign(exports, {setLogLevel, Model, SpeakerModel, Recognizer});
// Optional: Close library when done (call when appropriate)
// close('libvosk');
// vosk.d.ts
// NOTE(review): a `declare module` block normally belongs in a separate
// .d.ts file; embedded in the implementation file it has no effect for
// consumers — confirm how this is packaged/published.
declare module 'vosk' {
  /**
   * Set log level for Kaldi messages
   * @param level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
   */
  export function setLogLevel(level: number): void;

  /** Language model; wraps a native vosk_model pointer. */
  export class Model {
    /** @internal */
    handle: any;
    constructor(modelPath: string);
    free(): void;
  }

  /** Speaker-identification model; wraps a native vosk_spk_model pointer. */
  export class SpeakerModel {
    /** @internal */
    handle: any;
    constructor(modelPath: string);
    free(): void;
  }

  /** One recognized word with timing and confidence. */
  type WordResult = {
    /** Confidence (0-1) */
    conf: number;
    /** Start time in seconds */
    start: number;
    /** End time in seconds */
    end: number;
    /** Recognized word */
    word: string;
  };

  /** Full utterance result: per-word details plus the joined text. */
  type RecognitionResults = {
    result: WordResult[];
    text: string;
  };

  /** Speaker vector returned when a speaker model is attached. */
  type SpeakerResults = {
    spk: number[];
    spk_frames: number;
  };

  /** Not-yet-finalized text; may change as more audio is processed. */
  type PartialResults = {
    partial: string;
  };

  /** Parameters every recognizer needs. */
  type BaseRecognizerParam = {
    model: Model;
    sampleRate: number;
  };

  /** Extra parameter enabling speaker identification. */
  type SpeakerRecognizerParam = {
    speakerModel: SpeakerModel;
  };

  /** Extra parameter restricting recognition to a phrase list. */
  type GrammarRecognizerParam = {
    grammar: string[];
  };

  type RecognizerParams<T> = T & BaseRecognizerParam;

  // Picks the result shape from the recognizer's extra parameter.
  // NOTE(review): resolves to `never` when T is neither speaker nor grammar,
  // yet the runtime also allows a plain {model, sampleRate} recognizer —
  // confirm whether a plain-RecognitionResults base case is intended.
  type Result<T> = T extends SpeakerRecognizerParam
    ? RecognitionResults & SpeakerResults
    : T extends GrammarRecognizerParam
      ? RecognitionResults
      : never;

  export class Recognizer<T extends SpeakerRecognizerParam | GrammarRecognizerParam> {
    /** @internal */
    handle: any;
    constructor(params: RecognizerParams<T>);
    free(): void;
    setMaxAlternatives(max_alternatives: number): void;
    setWords(words: boolean): void;
    setPartialWords(partial_words: boolean): void;
    setSpkModel(spk_model: SpeakerModel): void;
    acceptWaveform(data: Buffer): boolean;
    resultString(): string;
    result(): Result<T>;
    partialResult(): PartialResults;
    finalResult(): Result<T>;
    reset(): void;
  }

  // Helper type for XOR (mutually exclusive) properties
  type Without<T, U> = { [P in Exclude<keyof T, keyof U>]?: never };
  type XOR<T, U> = (T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U;

  export type RecognizerConstructorParams = BaseRecognizerParam & XOR<
    SpeakerRecognizerParam,
    GrammarRecognizerParam
  >;
}
interface LibVosk { | |
/** | |
* Set the log level for Vosk. | |
* @param level - Log level (integer). | |
*/ | |
vosk_set_log_level: (params: [number]) => void; | |
/** | |
* Create a new Vosk model. | |
* @param modelPath - Path to the model (string). | |
* @returns Pointer to the model. | |
*/ | |
vosk_model_new: (params: [string]) => any; | |
/** | |
* Free a Vosk model. | |
* @param modelHandle - Pointer to the model. | |
*/ | |
vosk_model_free: (params: [any]) => void; | |
/** | |
* Create a new Vosk speaker model. | |
* @param modelPath - Path to the speaker model (string). | |
* @returns Pointer to the speaker model. | |
*/ | |
vosk_spk_model_new: (params: [string]) => any; | |
/** | |
* Free a Vosk speaker model. | |
* @param spkModelHandle - Pointer to the speaker model. | |
*/ | |
vosk_spk_model_free: (params: [any]) => void; | |
/** | |
* Create a new recognizer without speaker or grammar. | |
* @param modelHandle - Pointer to the model. | |
* @param sampleRate - Sample rate (float). | |
* @returns Pointer to the recognizer. | |
*/ | |
vosk_recognizer_new: (params: [any, number]) => any; | |
/** | |
* Create a new recognizer with speaker model. | |
* @param modelHandle - Pointer to the model. | |
* @param sampleRate - Sample rate (float). | |
* @param spkModelHandle - Pointer to the speaker model. | |
* @returns Pointer to the recognizer. | |
*/ | |
vosk_recognizer_new_spk: (params: [any, number, any]) => any; | |
/** | |
* Create a new recognizer with grammar. | |
* @param modelHandle - Pointer to the model. | |
* @param sampleRate - Sample rate (float). | |
* @param grammar - Grammar string (JSON). | |
* @returns Pointer to the recognizer. | |
*/ | |
vosk_recognizer_new_grm: (params: [any, number, string]) => any; | |
/** | |
* Free a recognizer. | |
* @param recognizerHandle - Pointer to the recognizer. | |
*/ | |
vosk_recognizer_free: (params: [any]) => void; | |
/** | |
* Set the maximum number of alternatives for recognition results. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @param maxAlternatives - Maximum number of alternatives (integer). | |
*/ | |
vosk_recognizer_set_max_alternatives: (params: [any, number]) => void; | |
/** | |
* Enable or disable word-level results. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @param words - Boolean to enable/disable word-level results. | |
*/ | |
vosk_recognizer_set_words: (params: [any, boolean]) => void; | |
/** | |
* Enable or disable partial word-level results. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @param partialWords - Boolean to enable/disable partial word-level results. | |
*/ | |
vosk_recognizer_set_partial_words: (params: [any, boolean]) => void; | |
/** | |
* Set the speaker model for an existing recognizer. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @param spkModelHandle - Pointer to the speaker model. | |
*/ | |
vosk_recognizer_set_spk_model: (params: [any, any]) => void; | |
/** | |
* Accept waveform data for recognition. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @param data - Audio data buffer (pointer). | |
* @param length - Length of the audio data (integer). | |
* @returns Boolean indicating if silence was detected. | |
*/ | |
vosk_recognizer_accept_waveform: (params: [any, any, number]) => boolean; | |
/** | |
* Get the final recognition result. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @returns JSON string with recognition results. | |
*/ | |
vosk_recognizer_result: (params: [any]) => string; | |
/** | |
* Get the final recognition result without waiting for silence. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @returns JSON string with recognition results. | |
*/ | |
vosk_recognizer_final_result: (params: [any]) => string; | |
/** | |
* Get the partial recognition result. | |
* @param recognizerHandle - Pointer to the recognizer. | |
* @returns JSON string with partial recognition results. | |
*/ | |
vosk_recognizer_partial_result: (params: [any]) => string; | |
/** | |
* Reset the recognizer. | |
* @param recognizerHandle - Pointer to the recognizer. | |
*/ | |
vosk_recognizer_reset: (params: [any]) => void; | |
} |