Created
December 31, 2024 02:00
-
-
Save JSandusky/b1c284228be8566f033584aedca4ee2d to your computer and use it in GitHub Desktop.
Urho3D speech recognition via pocketsphinx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include "../Core/Object.h" | |
namespace Urho3D | |
{ | |
/// Sound playback finished. Sent through the SoundSource's Node. | |
URHO3D_EVENT(E_SPEECHRESULT, SpeechResult) | |
{ | |
URHO3D_PARAM(P_TEXT, Text); // String | |
URHO3D_PARAM(P_TRUST, Trust); // int trust factor | |
} | |
URHO3D_EVENT(E_SPEECHSTARTED, SpeechStarted) | |
{ | |
} | |
URHO3D_EVENT(E_SPEECHACTIVITY, SpeechActivity) | |
{ | |
URHO3D_PARAM(P_STATE, State); // bool true if in active speech | |
} | |
URHO3D_EVENT(E_SPEECHENDED, SpeechEnded) | |
{ | |
URHO3D_PARAM(P_TEXT, Text); // String | |
URHO3D_PARAM(P_TRUST, Trust); // int trust factor | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "SpeechEvents.h" | |
#include "SpeechRecognizer.h" | |
#include "../Audio/AudioEvents.h" | |
#include "../Core/Context.h" | |
#include "../IO/FileSystem.h" | |
#include "../IO/Log.h" | |
#include "../Audio/Microphone.h" | |
#include "../Audio/Sound.h" | |
#include "../Audio/SoundStream.h" | |
#include <pocketsphinx.h> | |
namespace Urho3D | |
{ | |
SpeechRecognizer::SpeechRecognizer(Context* ctx) : Object(ctx), | |
decoder_(nullptr), | |
inUtterance_(false) | |
{ | |
} | |
/// Destruct. Frees the decoder, if one was created.
SpeechRecognizer::~SpeechRecognizer()
{
    // Release() tolerates a missing decoder, so no extra check is needed here.
    Release();
}
/// Register the object factory for SpeechRecognizer with the given context.
void SpeechRecognizer::RegisterObject(Context* ctx)
{
    ctx->RegisterFactory<SpeechRecognizer>();
}
bool SpeechRecognizer::Initialize(const String& language) | |
{ | |
// default is en-us | |
Release(); | |
auto fileSystem = GetContext()->GetSubsystem<FileSystem>(); | |
auto modelDir = fileSystem->GetProgramDir(); | |
modelDir = AddTrailingSlash(AddTrailingSlash(modelDir) + "SpeechModels"); | |
modelDir = modelDir + AddTrailingSlash(language); | |
// C:\dev\Urho3D\bin\SpeechModels\en-us | |
auto HMM = modelDir + language; | |
// check for binary LM first | |
auto LM = modelDir + language + ".lm.bin"; | |
if (!fileSystem->FileExists(LM)) | |
LM = modelDir + language + ".lm"; | |
auto DICT = modelDir + language + ".dict"; | |
cmd_ln_t* config = cmd_ln_init(nullptr, ps_args(), TRUE, | |
"-hmm", HMM.CString(), | |
"-lm", LM.CString(), | |
"-dict", DICT.CString(), | |
"-samprate", "16000", | |
"-nfft", "512", | |
"-bestpath", "yes", | |
NULL); | |
if (config == nullptr) | |
{ | |
URHO3D_LOGERRORF("Could not initialize speech recognition for language: %s", language.CString()); | |
return false; | |
} | |
decoder_ = ps_init(config); | |
cmd_ln_free_r(config); | |
if (decoder_ == nullptr) | |
{ | |
URHO3D_LOGERRORF("Could not initialize speech recognition for language: %s", language.CString()); | |
return false; | |
} | |
return true; | |
} | |
void SpeechRecognizer::Release() | |
{ | |
if (decoder_) | |
ps_free(decoder_); | |
decoder_ = nullptr; | |
inUtterance_ = false; | |
} | |
/// Pump a chunk of 16-bit samples into the recognizer, starting an utterance if none is in progress.
/// Sends E_SPEECHACTIVITY after every processed chunk, and E_SPEECHRESULT whenever the hypothesis
/// text changes while the decoder reports active speech.
/// @param data     Sample buffer; a null buffer is rejected.
/// @param sampleCt Number of samples in the buffer; must be positive.
/// @param score    Optional output for the decoder's score of the current hypothesis.
/// @return The current hypothesis text, or an empty string when there is no decoder, the input is
///         rejected, or the decoder reports an error.
String SpeechRecognizer::AddData(int16_t* data, int sampleCt, int* score)
{
    if (decoder_ == nullptr)
        return String();

    // Reject missing as well as empty buffers: the decoder must never be handed a null pointer
    // together with a positive sample count.
    if (data == nullptr || sampleCt <= 0)
        return String();

    MutexLock locked(lock_);

    if (!inUtterance_)
    {
        // Only mark the utterance as running once the decoder actually accepted the start.
        if (ps_start_utt(decoder_) < 0)
        {
            URHO3D_LOGERROR("Could not start speech recognition utterance");
            return String();
        }
        inUtterance_ = true;
        lastText_ = String();
    }

    if (ps_process_raw(decoder_, data, sampleCt, FALSE, FALSE) < 0)
    {
        URHO3D_LOGERROR("Speech recognition failed to process audio data");
        return String();
    }

    const int activity = ps_get_in_speech(decoder_);
    {
        auto& eventData = GetEventDataMap();
        eventData[SpeechActivity::P_STATE] = activity == 1;
        SendEvent(E_SPEECHACTIVITY, eventData);
    }

    // Always give the decoder somewhere to write the score, even if the caller does not want it.
    int deadScore = 0;
    int* writeScore = score ? score : &deadScore;
    String text = ps_get_hyp(decoder_, writeScore);

    // Only announce a result when it is new, non-empty and speech is still active.
    if (text.Length() > 0 && activity != 0 && text != lastText_)
    {
        lastText_ = text;
        auto& eventData = GetEventDataMap();
        eventData[SpeechResult::P_TEXT] = text;
        eventData[SpeechResult::P_TRUST] = *writeScore;
        SendEvent(E_SPEECHRESULT, eventData);
    }

    return text;
}
/// Finalize the utterance that AddData() started and return its final hypothesis.
/// When the final text is non-empty, sends E_SPEECHRESULT with that text and score, followed by
/// E_SPEECHACTIVITY with a false state.
/// @param score Optional output for the decoder's score of the final hypothesis.
/// @return The final hypothesis text; empty when there is no decoder or no utterance in progress.
String SpeechRecognizer::Finish(int* score)
{
    if (decoder_ == nullptr)
        return String();

    MutexLock locked(lock_);

    if (!inUtterance_)
        return String();

    ps_end_utt(decoder_);
    // The utterance is over: clear the flag so the next AddData() starts a fresh one and
    // Recognize() is no longer refused.
    inUtterance_ = false;

    // Always give the decoder somewhere to write the score, even if the caller does not want it.
    int deadScore = 0;
    int* writeScore = score ? score : &deadScore;
    String text = ps_get_hyp(decoder_, writeScore);

    if (text.Length())
    {
        auto& eventData = GetEventDataMap();
        eventData[SpeechResult::P_TEXT] = text;
        // Publish the score value itself, not the (possibly null) output pointer.
        eventData[SpeechResult::P_TRUST] = *writeScore;
        SendEvent(E_SPEECHRESULT, eventData);

        eventData.Clear();
        eventData[SpeechActivity::P_STATE] = false;
        SendEvent(E_SPEECHACTIVITY, eventData);
    }

    lastText_ = text;
    return text;
}
bool SpeechRecognizer::InSpeech() const | |
{ | |
if (decoder_ == nullptr) | |
return false; | |
MutexLock locked(lock_); | |
const int state = ps_get_in_speech(decoder_); | |
return state == 1; | |
} | |
/// Start listening for recording updates so microphone data is pumped into the recognizer.
/// Returns false, without subscribing, when the microphone is null or no decoder exists.
/// NOTE(review): the subscription is not restricted to `mic` as sender, so updates from any
/// microphone reach HandleMicEvent() - confirm whether that is intended.
bool SpeechRecognizer::Link(SharedPtr<Microphone> mic)
{
    const bool linkable = !mic.Null() && decoder_ != nullptr;
    if (linkable)
        SubscribeToEvent(E_RECORDINGUPDATED, URHO3D_HANDLER(SpeechRecognizer, HandleMicEvent));

    return linkable;
}
/// Handle a recording update: forward the microphone's pending samples to AddData() and
/// flag the event so the sender clears the data that was consumed.
void SpeechRecognizer::HandleMicEvent(StringHash evt, VariantMap& eventData)
{
    auto source = eventData[RecordingUpdated::P_MICROPHONE].GetPtr();
    if (!source)
        return;

    // Ignore events whose payload is not actually a microphone.
    Microphone* mic = dynamic_cast<Microphone*>(source);
    if (!mic)
        return;

    auto& samples = mic->GetData();
    if (samples.Empty())
        return;

    AddData(samples.Buffer(), samples.Size(), nullptr);
    eventData[RecordingUpdated::P_CLEARDATA] = true;
}
/// Recognize a complete, already loaded sound as a single utterance.
/// Only 16-bit mono sounds are accepted. The call is refused while a streamed utterance
/// (AddData()/Finish()) is in progress.
/// @param snd Sound whose sample data is decoded in one pass.
/// @return The hypothesis text, or an empty string when the input is rejected or decoding fails.
String SpeechRecognizer::Recognize(Sound* snd)
{
    if (snd == nullptr || !snd->IsSixteenBit() || snd->IsStereo())
        return String();

    // Serialize decoder access with the streaming calls, which take the same lock.
    MutexLock locked(lock_);

    if (inUtterance_ || decoder_ == nullptr)
        return String();

    const int16_t* samples = reinterpret_cast<const int16_t*>(snd->GetData().Get());
    const unsigned sampleCt = snd->GetDataSize() / sizeof(int16_t);
    if (samples == nullptr || sampleCt == 0)
        return String();

    if (ps_start_utt(decoder_) < 0)
    {
        URHO3D_LOGERROR("Could not start speech recognition utterance");
        return String();
    }

    // The whole sound is available up front, so let the decoder treat it as a full utterance.
    const int result = ps_process_raw(decoder_, samples, sampleCt, FALSE, TRUE);
    ps_end_utt(decoder_);
    if (result < 0)
        return String();

    int score = 0;
    String text = ps_get_hyp(decoder_, &score);
    return text;
}
/// Recognize the contents of a sound stream as a single utterance, reading it chunk by chunk
/// until the stream returns no more data.
/// Only 16-bit mono streams are accepted. The call is refused while a streamed utterance
/// (AddData()/Finish()) is in progress.
/// NOTE(review): a stream that never reports zero bytes (for example one that loops) would keep
/// this loop running - confirm callers only pass streams that end.
/// @param stream Stream to drain and decode.
/// @return The hypothesis text, or an empty string when the input is rejected or decoding fails.
String SpeechRecognizer::Recognize(SoundStream* stream)
{
    if (stream == nullptr || !stream->IsSixteenBit() || stream->IsStereo())
        return String();

    // Serialize decoder access with the streaming calls, which take the same lock.
    MutexLock locked(lock_);

    if (inUtterance_ || decoder_ == nullptr)
        return String();

    if (ps_start_utt(decoder_) < 0)
    {
        URHO3D_LOGERROR("Could not start speech recognition utterance");
        return String();
    }

    short chunk[4096];
    bool failed = false;
    for (;;)
    {
        const int bytesRead = stream->GetData(reinterpret_cast<signed char*>(chunk), static_cast<unsigned>(sizeof(chunk)));
        if (bytesRead <= 0)
            break;

        // Feed the samples that were just read; the buffer, not the byte count, is the data pointer.
        if (ps_process_raw(decoder_, chunk, bytesRead / sizeof(short), FALSE, FALSE) < 0)
        {
            failed = true;
            break;
        }
    }

    // Always close the utterance that was opened above, even after a processing error.
    ps_end_utt(decoder_);
    if (failed)
        return String();

    int score = 0;
    String text = ps_get_hyp(decoder_, &score);
    return text;
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include "../Core/Object.h" | |
#include "../Core/Mutex.h" | |
#include <pocketsphinx.h> | |
namespace Urho3D | |
{ | |
class Sound; | |
class SoundStream; | |
class Microphone; | |
/// Speech recognizer built on a pocketsphinx decoder. Audio can be streamed in with
/// AddData()/Finish(), pumped from a microphone via Link(), or decoded in one call with Recognize().
class URHO3D_API SpeechRecognizer : public Object
{
    URHO3D_OBJECT(SpeechRecognizer, Object);

public:
    /// Construct without a decoder; call Initialize() before processing audio.
    explicit SpeechRecognizer(Context* ctx);
    /// Destruct. Releases the decoder.
    ~SpeechRecognizer() override;
    /// Register object factory.
    static void RegisterObject(Context* ctx);

    /// Create the decoder for a language, loading its models from the "SpeechModels" directory
    /// next to the program. Returns true on success.
    bool Initialize(const String& language = "en-us");
    /// Free the decoder and reset the utterance state.
    void Release();

    /// Pump sample data into the recognizer. Returns the current hypothesis text and optionally
    /// writes the decoder's score for it to `trust`.
    String AddData(int16_t* data, int sampleCt, int* trust = nullptr);
    /// Finalize the current utterance. Returns the final hypothesis text and optionally writes the
    /// decoder's score for it to `trust`.
    String Finish(int* trust = nullptr);
    /// Return true when a decoder exists and it currently reports being in speech.
    bool InSpeech() const;
    /// Test whether this recognizer is in a probably valid state, i.e. has a decoder.
    bool IsAlive() const { return decoder_ != nullptr; }
    /// Utility for connecting a microphone with this recognizer. Returns false when the microphone
    /// is null or the recognizer has no decoder.
    bool Link(SharedPtr<Microphone> mic);
    /// Return the hypothesis text most recently stored by the streaming calls.
    String GetLastText() const { return lastText_; }

    /// Recognize a whole 16-bit mono sound in one call. Returns an empty string on rejection or failure.
    String Recognize(Sound* snd);
    /// Recognize a whole 16-bit mono sound stream in one call. Returns an empty string on rejection.
    String Recognize(SoundStream* stream);

private:
    /// Utility for pumping microphone data right into the recognizer.
    void HandleMicEvent(StringHash eventType, VariantMap& eventData);

    /// Pocketsphinx decoder; null until Initialize() succeeds and after Release().
    ps_decoder_t* decoder_ = nullptr;
    /// Guards decoder access in the streaming calls (AddData(), Finish(), InSpeech()).
    mutable Mutex lock_;
    /// True while an utterance started by AddData() is in progress.
    bool inUtterance_ = false;
    /// Hypothesis text most recently stored by AddData() or Finish().
    String lastText_;
};
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment