Skip to content

Instantly share code, notes, and snippets.

@JSandusky
Created December 31, 2024 02:00
Show Gist options
  • Save JSandusky/b1c284228be8566f033584aedca4ee2d to your computer and use it in GitHub Desktop.
Urho3D speech recognition via pocketsphinx
#pragma once
#include "../Core/Object.h"
namespace Urho3D
{

/// A new or updated recognition hypothesis is available. Sent by SpeechRecognizer.
/// (Previous comment was a stale copy-paste from a sound-playback event.)
URHO3D_EVENT(E_SPEECHRESULT, SpeechResult)
{
    URHO3D_PARAM(P_TEXT, Text); // String
    URHO3D_PARAM(P_TRUST, Trust); // int trust factor
}

/// Recognition has started.
URHO3D_EVENT(E_SPEECHSTARTED, SpeechStarted)
{
}

/// Voice-activity state changed or was re-reported while pumping audio.
URHO3D_EVENT(E_SPEECHACTIVITY, SpeechActivity)
{
    URHO3D_PARAM(P_STATE, State); // bool true if in active speech
}

/// Recognition of an utterance has finished.
/// NOTE(review): declared but not sent anywhere in the visible code — confirm intended use.
URHO3D_EVENT(E_SPEECHENDED, SpeechEnded)
{
    URHO3D_PARAM(P_TEXT, Text); // String
    URHO3D_PARAM(P_TRUST, Trust); // int trust factor
}

}
#include "SpeechEvents.h"
#include "SpeechRecognizer.h"
#include "../Audio/AudioEvents.h"
#include "../Core/Context.h"
#include "../IO/FileSystem.h"
#include "../IO/Log.h"
#include "../Audio/Microphone.h"
#include "../Audio/Sound.h"
#include "../Audio/SoundStream.h"
#include <pocketsphinx.h>
namespace Urho3D
{
/// Construct. The pocketsphinx decoder is not created here; call Initialize().
SpeechRecognizer::SpeechRecognizer(Context* ctx) : Object(ctx),
    decoder_(nullptr),
    inUtterance_(false)
{
}
/// Destruct. Frees the pocketsphinx decoder if one exists.
SpeechRecognizer::~SpeechRecognizer()
{
    Release();
}
/// Register the object factory so SpeechRecognizer can be created through the Context.
void SpeechRecognizer::RegisterObject(Context* ctx)
{
    ctx->RegisterFactory<SpeechRecognizer>();
}
/// Create the pocketsphinx decoder for the given language.
/// Model files are expected under <ProgramDir>/SpeechModels/<language>/:
///   <language>/        acoustic model (HMM) directory
///   <language>.lm.bin  binary language model (preferred), else <language>.lm
///   <language>.dict    pronunciation dictionary
/// Any previously created decoder is released first.
/// Returns false (and logs an error) if the decoder could not be created.
bool SpeechRecognizer::Initialize(const String& language)
{
    // default is en-us
    Release();
    auto fileSystem = GetContext()->GetSubsystem<FileSystem>();
    auto modelDir = fileSystem->GetProgramDir();
    modelDir = AddTrailingSlash(AddTrailingSlash(modelDir) + "SpeechModels");
    modelDir = modelDir + AddTrailingSlash(language);
    // C:\dev\Urho3D\bin\SpeechModels\en-us
    auto HMM = modelDir + language;
    // check for binary LM first
    auto LM = modelDir + language + ".lm.bin";
    if (!fileSystem->FileExists(LM))
        LM = modelDir + language + ".lm";
    auto DICT = modelDir + language + ".dict";
    // 16 kHz sample rate with a 512-point FFT; "-bestpath yes" enables the
    // global best-path search over the word lattice.
    cmd_ln_t* config = cmd_ln_init(nullptr, ps_args(), TRUE,
        "-hmm", HMM.CString(),
        "-lm", LM.CString(),
        "-dict", DICT.CString(),
        "-samprate", "16000",
        "-nfft", "512",
        "-bestpath", "yes",
        NULL);
    if (config == nullptr)
    {
        URHO3D_LOGERRORF("Could not initialize speech recognition for language: %s", language.CString());
        return false;
    }
    decoder_ = ps_init(config);
    // NOTE(review): assumes ps_init() retains its own reference to the config
    // (pocketsphinx >= 0.8 behavior) so dropping ours here is safe either way
    // — confirm against the linked pocketsphinx version.
    cmd_ln_free_r(config);
    if (decoder_ == nullptr)
    {
        URHO3D_LOGERRORF("Could not initialize speech recognition for language: %s", language.CString());
        return false;
    }
    return true;
}
/// Destroy the decoder (if any) and reset utterance tracking state.
void SpeechRecognizer::Release()
{
    if (decoder_ != nullptr)
    {
        ps_free(decoder_);
        decoder_ = nullptr;
    }
    inUtterance_ = false;
}
/// Pump raw 16-bit mono samples into the recognizer.
/// Starts an utterance on first data, emits E_SPEECHACTIVITY every call and
/// E_SPEECHRESULT whenever the hypothesis changes during active speech.
/// @param data     sample buffer (16-bit signed PCM)
/// @param sampleCt number of samples in the buffer
/// @param score    optional out-parameter receiving the hypothesis score
/// @return the current hypothesis text (possibly empty)
String SpeechRecognizer::AddData(int16_t* data, int sampleCt, int* score)
{
    if (decoder_ == nullptr)
        return String();
    // BUGFIX: original guard was `data && sampleCt <= 0`, which let a null
    // buffer with a positive sample count fall through into ps_process_raw.
    if (data == nullptr || sampleCt <= 0)
        return String();

    MutexLock locked(lock_);

    if (!inUtterance_)
    {
        // Begin a fresh utterance; bail out if the decoder refuses.
        if (ps_start_utt(decoder_) < 0)
        {
            URHO3D_LOGERROR("SpeechRecognizer: ps_start_utt failed");
            return String();
        }
        inUtterance_ = true;
        lastText_ = String();
    }

    // FALSE/FALSE: data is not pre-searched and is not a complete utterance.
    if (ps_process_raw(decoder_, data, sampleCt, FALSE, FALSE) < 0)
        URHO3D_LOGERROR("SpeechRecognizer: ps_process_raw failed");

    // Report voice-activity state on every pump so listeners can track it.
    const int activity = ps_get_in_speech(decoder_);
    {
        auto& eventData = GetEventDataMap();
        eventData[SpeechActivity::P_STATE] = activity == 1;
        SendEvent(E_SPEECHACTIVITY, eventData);
    }

    int deadScore = 0;
    int* writeScore = score ? score : &deadScore;
    String text = ps_get_hyp(decoder_, writeScore);

    // Only broadcast when in active speech and the hypothesis actually changed.
    if (text.Length() > 0 && activity != 0 && text != lastText_)
    {
        lastText_ = text;
        auto& eventData = GetEventDataMap();
        eventData[SpeechResult::P_TEXT] = text;
        eventData[SpeechResult::P_TRUST] = *writeScore;
        SendEvent(E_SPEECHRESULT, eventData);
    }
    return text;
}
/// Finalize the current utterance (if any) and return the final hypothesis.
/// Emits E_SPEECHRESULT and a false E_SPEECHACTIVITY when text was produced.
/// @param score optional out-parameter receiving the hypothesis score
/// @return the final hypothesis text, or empty if no utterance was active
String SpeechRecognizer::Finish(int* score)
{
    if (decoder_ == nullptr)
        return String();

    MutexLock locked(lock_);
    if (!inUtterance_)
        return String();

    ps_end_utt(decoder_);
    // BUGFIX: without clearing this flag the next AddData() call would keep
    // pumping audio into the already-ended utterance instead of starting a
    // new one via ps_start_utt.
    inUtterance_ = false;

    int deadScore = 0;
    int* writeScore = score ? score : &deadScore;
    String text = ps_get_hyp(decoder_, writeScore);
    if (text.Length())
    {
        auto& eventData = GetEventDataMap();
        eventData[SpeechResult::P_TEXT] = text;
        // BUGFIX: was storing the raw `score` pointer in the variant rather
        // than the confidence value it points at (cf. AddData).
        eventData[SpeechResult::P_TRUST] = *writeScore;
        SendEvent(E_SPEECHRESULT, eventData);

        eventData.Clear();
        eventData[SpeechActivity::P_STATE] = false;
        SendEvent(E_SPEECHACTIVITY, eventData);
        // NOTE(review): E_SPEECHENDED is declared in SpeechEvents.h but never
        // sent; confirm whether listeners expect it here.
    }
    lastText_ = text;
    return text;
}
/// Query whether the decoder currently detects active speech.
bool SpeechRecognizer::InSpeech() const
{
    if (decoder_ == nullptr)
        return false;
    MutexLock locked(lock_);
    return ps_get_in_speech(decoder_) == 1;
}
/// Connect a microphone's recording events to this recognizer.
/// Returns false if the mic is null or no decoder has been initialized.
bool SpeechRecognizer::Link(SharedPtr<Microphone> mic)
{
    if (decoder_ == nullptr || mic.Null())
        return false;
    SubscribeToEvent(E_RECORDINGUPDATED, URHO3D_HANDLER(SpeechRecognizer, HandleMicEvent));
    return true;
}
/// Pump freshly recorded microphone samples into the recognizer, then flag
/// the microphone's buffer for clearing.
void SpeechRecognizer::HandleMicEvent(StringHash evt, VariantMap& eventData)
{
    auto srcObject = eventData[RecordingUpdated::P_MICROPHONE].GetPtr();
    if (srcObject == nullptr)
        return;

    auto microphone = dynamic_cast<Microphone*>(srcObject);
    if (microphone == nullptr)
        return;

    auto& samples = microphone->GetData();
    if (samples.Empty())
        return;

    AddData(samples.Buffer(), samples.Size(), nullptr);
    eventData[RecordingUpdated::P_CLEARDATA] = true;
}
/// One-shot recognition of a complete in-memory sound.
/// Only 16-bit mono PCM is accepted; returns empty on any failure or if a
/// streaming utterance is already in progress.
String SpeechRecognizer::Recognize(Sound* snd)
{
    if (snd == nullptr || !snd->IsSixteenBit() || snd->IsStereo())
        return String();
    if (decoder_ == nullptr)
        return String();

    // BUGFIX: serialize decoder access — AddData()/Finish()/InSpeech() all
    // lock, but this method touched the decoder unguarded. The utterance
    // check is now made under the same lock.
    MutexLock locked(lock_);
    if (inUtterance_)
        return String();

    if (ps_start_utt(decoder_) < 0)
        return String();
    const int result = ps_process_raw(decoder_,
        (int16_t*)snd->GetData().Get(),
        snd->GetDataSize() / sizeof(short),
        0 /* no_search */, 1 /* full_utt */);
    ps_end_utt(decoder_);
    if (result < 0)
        return String();

    int score = 0;
    return String(ps_get_hyp(decoder_, &score));
}
/// One-shot recognition of an entire sound stream, consumed to exhaustion.
/// Only 16-bit mono PCM is accepted; returns empty on any failure or if a
/// streaming utterance is already in progress.
String SpeechRecognizer::Recognize(SoundStream* stream)
{
    if (stream == nullptr || !stream->IsSixteenBit() || stream->IsStereo())
        return String();
    if (decoder_ == nullptr)
        return String();

    // Serialize decoder access, consistent with AddData()/Finish().
    MutexLock locked(lock_);
    if (inUtterance_)
        return String();

    if (ps_start_utt(decoder_) < 0)
        return String();

    short tempData[4096];
    unsigned bytesRead = 0;
    do {
        // Use the full buffer (the original only requested 4096 of the
        // available 8192 bytes per pass — harmless, just wasteful).
        bytesRead = stream->GetData((signed char*)tempData, sizeof(tempData));
        if (bytesRead > 0)
        {
            // BUGFIX: the original passed the *byte count* cast to a pointer
            // ((short*)bytesRead) instead of the sample buffer itself, which
            // would crash or decode garbage.
            ps_process_raw(decoder_, tempData, bytesRead / sizeof(short), 0, 0);
        }
    } while (bytesRead);
    ps_end_utt(decoder_);

    int score = 0;
    return String(ps_get_hyp(decoder_, &score));
}
}
#pragma once
#include "../Core/Object.h"
#include "../Core/Mutex.h"
#include <pocketsphinx.h>
namespace Urho3D
{
class Sound;
class SoundStream;
class Microphone;
/// Speech-to-text recognizer backed by pocketsphinx.
/// Feed 16-bit mono PCM through AddData() (or Link() a Microphone to pump
/// automatically) and listen for E_SPEECHRESULT / E_SPEECHACTIVITY events,
/// or use the blocking Recognize() helpers for whole sounds/streams.
class URHO3D_API SpeechRecognizer : public Object
{
    URHO3D_OBJECT(SpeechRecognizer, Object);
public:
    /// Construct; decoder is created later via Initialize().
    SpeechRecognizer(Context*);
    /// Destruct; releases the decoder.
    virtual ~SpeechRecognizer();
    /// Register object factory.
    static void RegisterObject(Context*);
    /// Build the decoder from models under <ProgramDir>/SpeechModels/<language>/.
    bool Initialize(const String& language = "en-us");
    /// Destroy the decoder and reset utterance state.
    void Release();
    /// Pump data into the recognizer.
    String AddData(int16_t* data, int sampleCt, int* trust = nullptr);
    /// Finalizes the current processing task.
    String Finish(int* trust = nullptr);
    /// Returns true if the recognizer is currently processing text.
    bool InSpeech() const;
    /// Tests whether this recognizer is in a probably valid state.
    bool IsAlive() const { return decoder_ != nullptr; }
    /// Utility for connecting a microphone with this recognizer.
    bool Link(SharedPtr<Microphone>);
    /// Most recent hypothesis text produced by AddData()/Finish().
    String GetLastText() const { return lastText_; }
    /// Blocking one-shot recognition of a whole in-memory sound.
    String Recognize(Sound*);
    /// Blocking one-shot recognition of an entire sound stream.
    String Recognize(SoundStream*);
private:
    /// Utility for pumping microphone data right into the recognizer.
    void HandleMicEvent(StringHash, VariantMap&);
    /// pocketsphinx decoder handle; null until Initialize() succeeds.
    ps_decoder_t* decoder_ = nullptr;
    /// Guards decoder access across the pumping/query methods.
    mutable Mutex lock_;
    /// True while an utterance is open (between ps_start_utt and ps_end_utt).
    bool inUtterance_ = false;
    /// Most recent hypothesis text.
    String lastText_;
};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment