Skip to content

Instantly share code, notes, and snippets.

@JSandusky
Created December 31, 2024 02:00
Show Gist options
  • Save JSandusky/b1c284228be8566f033584aedca4ee2d to your computer and use it in GitHub Desktop.
Urho3D speech recognition via pocketsphinx
#pragma once
#include "../Core/Object.h"
namespace Urho3D
{

/// A new or updated recognition hypothesis is available. Sent by SpeechRecognizer.
/// (Previous comment was a stale copy-paste from a sound-playback event.)
URHO3D_EVENT(E_SPEECHRESULT, SpeechResult)
{
    URHO3D_PARAM(P_TEXT, Text); // String
    URHO3D_PARAM(P_TRUST, Trust); // int trust factor
}

/// Recognition has started.
URHO3D_EVENT(E_SPEECHSTARTED, SpeechStarted)
{
}

/// Voice-activity state changed or was re-reported while pumping audio.
URHO3D_EVENT(E_SPEECHACTIVITY, SpeechActivity)
{
    URHO3D_PARAM(P_STATE, State); // bool true if in active speech
}

/// Recognition of an utterance has finished.
/// NOTE(review): declared but not sent anywhere in the visible code — confirm intended use.
URHO3D_EVENT(E_SPEECHENDED, SpeechEnded)
{
    URHO3D_PARAM(P_TEXT, Text); // String
    URHO3D_PARAM(P_TRUST, Trust); // int trust factor
}

}
#include "SpeechEvents.h"
#include "SpeechRecognizer.h"
#include "../Audio/AudioEvents.h"
#include "../Core/Context.h"
#include "../IO/FileSystem.h"
#include "../IO/Log.h"
#include "../Audio/Microphone.h"
#include "../Audio/Sound.h"
#include "../Audio/SoundStream.h"
#include <pocketsphinx.h>
namespace Urho3D
{
/// Construct. The pocketsphinx decoder is not created here; call Initialize().
SpeechRecognizer::SpeechRecognizer(Context* ctx) : Object(ctx),
    decoder_(nullptr),
    inUtterance_(false)
{
}
/// Destruct. Frees the pocketsphinx decoder if one exists.
SpeechRecognizer::~SpeechRecognizer()
{
    Release();
}
/// Register the object factory so SpeechRecognizer can be created through the Context.
void SpeechRecognizer::RegisterObject(Context* ctx)
{
    ctx->RegisterFactory<SpeechRecognizer>();
}
/// Create the pocketsphinx decoder for the given language.
/// Model files are expected under <ProgramDir>/SpeechModels/<language>/:
///   <language>/        acoustic model (HMM) directory
///   <language>.lm.bin  binary language model (preferred), else <language>.lm
///   <language>.dict    pronunciation dictionary
/// Any previously created decoder is released first.
/// Returns false (and logs an error) if the decoder could not be created.
bool SpeechRecognizer::Initialize(const String& language)
{
    // default is en-us
    Release();
    auto fileSystem = GetContext()->GetSubsystem<FileSystem>();
    auto modelDir = fileSystem->GetProgramDir();
    modelDir = AddTrailingSlash(AddTrailingSlash(modelDir) + "SpeechModels");
    modelDir = modelDir + AddTrailingSlash(language);
    // C:\dev\Urho3D\bin\SpeechModels\en-us
    auto HMM = modelDir + language;
    // check for binary LM first
    auto LM = modelDir + language + ".lm.bin";
    if (!fileSystem->FileExists(LM))
        LM = modelDir + language + ".lm";
    auto DICT = modelDir + language + ".dict";
    // 16 kHz sample rate with a 512-point FFT; "-bestpath yes" enables the
    // global best-path search over the word lattice.
    cmd_ln_t* config = cmd_ln_init(nullptr, ps_args(), TRUE,
        "-hmm", HMM.CString(),
        "-lm", LM.CString(),
        "-dict", DICT.CString(),
        "-samprate", "16000",
        "-nfft", "512",
        "-bestpath", "yes",
        NULL);
    if (config == nullptr)
    {
        URHO3D_LOGERRORF("Could not initialize speech recognition for language: %s", language.CString());
        return false;
    }
    decoder_ = ps_init(config);
    // NOTE(review): assumes ps_init() retains its own reference to the config
    // (pocketsphinx >= 0.8 behavior) so dropping ours here is safe either way
    // — confirm against the linked pocketsphinx version.
    cmd_ln_free_r(config);
    if (decoder_ == nullptr)
    {
        URHO3D_LOGERRORF("Could not initialize speech recognition for language: %s", language.CString());
        return false;
    }
    return true;
}
/// Destroy the decoder (if any) and reset utterance tracking state.
void SpeechRecognizer::Release()
{
    if (decoder_ != nullptr)
    {
        ps_free(decoder_);
        decoder_ = nullptr;
    }
    inUtterance_ = false;
}
/// Pump raw 16-bit mono samples into the recognizer.
/// Starts an utterance on first data, emits E_SPEECHACTIVITY every call and
/// E_SPEECHRESULT whenever the hypothesis changes during active speech.
/// @param data     sample buffer (16-bit signed PCM)
/// @param sampleCt number of samples in the buffer
/// @param score    optional out-parameter receiving the hypothesis score
/// @return the current hypothesis text (possibly empty)
String SpeechRecognizer::AddData(int16_t* data, int sampleCt, int* score)
{
    if (decoder_ == nullptr)
        return String();
    // BUGFIX: original guard was `data && sampleCt <= 0`, which let a null
    // buffer with a positive sample count fall through into ps_process_raw.
    if (data == nullptr || sampleCt <= 0)
        return String();

    MutexLock locked(lock_);

    if (!inUtterance_)
    {
        // Begin a fresh utterance; bail out if the decoder refuses.
        if (ps_start_utt(decoder_) < 0)
        {
            URHO3D_LOGERROR("SpeechRecognizer: ps_start_utt failed");
            return String();
        }
        inUtterance_ = true;
        lastText_ = String();
    }

    // FALSE/FALSE: data is not pre-searched and is not a complete utterance.
    if (ps_process_raw(decoder_, data, sampleCt, FALSE, FALSE) < 0)
        URHO3D_LOGERROR("SpeechRecognizer: ps_process_raw failed");

    // Report voice-activity state on every pump so listeners can track it.
    const int activity = ps_get_in_speech(decoder_);
    {
        auto& eventData = GetEventDataMap();
        eventData[SpeechActivity::P_STATE] = activity == 1;
        SendEvent(E_SPEECHACTIVITY, eventData);
    }

    int deadScore = 0;
    int* writeScore = score ? score : &deadScore;
    String text = ps_get_hyp(decoder_, writeScore);

    // Only broadcast when in active speech and the hypothesis actually changed.
    if (text.Length() > 0 && activity != 0 && text != lastText_)
    {
        lastText_ = text;
        auto& eventData = GetEventDataMap();
        eventData[SpeechResult::P_TEXT] = text;
        eventData[SpeechResult::P_TRUST] = *writeScore;
        SendEvent(E_SPEECHRESULT, eventData);
    }
    return text;
}
/// Finalize the current utterance (if any) and return the final hypothesis.
/// Emits E_SPEECHRESULT and a false E_SPEECHACTIVITY when text was produced.
/// @param score optional out-parameter receiving the hypothesis score
/// @return the final hypothesis text, or empty if no utterance was active
String SpeechRecognizer::Finish(int* score)
{
    if (decoder_ == nullptr)
        return String();

    MutexLock locked(lock_);
    if (!inUtterance_)
        return String();

    ps_end_utt(decoder_);
    // BUGFIX: without clearing this flag the next AddData() call would keep
    // pumping audio into the already-ended utterance instead of starting a
    // new one via ps_start_utt.
    inUtterance_ = false;

    int deadScore = 0;
    int* writeScore = score ? score : &deadScore;
    String text = ps_get_hyp(decoder_, writeScore);
    if (text.Length())
    {
        auto& eventData = GetEventDataMap();
        eventData[SpeechResult::P_TEXT] = text;
        // BUGFIX: was storing the raw `score` pointer in the variant rather
        // than the confidence value it points at (cf. AddData).
        eventData[SpeechResult::P_TRUST] = *writeScore;
        SendEvent(E_SPEECHRESULT, eventData);

        eventData.Clear();
        eventData[SpeechActivity::P_STATE] = false;
        SendEvent(E_SPEECHACTIVITY, eventData);
        // NOTE(review): E_SPEECHENDED is declared in SpeechEvents.h but never
        // sent; confirm whether listeners expect it here.
    }
    lastText_ = text;
    return text;
}
/// Query whether the decoder currently detects active speech.
bool SpeechRecognizer::InSpeech() const
{
    if (decoder_ == nullptr)
        return false;
    MutexLock locked(lock_);
    return ps_get_in_speech(decoder_) == 1;
}
/// Connect a microphone's recording events to this recognizer.
/// Returns false if the mic is null or no decoder has been initialized.
bool SpeechRecognizer::Link(SharedPtr<Microphone> mic)
{
    if (decoder_ == nullptr || mic.Null())
        return false;
    SubscribeToEvent(E_RECORDINGUPDATED, URHO3D_HANDLER(SpeechRecognizer, HandleMicEvent));
    return true;
}
/// Pump freshly recorded microphone samples into the recognizer, then flag
/// the microphone's buffer for clearing.
void SpeechRecognizer::HandleMicEvent(StringHash evt, VariantMap& eventData)
{
    auto srcObject = eventData[RecordingUpdated::P_MICROPHONE].GetPtr();
    if (srcObject == nullptr)
        return;

    auto microphone = dynamic_cast<Microphone*>(srcObject);
    if (microphone == nullptr)
        return;

    auto& samples = microphone->GetData();
    if (samples.Empty())
        return;

    AddData(samples.Buffer(), samples.Size(), nullptr);
    eventData[RecordingUpdated::P_CLEARDATA] = true;
}
/// One-shot recognition of a complete in-memory sound.
/// Only 16-bit mono PCM is accepted; returns empty on any failure or if a
/// streaming utterance is already in progress.
String SpeechRecognizer::Recognize(Sound* snd)
{
    if (snd == nullptr || !snd->IsSixteenBit() || snd->IsStereo())
        return String();
    if (decoder_ == nullptr)
        return String();

    // BUGFIX: serialize decoder access — AddData()/Finish()/InSpeech() all
    // lock, but this method touched the decoder unguarded. The utterance
    // check is now made under the same lock.
    MutexLock locked(lock_);
    if (inUtterance_)
        return String();

    if (ps_start_utt(decoder_) < 0)
        return String();
    const int result = ps_process_raw(decoder_,
        (int16_t*)snd->GetData().Get(),
        snd->GetDataSize() / sizeof(short),
        0 /* no_search */, 1 /* full_utt */);
    ps_end_utt(decoder_);
    if (result < 0)
        return String();

    int score = 0;
    return String(ps_get_hyp(decoder_, &score));
}
/// One-shot recognition of an entire sound stream, consumed to exhaustion.
/// Only 16-bit mono PCM is accepted; returns empty on any failure or if a
/// streaming utterance is already in progress.
String SpeechRecognizer::Recognize(SoundStream* stream)
{
    if (stream == nullptr || !stream->IsSixteenBit() || stream->IsStereo())
        return String();
    if (decoder_ == nullptr)
        return String();

    // Serialize decoder access, consistent with AddData()/Finish().
    MutexLock locked(lock_);
    if (inUtterance_)
        return String();

    if (ps_start_utt(decoder_) < 0)
        return String();

    short tempData[4096];
    unsigned bytesRead = 0;
    do {
        // Use the full buffer (the original only requested 4096 of the
        // available 8192 bytes per pass — harmless, just wasteful).
        bytesRead = stream->GetData((signed char*)tempData, sizeof(tempData));
        if (bytesRead > 0)
        {
            // BUGFIX: the original passed the *byte count* cast to a pointer
            // ((short*)bytesRead) instead of the sample buffer itself, which
            // would crash or decode garbage.
            ps_process_raw(decoder_, tempData, bytesRead / sizeof(short), 0, 0);
        }
    } while (bytesRead);
    ps_end_utt(decoder_);

    int score = 0;
    return String(ps_get_hyp(decoder_, &score));
}
}
#pragma once
#include "../Core/Object.h"
#include "../Core/Mutex.h"
#include <pocketsphinx.h>
namespace Urho3D
{
class Sound;
class SoundStream;
class Microphone;
/// Speech-to-text recognizer backed by pocketsphinx.
/// Feed 16-bit mono PCM through AddData() (or Link() a Microphone to pump
/// automatically) and listen for E_SPEECHRESULT / E_SPEECHACTIVITY events,
/// or use the blocking Recognize() helpers for whole sounds/streams.
class URHO3D_API SpeechRecognizer : public Object
{
    URHO3D_OBJECT(SpeechRecognizer, Object);
public:
    /// Construct; decoder is created later via Initialize().
    SpeechRecognizer(Context*);
    /// Destruct; releases the decoder.
    virtual ~SpeechRecognizer();
    /// Register object factory.
    static void RegisterObject(Context*);
    /// Build the decoder from models under <ProgramDir>/SpeechModels/<language>/.
    bool Initialize(const String& language = "en-us");
    /// Destroy the decoder and reset utterance state.
    void Release();
    /// Pump data into the recognizer.
    String AddData(int16_t* data, int sampleCt, int* trust = nullptr);
    /// Finalizes the current processing task.
    String Finish(int* trust = nullptr);
    /// Returns true if the recognizer is currently processing text.
    bool InSpeech() const;
    /// Tests whether this recognizer is in a probably valid state.
    bool IsAlive() const { return decoder_ != nullptr; }
    /// Utility for connecting a microphone with this recognizer.
    bool Link(SharedPtr<Microphone>);
    /// Most recent hypothesis text produced by AddData()/Finish().
    String GetLastText() const { return lastText_; }
    /// Blocking one-shot recognition of a whole in-memory sound.
    String Recognize(Sound*);
    /// Blocking one-shot recognition of an entire sound stream.
    String Recognize(SoundStream*);
private:
    /// Utility for pumping microphone data right into the recognizer.
    void HandleMicEvent(StringHash, VariantMap&);
    /// pocketsphinx decoder handle; null until Initialize() succeeds.
    ps_decoder_t* decoder_ = nullptr;
    /// Guards decoder access across the pumping/query methods.
    mutable Mutex lock_;
    /// True while an utterance is open (between ps_start_utt and ps_end_utt).
    bool inUtterance_ = false;
    /// Most recent hypothesis text.
    String lastText_;
};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment