Created
April 9, 2025 08:55
-
-
Save josejuan/a845b87db7b1e02c8a5b7c1bf12f34a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "whisper.h"

#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <string>
#include <system_error>
#include <thread>
#include <vector>

#include <inotifytools/inotifytools.h>
#include <sys/inotify.h>
// Short alias for the standard filesystem namespace, used throughout the file.
namespace fs = std::filesystem;

// Sample rate in Hz that load_wav_mono() requires of its input files.
#define WHISPER_SAMPLE_RATE 16000

// Main-loop flag: non-zero while the watcher should keep running.
// It is written from signal_handler(), so it is a volatile std::sig_atomic_t,
// a type the C++ standard guarantees may be assigned from a signal handler.
volatile std::sig_atomic_t running = 1;

// Signal handler: requests shutdown by clearing `running`.
// The signal number is ignored.
void signal_handler(int) {
    running = 0;
}
// Waits until the size of `path` stops changing.
//
// Takes a baseline size reading, then up to `retries` times sleeps 500 ms and
// re-reads the size. Returns true as soon as two consecutive readings agree.
// Returns false if the size is still changing after `retries` checks, or if
// the size cannot be read at all (e.g. the file does not exist or was removed
// while waiting). Never throws for filesystem errors.
bool wait_for_file_complete(const fs::path &path, int retries = 10) {
    using namespace std::chrono_literals;

    std::error_code ec;

    // Real baseline reading: comparing the first sample against a made-up 0
    // would report an empty (possibly just-created) file as complete at once.
    auto last_size = fs::file_size(path, ec);
    if (ec) return false;

    for (int attempt = 0; attempt < retries; ++attempt) {
        // Sleep before each check so no time is wasted after the final one.
        std::this_thread::sleep_for(500ms);

        const auto cur_size = fs::file_size(path, ec);
        if (ec) return false;
        if (cur_size == last_size) return true;
        last_size = cur_size;
    }
    return false;
}
// Loads a mono, 16-bit, WHISPER_SAMPLE_RATE Hz WAV file into float samples.
//
// The file is read fully into memory and its RIFF chunk list is walked to
// locate the "fmt " and "data" chunks, so files that carry extra chunks
// (e.g. "LIST" metadata) before or after the audio are decoded correctly
// instead of assuming a fixed 44-byte header.
//
// @param path    Path of the WAV file to read.
// @param pcmf32  Output: resized to the number of samples and filled with
//                values sample / 32768.0f. Left untouched on failure.
// @return true on success; false if the file cannot be opened or read, is not
//         a RIFF/WAVE file, lacks a usable "fmt " chunk followed by a "data"
//         chunk, or is not mono / 16-bit / WHISPER_SAMPLE_RATE (the latter
//         case also prints a message to stderr).
bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) {
    // std::ifstream closes the file on every exit path, including exceptions.
    std::ifstream in(path, std::ios::binary);
    if (!in) return false;

    in.seekg(0, std::ios::end);
    const std::streamoff len = in.tellg();
    if (len < 0) return false;
    in.seekg(0, std::ios::beg);

    std::vector<uint8_t> buf(static_cast<std::size_t>(len));
    if (!buf.empty() && !in.read(reinterpret_cast<char *>(buf.data()), len)) {
        return false;  // short read or I/O error
    }

    // Little-endian field readers. Arithmetic is done in unsigned 32-bit so
    // that shifting a byte by 24 can never touch a sign bit.
    const auto read_u16 = [&buf](std::size_t off) -> std::uint32_t {
        return std::uint32_t{buf[off]} | (std::uint32_t{buf[off + 1]} << 8);
    };
    const auto read_u32 = [&buf](std::size_t off) -> std::uint32_t {
        return std::uint32_t{buf[off]} | (std::uint32_t{buf[off + 1]} << 8) |
               (std::uint32_t{buf[off + 2]} << 16) | (std::uint32_t{buf[off + 3]} << 24);
    };
    const auto tag_is = [&buf](std::size_t off, const char *tag) {
        for (std::size_t k = 0; k < 4; ++k) {
            if (buf[off + k] != static_cast<uint8_t>(tag[k])) return false;
        }
        return true;
    };

    // RIFF container header: "RIFF" <u32 size> "WAVE".
    if (buf.size() < 12 || !tag_is(0, "RIFF") || !tag_is(8, "WAVE")) return false;

    bool have_fmt = false;
    bool have_data = false;
    std::uint32_t channels = 0;
    std::uint32_t sample_rate = 0;
    std::uint32_t bits_per_sample = 0;
    std::size_t data_offset = 0;
    std::size_t data_bytes = 0;

    // Walk the chunk list. Invariant: pos <= buf.size().
    std::size_t pos = 12;
    while (buf.size() - pos >= 8) {
        const std::size_t body = pos + 8;
        const std::size_t available = buf.size() - body;
        const std::size_t declared = read_u32(pos + 4);
        // Clamp to what is really in the file (truncated files, or writers
        // that store an oversized placeholder length).
        const std::size_t size = declared < available ? declared : available;

        if (tag_is(pos, "fmt ")) {
            if (size < 16) return false;
            channels = read_u16(body + 2);
            sample_rate = read_u32(body + 4);
            bits_per_sample = read_u16(body + 14);
            have_fmt = true;
        } else if (tag_is(pos, "data")) {
            data_offset = body;
            data_bytes = size;
            have_data = true;
            break;
        }

        pos = body + size;
        // Chunk bodies are padded to an even length.
        if ((size & 1U) != 0 && pos < buf.size()) ++pos;
    }
    if (!have_fmt || !have_data) return false;

    if (channels != 1 || sample_rate != WHISPER_SAMPLE_RATE || bits_per_sample != 16) {
        std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n";
        return false;
    }

    const std::size_t num_samples = data_bytes / 2;  // a trailing odd byte is ignored
    pcmf32.resize(num_samples);
    for (std::size_t i = 0; i < num_samples; ++i) {
        const auto bits = static_cast<std::uint16_t>(read_u16(data_offset + 2 * i));
        const auto sample = static_cast<int16_t>(bits);
        pcmf32[i] = sample / 32768.0f;
    }
    return true;
}
void transcribe_file(const fs::path &path, struct whisper_context *ctx) { | |
std::string name = path.filename(); | |
std::smatch m; | |
std::regex rx(R"(^(en|es)-(.+)\.wav$)"); | |
if (!std::regex_match(name, m, rx)) { | |
std::cerr << "Ignoring invalid file: " << name << "\n"; | |
return; | |
} | |
std::string lang = m[1]; | |
std::string base = m[2]; | |
std::vector<float> pcmf32; | |
if (!wait_for_file_complete(path)) { | |
std::cerr << "File incomplete: " << name << "\n"; | |
return; | |
} | |
if (!load_wav_mono(path, pcmf32)) { | |
std::cerr << "Failed to load: " << name << "\n"; | |
return; | |
} | |
whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); | |
params.print_progress = false; | |
params.print_special = false; | |
params.print_realtime = false; | |
params.print_timestamps = false; | |
params.translate = (lang == "en"); | |
params.language = "es"; | |
if (whisper_full(ctx, params, pcmf32.data(), pcmf32.size()) != 0) { | |
std::cerr << "whisper_full() failed\n"; | |
return; | |
} | |
std::string transcription; | |
int n_segments = whisper_full_n_segments(ctx); | |
for (int i = 0; i < n_segments; ++i) { | |
transcription += whisper_full_get_segment_text(ctx, i); | |
} | |
std::string escaped; | |
for (char c : transcription) { | |
if (c == '"' || c == '$' || c == '`' || c == '\\') | |
escaped += '\\'; | |
escaped += c; | |
} | |
std::string cmd = "xdotool type \"" + escaped + "\""; | |
int ret = std::system(cmd.c_str()); | |
if (ret != 0) { | |
std::cerr << "xdotool failed\n"; | |
} else { | |
std::cout << "Typed: " << name << "\n"; | |
fs::remove(path); | |
} | |
} | |
int main(int argc, char **argv) { | |
if (argc != 2) { | |
std::cerr << "usage: " << argv[0] << " <watch_dir>\n"; | |
return 1; | |
} | |
const fs::path watch_dir = argv[1]; | |
const std::string model_path = "./models/ggml-large-v3.bin"; | |
if (!fs::exists(watch_dir) || !fs::is_directory(watch_dir)) { | |
std::cerr << "Invalid directory\n"; | |
return 1; | |
} | |
struct whisper_context *ctx = whisper_init_from_file(model_path.c_str()); | |
if (!ctx) { | |
std::cerr << "failed to load model\n"; | |
return 2; | |
} | |
signal(SIGINT, signal_handler); | |
if (!inotifytools_initialize() || !inotifytools_watch_recursively(watch_dir.c_str(), IN_CLOSE_WRITE)) { | |
std::cerr << "inotify init failed\n"; | |
return 3; | |
} | |
std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n"; | |
while (running) { | |
const struct inotify_event *evt = inotifytools_next_event(-1); | |
if (!evt) continue; | |
fs::path fname = evt->name; | |
if (fname.extension() == ".wav") { | |
fs::path fullpath = watch_dir / fname; | |
transcribe_file(fullpath, ctx); | |
} | |
} | |
whisper_free(ctx); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment