Skip to content

Instantly share code, notes, and snippets.

@josejuan
Created April 9, 2025 08:55
Show Gist options
  • Save josejuan/a845b87db7b1e02c8a5b7c1bf12f34a1 to your computer and use it in GitHub Desktop.
Save josejuan/a845b87db7b1e02c8a5b7c1bf12f34a1 to your computer and use it in GitHub Desktop.
#include "whisper.h"

#include <inotifytools/inotifytools.h>
#include <sys/inotify.h>

#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <string>
#include <system_error>
#include <thread>
#include <vector>
namespace fs = std::filesystem;
#define WHISPER_SAMPLE_RATE 16000
// Main-loop flag, cleared from the SIGINT handler.
// `volatile std::sig_atomic_t` is the object type the standard guarantees can
// be written from a signal handler and read from normal code; `volatile bool`
// carries no such guarantee.
volatile std::sig_atomic_t running = 1;

// Signal handler: only clears the flag (the signal number is ignored) so the
// main loop can exit on its next check.
void signal_handler(int) {
    running = 0;
}
// Waits until the size of `path` stops changing.
//
// The size is sampled up to `retries` times, 500 ms apart. The file is
// considered complete as soon as two consecutive samples are equal.
//
// Returns true when the size was observed to be stable, false when it was
// still changing after `retries` samples or when the size could not be read
// (for example because the file has disappeared).
bool wait_for_file_complete(const fs::path &path, int retries = 10) {
    using namespace std::chrono_literals;

    bool have_previous = false;      // no baseline until the first sample is taken
    std::uintmax_t previous_size = 0;

    for (int attempt = 0; attempt < retries; ++attempt) {
        // Non-throwing overload: a vanished file must not take the watcher down.
        std::error_code ec;
        const std::uintmax_t current_size = fs::file_size(path, ec);
        if (ec) {
            return false;
        }

        // Only compare against a real earlier sample, so that a still-empty
        // file is not reported as complete on the very first look.
        if (have_previous && current_size == previous_size) {
            return true;
        }
        have_previous = true;
        previous_size = current_size;

        // No point in sleeping when there is no further sample to take.
        if (attempt + 1 < retries) {
            std::this_thread::sleep_for(500ms);
        }
    }
    return false;
}
// Loads a mono, 16-bit PCM, WHISPER_SAMPLE_RATE Hz WAV file into `pcmf32`.
//
// The RIFF chunk list is walked to locate the "fmt " and "data" chunks instead
// of assuming the samples start at byte 44; files that carry extra chunks
// (e.g. "LIST" metadata) or an extended "fmt " chunk are therefore decoded
// correctly.
//
// path:   file to read.
// pcmf32: receives the samples scaled by 1/32768 (so in [-1, 1)); only
//         modified on success.
//
// Returns true on success. Returns false if the file cannot be read, is not a
// RIFF/WAVE file, has no usable "fmt "/"data" chunks, or is not in the
// expected format (in which case a message is written to std::cerr).
bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) {
    std::ifstream in(path, std::ios::binary);
    if (!in) return false;

    in.seekg(0, std::ios::end);
    const std::streamoff file_len = in.tellg();
    if (file_len < 0) return false;
    in.seekg(0, std::ios::beg);

    std::vector<std::uint8_t> buf(static_cast<std::size_t>(file_len));
    if (!buf.empty() &&
        !in.read(reinterpret_cast<char *>(buf.data()), static_cast<std::streamsize>(buf.size()))) {
        return false;  // short read
    }

    // Little-endian field readers; callers guarantee the offsets are in range.
    const auto u16 = [&buf](std::size_t off) -> std::uint32_t {
        return static_cast<std::uint32_t>(buf[off]) |
               (static_cast<std::uint32_t>(buf[off + 1]) << 8);
    };
    const auto u32 = [&buf](std::size_t off) -> std::uint32_t {
        return static_cast<std::uint32_t>(buf[off]) |
               (static_cast<std::uint32_t>(buf[off + 1]) << 8) |
               (static_cast<std::uint32_t>(buf[off + 2]) << 16) |
               (static_cast<std::uint32_t>(buf[off + 3]) << 24);
    };
    const auto tag_is = [&buf](std::size_t off, const char *tag) {
        for (std::size_t i = 0; i < 4; ++i) {
            if (buf[off + i] != static_cast<std::uint8_t>(tag[i])) return false;
        }
        return true;
    };

    // 12-byte RIFF header: "RIFF" <size> "WAVE".
    if (buf.size() < 12 || !tag_is(0, "RIFF") || !tag_is(8, "WAVE")) return false;

    bool have_fmt = false;
    std::uint32_t audio_format = 0;
    std::uint32_t channels = 0;
    std::uint32_t sample_rate = 0;
    std::uint32_t bits_per_sample = 0;

    std::size_t pos = 12;
    while (pos + 8 <= buf.size()) {
        const std::size_t chunk_size = u32(pos + 4);
        const std::size_t body = pos + 8;
        const std::size_t available = buf.size() - body;

        if (tag_is(pos, "fmt ")) {
            if (chunk_size < 16 || available < 16) return false;
            audio_format = u16(body);
            channels = u16(body + 2);
            sample_rate = u32(body + 4);
            bits_per_sample = u16(body + 14);
            have_fmt = true;
        } else if (tag_is(pos, "data")) {
            if (!have_fmt) return false;
            // Format tag 1 is integer PCM.
            if (audio_format != 1 || channels != 1 ||
                sample_rate != static_cast<std::uint32_t>(WHISPER_SAMPLE_RATE) ||
                bits_per_sample != 16) {
                std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n";
                return false;
            }

            // Never read past the end of the file, even if the declared chunk
            // size is larger than what is actually present.
            const std::size_t data_bytes = chunk_size < available ? chunk_size : available;
            const std::size_t num_samples = data_bytes / 2;

            pcmf32.resize(num_samples);
            for (std::size_t i = 0; i < num_samples; ++i) {
                const auto sample = static_cast<std::int16_t>(u16(body + 2 * i));
                pcmf32[i] = sample / 32768.0f;
            }
            return true;
        }

        // Chunks are 16-bit aligned: an odd-sized body is followed by a pad byte.
        const std::size_t padded = chunk_size + (chunk_size & 1u);
        if (padded > available) break;
        pos = body + padded;
    }
    return false;  // no "data" chunk found
}
// Transcribes one WAV file with whisper and types the result via `xdotool`.
//
// The file name must look like "en-<anything>.wav" or "es-<anything>.wav";
// other names are reported and ignored. The file is deleted only after
// xdotool reported success, so failed files stay in place.
//
// path: full path of the WAV file.
// ctx:  initialized whisper context used for inference.
void transcribe_file(const fs::path &path, struct whisper_context *ctx) {
    const std::string name = path.filename().string();

    // Compiled once: std::regex construction is expensive and the pattern is fixed.
    static const std::regex name_rx(R"(^(en|es)-(.+)\.wav$)");
    std::smatch m;
    if (!std::regex_match(name, m, name_rx)) {
        std::cerr << "Ignoring invalid file: " << name << "\n";
        return;
    }
    const std::string lang = m[1];

    if (!wait_for_file_complete(path)) {
        std::cerr << "File incomplete: " << name << "\n";
        return;
    }

    std::vector<float> pcmf32;
    if (!load_wav_mono(path.string(), pcmf32)) {
        std::cerr << "Failed to load: " << name << "\n";
        return;
    }

    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.print_progress = false;
    params.print_special = false;
    params.print_realtime = false;
    params.print_timestamps = false;
    // The audio language is always set to "es"; the "en" prefix only switches
    // translation on. NOTE(review): presumably "en-" means "spoken Spanish,
    // typed English" -- confirm this is the intended meaning of the prefix.
    params.translate = (lang == "en");
    params.language = "es";

    if (whisper_full(ctx, params, pcmf32.data(), static_cast<int>(pcmf32.size())) != 0) {
        std::cerr << "whisper_full() failed\n";
        return;
    }

    std::string transcription;
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        transcription += whisper_full_get_segment_text(ctx, i);
    }

    // Escape the characters that stay special inside a double-quoted shell
    // word, so the transcript is passed to xdotool literally.
    std::string escaped;
    escaped.reserve(transcription.size());
    for (const char c : transcription) {
        if (c == '"' || c == '$' || c == '`' || c == '\\') {
            escaped += '\\';
        }
        escaped += c;
    }

    // "--" ends option parsing, so text that begins with '-' is typed rather
    // than being interpreted as an xdotool option.
    const std::string cmd = "xdotool type -- \"" + escaped + "\"";
    if (std::system(cmd.c_str()) != 0) {
        std::cerr << "xdotool failed\n";
        return;
    }

    std::cout << "Typed: " << name << "\n";

    // Non-throwing overload: failing to delete the file is only worth a warning.
    std::error_code ec;
    fs::remove(path, ec);
    if (ec) {
        std::cerr << "Could not remove " << name << ": " << ec.message() << "\n";
    }
}
int main(int argc, char **argv) {
if (argc != 2) {
std::cerr << "usage: " << argv[0] << " <watch_dir>\n";
return 1;
}
const fs::path watch_dir = argv[1];
const std::string model_path = "./models/ggml-large-v3.bin";
if (!fs::exists(watch_dir) || !fs::is_directory(watch_dir)) {
std::cerr << "Invalid directory\n";
return 1;
}
struct whisper_context *ctx = whisper_init_from_file(model_path.c_str());
if (!ctx) {
std::cerr << "failed to load model\n";
return 2;
}
signal(SIGINT, signal_handler);
if (!inotifytools_initialize() || !inotifytools_watch_recursively(watch_dir.c_str(), IN_CLOSE_WRITE)) {
std::cerr << "inotify init failed\n";
return 3;
}
std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n";
while (running) {
const struct inotify_event *evt = inotifytools_next_event(-1);
if (!evt) continue;
fs::path fname = evt->name;
if (fname.extension() == ".wav") {
fs::path fullpath = watch_dir / fname;
transcribe_file(fullpath, ctx);
}
}
whisper_free(ctx);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment