josejuan · April 9, 2025 08:55
diff --git a/transcript.cpp b/transcript.cpp
 #include "whisper.h"

 #include <iostream>
 #include <vector>
 #include <fstream>
 #include <string>
 #include <filesystem>
 #include <thread>
 #include <chrono>
 #include <regex>
 #include <map>
 #include <csignal>
 #include <inotifytools/inotifytools.h>
 #include <sys/inotify.h>

 namespace fs = std::filesystem;

 #define WHISPER_SAMPLE_RATE 16000

 // Shutdown flag polled by the main watch loop (non-zero = keep running).
 // It is written from a signal handler, and volatile std::sig_atomic_t is the
 // only object type the standard guarantees can be safely stored to there.
 volatile std::sig_atomic_t running = 1;

 // Signal handler: requests shutdown by clearing the flag. It does nothing
 // else on purpose -- only async-signal-safe work is allowed in a handler.
 void signal_handler(int) {
     running = 0;
 }

 // Polls the size of `path` every 500 ms until two consecutive readings match.
 //
 // path    - file to watch.
 // retries - maximum number of size polls before giving up.
 //
 // Returns true as soon as the size stops changing, false if it is still
 // changing after `retries` polls or if the size cannot be read (for example
 // the file was removed in the meantime). Never throws for filesystem errors.
 //
 // The baseline reading is 0, so an empty file is reported as complete on the
 // first poll without sleeping.
 bool wait_for_file_complete(const fs::path &path, int retries = 10) {
     using namespace std::chrono_literals;
     std::uintmax_t last_size = 0;

     for (int i = 0; i < retries; ++i) {
         // Non-throwing overload: a vanished or unreadable file must not take
         // the whole watcher down with an uncaught filesystem_error.
         std::error_code ec;
         const std::uintmax_t cur_size = fs::file_size(path, ec);
         if (ec)
             return false;
         if (cur_size == last_size)
             return true;
         last_size = cur_size;
         std::this_thread::sleep_for(500ms);
     }

     return false;
 }

 // Loads a mono, 16-bit, WHISPER_SAMPLE_RATE WAV file into normalized floats.
 //
 // path   - file to read.
 // pcmf32 - output; resized to the number of samples and filled with values
 //          in [-1, 1) (each 16-bit sample divided by 32768).
 //
 // Returns false if the file cannot be opened or fully read, is shorter than
 // the 44-byte header, or is not mono / 16-bit / WHISPER_SAMPLE_RATE.
 //
 // Limitation: the header is assumed to be the canonical 44-byte layout; the
 // RIFF chunk list is not walked, so files with extra chunks before the sample
 // data will have those chunk bytes decoded as audio.
 bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) {
     constexpr std::size_t kHeaderSize = 44;

     // Open positioned at the end so tellg() yields the file size directly.
     std::ifstream in(path, std::ios::binary | std::ios::ate);
     if (!in) return false;

     // tellg() reports -1 on failure, which is also rejected by this check.
     const std::streamoff len = in.tellg();
     if (len < static_cast<std::streamoff>(kHeaderSize)) return false;
     in.seekg(0, std::ios::beg);

     std::vector<uint8_t> buf(static_cast<std::size_t>(len));
     in.read(reinterpret_cast<char *>(buf.data()), len);
     // A short read would otherwise be decoded as trailing silence.
     if (in.gcount() != len) return false;

     // Little-endian field readers. Everything is widened to unsigned before
     // shifting so a high byte >= 0x80 cannot overflow a signed int.
     const auto read_u16 = [&buf](std::size_t off) -> uint32_t {
         return static_cast<uint32_t>(buf[off]) |
                (static_cast<uint32_t>(buf[off + 1]) << 8);
     };
     const auto read_u32 = [&read_u16](std::size_t off) -> uint32_t {
         return read_u16(off) | (read_u16(off + 2) << 16);
     };

     const uint32_t channels = read_u16(22);
     const uint32_t sample_rate = read_u32(24);
     const uint32_t bits_per_sample = read_u16(34);

     if (channels != 1 || sample_rate != static_cast<uint32_t>(WHISPER_SAMPLE_RATE) ||
         bits_per_sample != 16) {
         std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n";
         return false;
     }

     const std::size_t num_samples = (buf.size() - kHeaderSize) / 2;
     pcmf32.resize(num_samples);

     for (std::size_t i = 0; i < num_samples; ++i) {
         const auto s = static_cast<int16_t>(read_u16(kHeaderSize + 2 * i));
         pcmf32[i] = s / 32768.0f;
     }

     return true;
 }

 // Transcribes one dropped WAV file and types the result with xdotool.
 //
 // path - full path of the file; its name must look like "en-<x>.wav" or
 //        "es-<x>.wav", anything else is reported and ignored.
 // ctx  - loaded whisper context used for inference.
 //
 // The file is deleted only after xdotool reports success, so a failed run
 // leaves it in place. All failures are reported on stderr; nothing is
 // returned to the caller.
 void transcribe_file(const fs::path &path, struct whisper_context *ctx) {
     // Compiled once: building a std::regex is costly and the pattern is fixed.
     static const std::regex rx(R"(^(en|es)-(.+)\.wav$)");

     const std::string name = path.filename().string();
     std::smatch m;

     if (!std::regex_match(name, m, rx)) {
         std::cerr << "Ignoring invalid file: " << name << "\n";
         return;
     }

     const std::string lang = m[1];
     std::vector<float> pcmf32;

     if (!wait_for_file_complete(path)) {
         std::cerr << "File incomplete: " << name << "\n";
         return;
     }

     if (!load_wav_mono(path.string(), pcmf32)) {
         std::cerr << "Failed to load: " << name << "\n";
         return;
     }

     whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
     params.print_progress = false;
     params.print_special  = false;
     params.print_realtime = false;
     params.print_timestamps = false;
     // NOTE(review): the language is always "es" and the "en-" prefix only
     // switches translation on, i.e. the prefix appears to select the output
     // language rather than the spoken one -- confirm this is intended.
     params.translate = (lang == "en");
     params.language = "es";

     if (whisper_full(ctx, params, pcmf32.data(), static_cast<int>(pcmf32.size())) != 0) {
         std::cerr << "whisper_full() failed\n";
         return;
     }

     std::string transcription;
     const int n_segments = whisper_full_n_segments(ctx);
     for (int i = 0; i < n_segments; ++i) {
         transcription += whisper_full_get_segment_text(ctx, i);
     }

     // Escape the characters that stay special inside a double-quoted shell
     // word so the transcription cannot break out of the quotes.
     std::string escaped;
     escaped.reserve(transcription.size());
     for (const char c : transcription) {
         if (c == '"' || c == '$' || c == '`' || c == '\\')
             escaped += '\\';
         escaped += c;
     }

     // "--" ends option parsing, so text starting with '-' is typed instead of
     // being interpreted as an xdotool option.
     const std::string cmd = "xdotool type -- \"" + escaped + "\"";
     const int ret = std::system(cmd.c_str());

     if (ret != 0) {
         std::cerr << "xdotool failed\n";
     } else {
         std::cout << "Typed: " << name << "\n";
         // Non-throwing overload: a failed delete is reported, not fatal.
         std::error_code ec;
         fs::remove(path, ec);
         if (ec)
             std::cerr << "Failed to remove " << name << ": " << ec.message() << "\n";
     }
 }

 int main(int argc, char **argv) {
    if (argc != 2) {
        std::cerr << "usage: " << argv[0] << " <watch_dir>\n";
        return 1;
    }

    const fs::path watch_dir = argv[1];
    const std::string model_path = "./models/ggml-large-v3.bin";

    if (!fs::exists(watch_dir) || !fs::is_directory(watch_dir)) {
        std::cerr << "Invalid directory\n";
        return 1;
    }

    struct whisper_context *ctx = whisper_init_from_file(model_path.c_str());
    if (!ctx) {
        std::cerr << "failed to load model\n";
        return 2;
    }

    signal(SIGINT, signal_handler);

    if (!inotifytools_initialize() || !inotifytools_watch_recursively(watch_dir.c_str(), IN_CLOSE_WRITE)) {
        std::cerr << "inotify init failed\n";
        return 3;
    }

    std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n";

    while (running) {
        const struct inotify_event *evt = inotifytools_next_event(-1);
        if (!evt) continue;

        fs::path fname = evt->name;
        if (fname.extension() == ".wav") {
            fs::path fullpath = watch_dir / fname;
            transcribe_file(fullpath, ctx);
        }
    }

    whisper_free(ctx);
    return 0;
 }
	#include "whisper.h"

	#include <iostream>
	#include <vector>
	#include <fstream>
	#include <string>
	#include <filesystem>
	#include <thread>
	#include <chrono>
	#include <regex>
	#include <map>
	#include <csignal>
	#include <inotifytools/inotifytools.h>
	#include <sys/inotify.h>

	namespace fs = std::filesystem;

	#define WHISPER_SAMPLE_RATE 16000

	// Shutdown flag polled by the main watch loop (non-zero = keep running).
	// It is written from a signal handler, and volatile std::sig_atomic_t is the
	// only object type the standard guarantees can be safely stored to there.
	volatile std::sig_atomic_t running = 1;

	// Signal handler: requests shutdown by clearing the flag. It does nothing
	// else on purpose -- only async-signal-safe work is allowed in a handler.
	void signal_handler(int) {
	    running = 0;
	}

	// Polls the size of `path` every 500 ms until two consecutive readings match.
	//
	// path    - file to watch.
	// retries - maximum number of size polls before giving up.
	//
	// Returns true as soon as the size stops changing, false if it is still
	// changing after `retries` polls or if the size cannot be read (for example
	// the file was removed in the meantime). Never throws for filesystem errors.
	//
	// The baseline reading is 0, so an empty file is reported as complete on the
	// first poll without sleeping.
	bool wait_for_file_complete(const fs::path &path, int retries = 10) {
	    using namespace std::chrono_literals;
	    std::uintmax_t last_size = 0;

	    for (int i = 0; i < retries; ++i) {
	        // Non-throwing overload: a vanished or unreadable file must not take
	        // the whole watcher down with an uncaught filesystem_error.
	        std::error_code ec;
	        const std::uintmax_t cur_size = fs::file_size(path, ec);
	        if (ec)
	            return false;
	        if (cur_size == last_size)
	            return true;
	        last_size = cur_size;
	        std::this_thread::sleep_for(500ms);
	    }

	    return false;
	}

	// Loads a mono, 16-bit, WHISPER_SAMPLE_RATE WAV file into normalized floats.
	//
	// path   - file to read.
	// pcmf32 - output; resized to the number of samples and filled with values
	//          in [-1, 1) (each 16-bit sample divided by 32768).
	//
	// Returns false if the file cannot be opened or fully read, is shorter than
	// the 44-byte header, or is not mono / 16-bit / WHISPER_SAMPLE_RATE.
	//
	// Limitation: the header is assumed to be the canonical 44-byte layout; the
	// RIFF chunk list is not walked, so files with extra chunks before the sample
	// data will have those chunk bytes decoded as audio.
	bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) {
	    constexpr std::size_t kHeaderSize = 44;

	    // Open positioned at the end so tellg() yields the file size directly.
	    std::ifstream in(path, std::ios::binary | std::ios::ate);
	    if (!in) return false;

	    // tellg() reports -1 on failure, which is also rejected by this check.
	    const std::streamoff len = in.tellg();
	    if (len < static_cast<std::streamoff>(kHeaderSize)) return false;
	    in.seekg(0, std::ios::beg);

	    std::vector<uint8_t> buf(static_cast<std::size_t>(len));
	    in.read(reinterpret_cast<char *>(buf.data()), len);
	    // A short read would otherwise be decoded as trailing silence.
	    if (in.gcount() != len) return false;

	    // Little-endian field readers. Everything is widened to unsigned before
	    // shifting so a high byte >= 0x80 cannot overflow a signed int.
	    const auto read_u16 = [&buf](std::size_t off) -> uint32_t {
	        return static_cast<uint32_t>(buf[off]) |
	               (static_cast<uint32_t>(buf[off + 1]) << 8);
	    };
	    const auto read_u32 = [&read_u16](std::size_t off) -> uint32_t {
	        return read_u16(off) | (read_u16(off + 2) << 16);
	    };

	    const uint32_t channels = read_u16(22);
	    const uint32_t sample_rate = read_u32(24);
	    const uint32_t bits_per_sample = read_u16(34);

	    if (channels != 1 || sample_rate != static_cast<uint32_t>(WHISPER_SAMPLE_RATE) ||
	        bits_per_sample != 16) {
	        std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n";
	        return false;
	    }

	    const std::size_t num_samples = (buf.size() - kHeaderSize) / 2;
	    pcmf32.resize(num_samples);

	    for (std::size_t i = 0; i < num_samples; ++i) {
	        const auto s = static_cast<int16_t>(read_u16(kHeaderSize + 2 * i));
	        pcmf32[i] = s / 32768.0f;
	    }

	    return true;
	}

	// Transcribes one dropped WAV file and types the result with xdotool.
	//
	// path - full path of the file; its name must look like "en-<x>.wav" or
	//        "es-<x>.wav", anything else is reported and ignored.
	// ctx  - loaded whisper context used for inference.
	//
	// The file is deleted only after xdotool reports success, so a failed run
	// leaves it in place. All failures are reported on stderr; nothing is
	// returned to the caller.
	void transcribe_file(const fs::path &path, struct whisper_context *ctx) {
	    // Compiled once: building a std::regex is costly and the pattern is fixed.
	    static const std::regex rx(R"(^(en|es)-(.+)\.wav$)");

	    const std::string name = path.filename().string();
	    std::smatch m;

	    if (!std::regex_match(name, m, rx)) {
	        std::cerr << "Ignoring invalid file: " << name << "\n";
	        return;
	    }

	    const std::string lang = m[1];
	    std::vector<float> pcmf32;

	    if (!wait_for_file_complete(path)) {
	        std::cerr << "File incomplete: " << name << "\n";
	        return;
	    }

	    if (!load_wav_mono(path.string(), pcmf32)) {
	        std::cerr << "Failed to load: " << name << "\n";
	        return;
	    }

	    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
	    params.print_progress = false;
	    params.print_special = false;
	    params.print_realtime = false;
	    params.print_timestamps = false;
	    // NOTE(review): the language is always "es" and the "en-" prefix only
	    // switches translation on, i.e. the prefix appears to select the output
	    // language rather than the spoken one -- confirm this is intended.
	    params.translate = (lang == "en");
	    params.language = "es";

	    if (whisper_full(ctx, params, pcmf32.data(), static_cast<int>(pcmf32.size())) != 0) {
	        std::cerr << "whisper_full() failed\n";
	        return;
	    }

	    std::string transcription;
	    const int n_segments = whisper_full_n_segments(ctx);
	    for (int i = 0; i < n_segments; ++i) {
	        transcription += whisper_full_get_segment_text(ctx, i);
	    }

	    // Escape the characters that stay special inside a double-quoted shell
	    // word so the transcription cannot break out of the quotes.
	    std::string escaped;
	    escaped.reserve(transcription.size());
	    for (const char c : transcription) {
	        if (c == '"' || c == '$' || c == '`' || c == '\\')
	            escaped += '\\';
	        escaped += c;
	    }

	    // "--" ends option parsing, so text starting with '-' is typed instead of
	    // being interpreted as an xdotool option.
	    const std::string cmd = "xdotool type -- \"" + escaped + "\"";
	    const int ret = std::system(cmd.c_str());

	    if (ret != 0) {
	        std::cerr << "xdotool failed\n";
	    } else {
	        std::cout << "Typed: " << name << "\n";
	        // Non-throwing overload: a failed delete is reported, not fatal.
	        std::error_code ec;
	        fs::remove(path, ec);
	        if (ec)
	            std::cerr << "Failed to remove " << name << ": " << ec.message() << "\n";
	    }
	}

	int main(int argc, char **argv) {
	if (argc != 2) {
	std::cerr << "usage: " << argv[0] << " <watch_dir>\n";
	return 1;
	}

	const fs::path watch_dir = argv[1];
	const std::string model_path = "./models/ggml-large-v3.bin";

	if (!fs::exists(watch_dir) \|\| !fs::is_directory(watch_dir)) {
	std::cerr << "Invalid directory\n";
	return 1;
	}

	struct whisper_context *ctx = whisper_init_from_file(model_path.c_str());
	if (!ctx) {
	std::cerr << "failed to load model\n";
	return 2;
	}

	signal(SIGINT, signal_handler);

	if (!inotifytools_initialize() \|\| !inotifytools_watch_recursively(watch_dir.c_str(), IN_CLOSE_WRITE)) {
	std::cerr << "inotify init failed\n";
	return 3;
	}

	std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n";

	while (running) {
	const struct inotify_event *evt = inotifytools_next_event(-1);
	if (!evt) continue;

	fs::path fname = evt->name;
	if (fname.extension() == ".wav") {
	fs::path fullpath = watch_dir / fname;
	transcribe_file(fullpath, ctx);
	}
	}

	whisper_free(ctx);
	return 0;
	}