Created
April 22, 2026 19:59
-
-
Save tkafka/9b7c66775e8755647196b011a43600eb to your computer and use it in GitHub Desktop.
Convert whisper-large-v3-czech-cv13 to ggml for VoiceInk
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Convert mikr/whisper-large-v3-czech-cv13 (HuggingFace Safetensors checkpoint)
# into the GGML .bin format that VoiceInk can import.
#
# Everything required is provisioned by the script itself:
#   - a shallow clone of ggml-org/whisper.cpp (supplies convert-h5-to-ggml.py)
#   - a shallow clone of openai/whisper (vocab assets the converter reads)
#   - a Python venv with torch, transformers and numpy
#
# Usage: bash convert-to-voiceink.sh
set -euo pipefail

# Working directory where clones, the venv and the final .bin end up.
STAGING="$HOME/Downloads/whisper"
# HuggingFace repository to convert, and the name of the produced artifact.
MODEL_ID="mikr/whisper-large-v3-czech-cv13"
OUTPUT_NAME="whisper-large-v3-czech-cv13.bin"
# Local checkout directory for the model repo (relative to $STAGING).
MODEL_DIR="model"
# Files the conversion needs; used by the `hf download` fallback path.
MODEL_FILES=(
  added_tokens.json
  config.json
  generation_config.json
  merges.txt
  model.safetensors
  normalizer.json
  preprocessor_config.json
  special_tokens_map.json
  tokenizer.json
  tokenizer_config.json
  vocab.json
)
# Clone (or fast-forward) the HuggingFace model repo via plain Git, then
# fetch only model.safetensors through Git LFS.
# Globals: MODEL_DIR (checkout path, read), MODEL_ID (HF repo slug, read)
clone_or_update_model_repo() {
  if [ ! -d "$MODEL_DIR/.git" ]; then
    # A non-git leftover (e.g. from a previous `hf download`) would make
    # `git clone` fail, so clear it first.
    rm -rf "$MODEL_DIR"
    echo "==> Cloning $MODEL_ID via Git..."
    GIT_LFS_SKIP_SMUDGE=1 git clone --depth=1 "https://huggingface.co/$MODEL_ID" "$MODEL_DIR"
  else
    echo "==> Model Git repo already exists, refreshing..."
    # Skip LFS smudge on pull too; otherwise refreshing an existing clone
    # would download EVERY LFS object in the repo instead of only the one
    # file we selectively pull below.
    GIT_LFS_SKIP_SMUDGE=1 git -C "$MODEL_DIR" pull --ff-only
  fi
  echo "==> Downloading model.safetensors via Git LFS..."
  git -C "$MODEL_DIR" lfs pull --include=model.safetensors
  git -C "$MODEL_DIR" lfs checkout model.safetensors
}
# Remove stale *.lock files that an interrupted download left in a model
# directory's HuggingFace cache — but only when no hf / huggingface-cli
# process is currently running (a live downloader owns its locks).
# Arguments: $1 - local model directory that hosts .cache/huggingface
clear_stale_hf_locks() {
  local hf_cache
  hf_cache="$1/.cache/huggingface"
  # Nothing to clean if the cache was never created.
  [ -d "$hf_cache" ] || return 0
  # Locks belonging to an active downloader are live, not stale.
  if pgrep -f "(^|/)hf($| )|(^|/)huggingface-cli($| )" >/dev/null 2>&1; then
    return 0
  fi
  find "$hf_cache" -name '*.lock' -type f -delete
}
echo "==> Staging folder: $STAGING"
mkdir -p "$STAGING"
cd "$STAGING"

# ── Clone whisper.cpp (provides convert-h5-to-ggml.py) ───────────────────────
if [ ! -d "whisper.cpp" ]; then
  echo "==> Cloning ggml-org/whisper.cpp..."
  git clone --depth=1 https://github.com/ggml-org/whisper.cpp
else
  echo "==> whisper.cpp already cloned, skipping."
fi

# ── Clone openai/whisper source (vocab assets required by conversion script) ──
if [ ! -d "openai-whisper" ]; then
  echo "==> Cloning openai/whisper..."
  git clone --depth=1 https://github.com/openai/whisper openai-whisper
else
  echo "==> openai-whisper already cloned, skipping."
fi

# ── Python virtual environment ────────────────────────────────────────────────
if [ ! -d "venv" ]; then
  echo "==> Creating Python venv..."
  python3 -m venv venv
fi
# shellcheck disable=SC1091
source venv/bin/activate
echo "==> Installing Python dependencies (torch, transformers, numpy)..."
pip install --quiet --upgrade pip
pip install --quiet torch transformers numpy

# ── Download HuggingFace model ────────────────────────────────────────────────
if [ ! -f "$OUTPUT_NAME" ]; then
  echo "==> Ensuring $MODEL_ID is fully downloaded..."
  if command -v git-lfs >/dev/null 2>&1; then
    clone_or_update_model_repo
  elif command -v hf >/dev/null 2>&1; then
    # Fallback: selective download via the HuggingFace CLI.
    mkdir -p "$MODEL_DIR"
    clear_stale_hf_locks "$MODEL_DIR"
    hf download "$MODEL_ID" \
      --local-dir "./$MODEL_DIR" \
      --max-workers 4 \
      "${MODEL_FILES[@]}"
  else
    # Neither downloader is available — fail with an actionable message
    # instead of the bare "command not found" set -e would produce.
    echo "ERROR: need either git-lfs or the 'hf' CLI to download the model." >&2
    echo "       Install one, e.g. 'brew install git-lfs' or 'pip install huggingface_hub[cli]'." >&2
    exit 1
  fi
else
  echo "==> Output already exists, skipping model download."
fi

# ── Convert HF Safetensors → GGML ────────────────────────────────────────────
if [ ! -f "$OUTPUT_NAME" ]; then
  echo "==> Converting to GGML format..."
  # Remove partial outputs of a previous failed run so stale files never
  # masquerade as a fresh conversion result.
  rm -f ggml-model.bin ggml-model-f32.bin
  python3 ./whisper.cpp/models/convert-h5-to-ggml.py "./$MODEL_DIR/" ./openai-whisper .
  # ── Rename to final output name ───────────────────────────────────────────
  mv ggml-model.bin "$OUTPUT_NAME"
else
  echo "==> Output already exists, skipping conversion."
fi

echo ""
echo "✓ Done! Model saved at:"
echo "  $STAGING/$OUTPUT_NAME"
echo ""
echo "To import into VoiceInk:"
echo "  AI Models → Local tab → Import Local Model… → select $OUTPUT_NAME"
echo "  Then: Set as Default"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment