Last active
June 24, 2026 11:00
-
-
Save 0chroma/db735f95bf47e4b822463935e6c997ec to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| set -euo pipefail | |
| # tuning info: https://carteakey.dev/blog/local-inference/local-llm-optimization/ | |
| # dependencies: podman | |
#######################################
# Report the largest VRAM size found in sysfs (zero dependencies).
# Scans every card*/device/mem_info_vram_total file below the DRM class
# directory, keeps the largest byte count and prints it converted to
# whole GiB (integer division, so the result is rounded down).
# Arguments: $1 - directory to scan (optional, default: /sys/class/drm)
# Outputs:   VRAM size in GiB on stdout; 0 when no usable file is found
# Returns:   0
#######################################
get_max_vram_gb() {
  local drm_root=${1:-/sys/class/drm}
  local vram_file bytes
  local max_bytes=0

  for vram_file in "$drm_root"/card*/device/mem_info_vram_total; do
    # An unmatched glob stays literal, so make sure the path really exists.
    [[ -f "$vram_file" ]] || continue
    bytes=$(<"$vram_file") || continue
    # Ignore empty or non-numeric content instead of letting the arithmetic
    # below treat it as a variable name (fatal under `set -u`).
    [[ "$bytes" =~ ^[0-9]+$ ]] || continue
    # Force base 10 so a leading zero is never read as octal.
    if (( 10#$bytes > max_bytes )); then
      max_bytes=$(( 10#$bytes ))
    fi
  done

  echo $(( max_bytes / 1024 / 1024 / 1024 ))
}
# Largest VRAM size (whole GiB) among the detected GPUs.
VRAM_GB="$(get_max_vram_gb)"

# Both tiers use the same quantisation, so it is set once.
MODEL_QUANT="UD-Q4_K_XL"

# Start from the small-VRAM configuration (no custom chat template) ...
# TODO: try qwen 3.5 9b instead?
MODEL_NAME="gemma-4-12b-it"
MODEL_REPO="gemma-4-12b-it-GGUF"
CUSTOM_TEMPLATE_URL=""

# ... and switch to the larger model when at least 22 GiB is available.
if (( VRAM_GB >= 22 )); then
  MODEL_NAME="Qwen3.6-27B"
  MODEL_REPO="${MODEL_NAME}-MTP-GGUF"
  CUSTOM_TEMPLATE_URL="https://gist.githubusercontent.com/jscott3201/e4b155885cc68c038d6ac8909a3bd9fe/raw/08b4cce5971d4075591204995fdd7a17e29826fc/custom_pub_chat_template_qwen36.jinja"
fi

# Derived names and download locations.
MODEL_NAME_QUANT="${MODEL_NAME}-${MODEL_QUANT}"
MODEL_URL="https://huggingface.co/unsloth/${MODEL_REPO}/resolve/main/${MODEL_NAME_QUANT}.gguf"
MMPROJ_URL="https://huggingface.co/unsloth/${MODEL_REPO}/resolve/main/mmproj-BF16.gguf"

# Named podman volume that stores the models.
VOLUME_NAME="llm-models"

# Container image to run.
# last working version was -b9570
LLAMA_IMAGE="ghcr.io/ggml-org/llama.cpp:server-vulkan"
# Create the named volume on first use; inspect's output is not needed,
# only its exit status.
if ! podman volume inspect "$VOLUME_NAME" >/dev/null 2>&1; then
  echo "Volume '$VOLUME_NAME' not found. Creating..."
  podman volume create "$VOLUME_NAME"
fi

# Host-side directory backing the volume, as reported by podman.
MOUNT_PATH=$(podman volume inspect --format '{{.Mountpoint}}' "$VOLUME_NAME")

# Each model gets its own sub-directory named after model + quantisation;
# the preset file sits at the root of the volume.
MODEL_DIR="${MOUNT_PATH}/${MODEL_NAME_QUANT}"
MODEL_FILE="${MODEL_DIR}/${MODEL_NAME_QUANT}.gguf"
MMPROJ_FILE="${MODEL_DIR}/mmproj-BF16.gguf"
PRESET_FILE="${MOUNT_PATH}/models-preset.ini"
#######################################
# Download a URL to a destination path.
# The transfer is written to "<dest>.part" and renamed only after curl
# succeeds, so a failed or interrupted download never leaves a truncated
# file that a later run would mistake for a complete one. --fail makes
# curl return an error on HTTP 4xx/5xx instead of saving the error page.
# Arguments: $1 - source URL
#            $2 - destination file path
# Outputs:   curl progress bar; an error message on stderr on failure
# Returns:   0 on success, 1 when the download failed
#######################################
download_file() {
  local url=$1
  local dest=$2
  local partial="${dest}.part"

  mkdir -p -- "$(dirname -- "$dest")"
  if ! curl -# --fail -L -o "$partial" "$url"; then
    rm -f -- "$partial"
    echo "Download failed: $url" >&2
    return 1
  fi
  mv -f -- "$partial" "$dest"
}

# Model weights.
if [[ ! -f "$MODEL_FILE" ]]; then
  echo "Model file '$MODEL_NAME_QUANT.gguf' not found. Downloading..."
  download_file "$MODEL_URL" "$MODEL_FILE" || exit 1
  echo "Model downloaded successfully."
else
  echo "Model file '$MODEL_NAME_QUANT.gguf' already exists."
fi

# Multimodal projector: checked on its own so that a missing or previously
# failed mmproj download is retried even when the model file is present.
if [[ ! -f "$MMPROJ_FILE" ]]; then
  echo "mmproj file not found. Downloading..."
  download_file "$MMPROJ_URL" "$MMPROJ_FILE" || exit 1
  echo "mmproj downloaded successfully."
fi

# Optional custom chat template (only configured for some models).
if [[ -n "$CUSTOM_TEMPLATE_URL" ]]; then
  CUSTOM_TEMPLATE_PATH="$MODEL_DIR/chat_template.jinja"
  if [[ ! -f "$CUSTOM_TEMPLATE_PATH" ]]; then
    echo "Custom chat template not found. Downloading..."
    download_file "$CUSTOM_TEMPLATE_URL" "$CUSTOM_TEMPLATE_PATH" || exit 1
    echo "Chat template downloaded."
  fi
fi
# Generate models-preset.ini for per-model configuration.
# The sampling settings are always written; the global [*] section and the
# chat-template-file entry are added only when a custom template is in use.
{
  if [[ -n "$CUSTOM_TEMPLATE_URL" ]]; then
    printf '%s\n' \
      '[*]' \
      'chat-template-kwargs = {"preserve_thinking_enabled": true}'
  fi

  printf '%s\n' "[${MODEL_NAME_QUANT}]"

  if [[ -n "$CUSTOM_TEMPLATE_URL" ]]; then
    # Path as seen from inside the container, where the volume is /models.
    printf '%s\n' "chat-template-file = /models/${MODEL_NAME_QUANT}/chat_template.jinja"
  fi

  printf '%s\n' \
    'temp = 0.6' \
    'top-p = 0.95' \
    'top-k = 20' \
    'min-p = 0.00'
} > "$PRESET_FILE"
# 128k ctx fits but leaves <500MB VRAM headroom; 96k is a safer balance
CTX_SIZE=98304

# Options consumed by podman itself (everything before the image name).
podman_opts=(
  --name llama.cpp
  --pull newer
  --detach
  --replace
  --group-add keep-groups
  --shm-size 16G
  --ulimit memlock=-1:-1
  --security-opt label=type:container_runtime_t
  --device /dev/kfd
  --device /dev/dri
  -v "$VOLUME_NAME":/models
  -p 8080:8080
)

# Arguments handed to the server inside the container (after the image name).
server_args=(
  --models-dir /models
  --models-preset /models/models-preset.ini
  --parallel 1
  --gpu-layers all
  --mlock
  --direct-io
  --cache-reuse 128
  --sleep-idle-seconds 1200
  --ctx-size "$CTX_SIZE"
  --cache-type-k q8_0
  --cache-type-v q8_0
  --spec-type draft-mtp
  --spec-draft-n-max 2
)

# Start (or replace) the detached container.
podman run "${podman_opts[@]}" "$LLAMA_IMAGE" "${server_args[@]}"
# Set up a systemd *user* service that starts the existing container.
# The unit is written only once; an existing file is left untouched.
SYSTEMD_DIR="$HOME/.config/systemd/user"
SERVICE_FILE="$SYSTEMD_DIR/llamacpp.service"
if [[ ! -f "$SERVICE_FILE" ]]; then
  echo "Systemd service not found. Creating..."
  mkdir -p "$SYSTEMD_DIR"
  # Quoted delimiter: the unit text is written literally, without expansion.
  cat > "$SERVICE_FILE" << 'SERVICE_EOF'
[Unit]
Description=llama.cpp Inference Server
Documentation=https://github.com/ggml-org/llama.cpp
After=network-online.target
Wants=network-online.target
[Service]
Type=forking
RemainAfterExit=yes
ExecStart=/usr/bin/podman start llama.cpp
ExecStop=/usr/bin/podman stop llama.cpp
Restart=on-failure
RestartSec=10
[Install]
WantedBy=default.target
SERVICE_EOF
  systemctl --user daemon-reload
  systemctl --user enable llamacpp.service
  echo "Systemd service created and enabled."
else
  echo "Systemd service already exists."
fi

# A user unit is started when the user's systemd instance starts, which
# normally happens at first login. To have it come up at boot the account
# needs lingering enabled, so check for it and turn it on when it is missing.
# Failure here is reported but not fatal: the container is already running.
if command -v loginctl >/dev/null 2>&1; then
  SERVICE_USER="$(id -un)"
  LINGER_STATE="$(loginctl show-user "$SERVICE_USER" --property=Linger --value 2>/dev/null || true)"
  if [[ "$LINGER_STATE" != "yes" ]]; then
    echo "Enabling lingering for '$SERVICE_USER' so the service starts at boot..."
    if ! loginctl enable-linger "$SERVICE_USER"; then
      echo "Warning: could not enable lingering; the service will start at login instead of at boot." >&2
    fi
  fi
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment