Skip to content

Instantly share code, notes, and snippets.

@0chroma
Last active June 24, 2026 11:00
Show Gist options
  • Select an option

  • Save 0chroma/db735f95bf47e4b822463935e6c997ec to your computer and use it in GitHub Desktop.

Select an option

Save 0chroma/db735f95bf47e4b822463935e6c997ec to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Launches a llama.cpp server container with podman.
#
# Steps performed, in order:
#   1. Read the largest VRAM size reported under /sys/class/drm and pick a
#      model accordingly.
#   2. Ensure the podman volume that stores models exists.
#   3. Download the model, its mmproj file and (optionally) a custom chat
#      template into that volume with curl.
#   4. Write models-preset.ini with per-model sampling settings.
#   5. (Re)create the "llama.cpp" container, publishing port 8080.
#   6. Install and enable a systemd user unit that starts the container.
#
# Strict mode: abort on command failure, unset variables and failed pipeline
# stages.
set -euo pipefail
# tuning info: https://carteakey.dev/blog/local-inference/local-llm-optimization/
# dependencies: podman, curl
#######################################
# Probe VRAM from sysfs (zero dependencies).
#
# Scans <root>/card*/device/mem_info_vram_total, takes the largest byte count
# found and prints it converted to whole GiB (integer division, rounds down).
# Files that are missing, unreadable or do not contain a plain decimal number
# are skipped.
#
# Arguments: $1 - sysfs DRM root to scan (optional, default /sys/class/drm)
# Outputs:   largest VRAM size in GiB on stdout; 0 when nothing usable is found
# Returns:   0
#######################################
get_max_vram_gb() {
  local drm_root=${1:-/sys/class/drm}
  local max_bytes=0
  local vram_file bytes

  for vram_file in "$drm_root"/card*/device/mem_info_vram_total; do
    # An unmatched glob stays literal, so this also covers "no cards at all".
    [[ -f "$vram_file" && -r "$vram_file" ]] || continue
    bytes=$(<"$vram_file") || continue
    # Only plain digits may reach arithmetic context; anything else would be
    # evaluated as a variable name or an expression.
    [[ "$bytes" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if (( 10#$bytes > max_bytes )); then
      max_bytes=$(( 10#$bytes ))
    fi
  done

  printf '%s\n' "$(( max_bytes / 1024 / 1024 / 1024 ))"
}
# Pick the model from the largest VRAM size (GiB) that was detected.
VRAM_GB=$(get_max_vram_gb)

# Both candidates use the same quantization.
MODEL_QUANT="UD-Q4_K_XL"

if [[ "$VRAM_GB" -lt 22 ]]; then
  # Less than 22 GiB: smaller model, no custom chat template.
  # TODO: try qwen 3.5 9b instead?
  MODEL_NAME="gemma-4-12b-it"
  MODEL_REPO="gemma-4-12b-it-GGUF"
  CUSTOM_TEMPLATE_URL=""
else
  # 22 GiB or more: larger model from the MTP repo, with a custom chat template.
  MODEL_NAME="Qwen3.6-27B"
  MODEL_REPO="${MODEL_NAME}-MTP-GGUF"
  CUSTOM_TEMPLATE_URL="https://gist.githubusercontent.com/jscott3201/e4b155885cc68c038d6ac8909a3bd9fe/raw/08b4cce5971d4075591204995fdd7a17e29826fc/custom_pub_chat_template_qwen36.jinja"
fi

# "<name>-<quant>" is used for the file name, the model directory and the
# preset section name.
MODEL_NAME_QUANT="${MODEL_NAME}-${MODEL_QUANT}"

# Download locations inside the selected Hugging Face repository.
MODEL_URL="https://huggingface.co/unsloth/${MODEL_REPO}/resolve/main/${MODEL_NAME_QUANT}.gguf"
MMPROJ_URL="https://huggingface.co/unsloth/${MODEL_REPO}/resolve/main/mmproj-BF16.gguf"
# Container image that provides the llama.cpp server.
# last working version was -b9570
LLAMA_IMAGE="ghcr.io/ggml-org/llama.cpp:server-vulkan"

# Named podman volume that holds the models and the preset file.
VOLUME_NAME="llm-models"

# Create the volume on first use; inspect fails when it does not exist yet.
podman volume inspect "$VOLUME_NAME" >/dev/null 2>&1 || {
  echo "Volume '$VOLUME_NAME' not found. Creating..."
  podman volume create "$VOLUME_NAME"
}

# Host-side location of the volume; everything below is written through it.
MOUNT_PATH=$(podman volume inspect --format '{{.Mountpoint}}' "$VOLUME_NAME")

PRESET_FILE="${MOUNT_PATH}/models-preset.ini"
MODEL_DIR="${MOUNT_PATH}/${MODEL_NAME_QUANT}"
MODEL_FILE="${MODEL_DIR}/${MODEL_NAME_QUANT}.gguf"
MMPROJ_FILE="${MODEL_DIR}/mmproj-BF16.gguf"
#######################################
# Download one model asset so that the destination only ever holds a
# complete file.
#
# The data is written to "<dest>.part" and renamed into place after curl
# succeeds. --fail makes curl return an error on HTTP failures (e.g. 404)
# instead of saving the server's error page as the asset.
#
# Arguments: $1 - source URL
#            $2 - destination path
# Returns:   0 on success, 1 if the download failed (partial file removed)
#######################################
fetch_model_asset() {
  local url=$1
  local dest=$2
  local part="${dest}.part"

  mkdir -p -- "${dest%/*}"
  if ! curl -# -fL -o "$part" "$url"; then
    rm -f -- "$part"
    echo "Download failed: $url" >&2
    return 1
  fi
  mv -f -- "$part" "$dest"
}

# The model and the mmproj file are checked independently, so a run that was
# interrupted between the two downloads is completed on the next run.
if [[ ! -f "$MODEL_FILE" ]]; then
  echo "Model file '$MODEL_NAME_QUANT.gguf' not found. Downloading..."
  fetch_model_asset "$MODEL_URL" "$MODEL_FILE"
  echo "Model downloaded successfully."
else
  echo "Model file '$MODEL_NAME_QUANT.gguf' already exists."
fi

if [[ ! -f "$MMPROJ_FILE" ]]; then
  echo "mmproj file 'mmproj-BF16.gguf' not found. Downloading..."
  fetch_model_asset "$MMPROJ_URL" "$MMPROJ_FILE"
  echo "mmproj downloaded successfully."
else
  echo "mmproj file 'mmproj-BF16.gguf' already exists."
fi
# Fetch the custom chat template when the selected model defines one.
# The template is downloaded to a ".part" file and renamed into place only
# after curl succeeds; --fail turns HTTP errors into a failure instead of
# saving the error page as the template, which later runs would keep using.
if [[ -n "$CUSTOM_TEMPLATE_URL" ]]; then
  CUSTOM_TEMPLATE_PATH="$MODEL_DIR/chat_template.jinja"
  if [[ ! -f "$CUSTOM_TEMPLATE_PATH" ]]; then
    echo "Custom chat template not found. Downloading..."
    mkdir -p -- "$MODEL_DIR"
    template_part="${CUSTOM_TEMPLATE_PATH}.part"
    if ! curl -# -fL -o "$template_part" "$CUSTOM_TEMPLATE_URL"; then
      rm -f -- "$template_part"
      echo "Download failed: $CUSTOM_TEMPLATE_URL" >&2
      exit 1
    fi
    mv -f -- "$template_part" "$CUSTOM_TEMPLATE_PATH"
    echo "Chat template downloaded."
  fi
fi
# Generate models-preset.ini for per-model configuration.
# The file is rewritten on every run. The global [*] section and the
# chat-template-file entry are only emitted when a custom template is in use;
# the sampling settings are always written.
{
  if [[ -n "$CUSTOM_TEMPLATE_URL" ]]; then
    printf '%s\n' \
      '[*]' \
      'chat-template-kwargs = {"preserve_thinking_enabled": true}'
  fi

  printf '[%s]\n' "$MODEL_NAME_QUANT"

  if [[ -n "$CUSTOM_TEMPLATE_URL" ]]; then
    # Path as seen from inside the container, where the volume is at /models.
    printf 'chat-template-file = /models/%s/chat_template.jinja\n' "$MODEL_NAME_QUANT"
  fi

  printf '%s\n' \
    'temp = 0.6' \
    'top-p = 0.95' \
    'top-k = 20' \
    'min-p = 0.00'
} > "$PRESET_FILE"
# 128k ctx fits but leaves <500MB VRAM headroom; 96k is a safer balance
CTX_SIZE=98304

# Options for podman itself (everything before the image name).
podman_opts=(
  --name llama.cpp
  --pull newer
  --detach
  --replace                 # replace an existing container of the same name
  --group-add keep-groups
  --shm-size 16G
  --ulimit memlock=-1:-1
  --security-opt label=type:container_runtime_t
  --device /dev/kfd
  --device /dev/dri
  -v "$VOLUME_NAME":/models
  -p 8080:8080
)

# Arguments handed to the server inside the container (everything after the
# image name).
# NOTE(review): the speculative-decoding flags are passed regardless of which
# model was selected — confirm they are accepted for the non-MTP model.
server_opts=(
  --models-dir /models
  --models-preset /models/models-preset.ini
  --parallel 1
  --gpu-layers all
  --mlock
  --direct-io
  --cache-reuse 128
  --sleep-idle-seconds 1200
  --ctx-size "$CTX_SIZE"
  --cache-type-k q8_0
  --cache-type-v q8_0
  --spec-type draft-mtp
  --spec-draft-n-max 2
)

podman run "${podman_opts[@]}" "$LLAMA_IMAGE" "${server_opts[@]}"
# Install a systemd user unit that starts the "llama.cpp" container.
# The unit is written only when it does not exist yet; an existing file is
# left untouched.
# NOTE(review): a user unit normally starts with the user's session; starting
# at boot presumably requires lingering to be enabled for the account — confirm.
SYSTEMD_DIR="$HOME/.config/systemd/user"
SERVICE_FILE="$SYSTEMD_DIR/llamacpp.service"

if [[ -f "$SERVICE_FILE" ]]; then
  echo "Systemd service already exists."
else
  echo "Systemd service not found. Creating..."
  mkdir -p "$SYSTEMD_DIR"
  # Quoted delimiter: the unit text is written verbatim, without expansion.
  cat << 'SERVICE_EOF' > "$SERVICE_FILE"
[Unit]
Description=llama.cpp Inference Server
Documentation=https://github.com/ggml-org/llama.cpp
After=network-online.target
Wants=network-online.target
[Service]
Type=forking
RemainAfterExit=yes
ExecStart=/usr/bin/podman start llama.cpp
ExecStop=/usr/bin/podman stop llama.cpp
Restart=on-failure
RestartSec=10
[Install]
WantedBy=default.target
SERVICE_EOF
  # Make systemd pick up the new unit, then enable it.
  systemctl --user daemon-reload
  systemctl --user enable llamacpp.service
  echo "Systemd service created and enabled."
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment