# A configuration file for the llama-swap tool, managing a number of different LLM models.
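#
# Typical launch (flags as in the llama-swap README; the listen address here is
# illustrative):
#   llama-swap --config llama-swap.yaml --listen :8080
# Clients then hit the OpenAI-compatible endpoint and pick a model by name, e.g.
# (assuming the :8080 listen address above):
#   curl http://localhost:8080/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "glm4.7-flash-q8", "messages": [{"role": "user", "content": "Hello"}]}'
#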
healthCheckTimeout: 666
logLevel: debug
logTimeFormat: "rfc3339"
logToStdout: "both"
metricsMaxInMemory: 1000
captureBuffer: 15
startPort: 10001
sendLoadingState: true
includeAliasesInList: false
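# Notes on the settings above (summarized from the llama-swap docs; verify
# against your build): healthCheckTimeout is how many seconds to wait for a
# model's health check before giving up, and startPort is the first port handed
# out through the ${PORT} macro, incrementing for each model started.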
# -ot "blk\.([0-9])\..*=CUDA0" -ot "blk\.(1[0-8])\..*=CUDA1" -ot "blk\.(19|[2-9][0-9])\..*=CPU"
# -np 1 -t 32 -cram 32768 -b 3072 -ub 3072 --slot-save-path /home/me/workspace/slots
# --clear-idle --kv-unified
#
# Embedding model stays loaded persistently; chat models auto-fit around it via --fit on
groups:
  "always-on":
    persistent: true
    swap: false
    exclusive: false
    # members:
    #   - "qwen3-embedding"
hooks:
  # on_startup:
  #   preload:
  #     - "qwen3-embedding"
#
# --image-min-tokens 2048
macros:
  "parameters": >
    --metrics
    --jinja
    --temp 1.0 -c 0 --min-p 0.01 --top-p 0.95
    --threads-http 8 --mlock --host 0.0.0.0 --port ${PORT}
    --flash-attn on -ctk q8_0 -ctv q8_0
    -np 2 -t 32 -cram 32768 -b 3072 -ub 3072 --slot-save-path /home/me/workspace/slots
  "base": >
    ${env.HOME}/workspace/llama.cpp/build/bin/llama-server
    ${parameters} --prio 2 --mmap --log-timestamps --kv-unified --fit on
  "ik-base": >
    ${env.HOME}/workspace/ik_llama.cpp/build/bin/llama-server
    ${parameters} -fit
  "limited-reasoning": >
    ${base} --reasoning-budget 1024
  "base-no-reasoning": >
    ${base} --reasoning-budget 0 --chat-template-kwargs '{"enable_thinking": false}'
  "models-dir": "${env.HOME}/workspace/models"
models:
  "qwen3-embedding":
    cmd: |
      ${env.HOME}/workspace/llama.cpp/build/bin/llama-server
      --embeddings --pooling last
      --metrics --host 0.0.0.0 --port ${PORT}
      --flash-attn on --fit on -ub 8192 --verbose-prompt
      --mmap --mlock --log-timestamps
      -t 8 -c 32768 -np 1
      --model ${models-dir}/Qwen3-Embedding-8B-Q8_0.gguf
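  # A quick sanity check for the embedding model (assuming the :8080 listen
  # address from the launch example at the top of this file):
  #   curl http://localhost:8080/v1/embeddings \
  #     -H 'Content-Type: application/json' \
  #     -d '{"model": "qwen3-embedding", "input": "hello world"}'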
"minimax-m2.7":
cmd: |
${base} --model ${models-dir}/MiniMax-M2.7/MiniMax-M2.7-UD-Q8_K_XL-00001-of-00006.gguf
--top-k 40 -c 196608 -np 1
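  # Per-model flags land after the macro text, and llama-server generally takes
  # the last occurrence of a flag, so the -c 196608 -np 1 above overrides the
  # -c 0 -np 2 defaults from ${parameters} (last-wins is the usual behavior for
  # scalar llama-server flags; verify on your build).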
"minimax-m2.7-nr":
cmd: |
${base-no-reasoning} --model ${models-dir}/MiniMax-M2.7/MiniMax-M2.7-UD-Q8_K_XL-00001-of-00006.gguf
--top-k 40 -c 196608 -np 1
"glm5-q3":
cmd: |
${base-no-reasoning}
--model ${models-dir}/GLM5-q3/UD-Q3_K_XL/GLM-5-UD-Q3_K_XL-00001-of-00008.gguf
"glm5.1-q3":
cmd: |
${base-no-reasoning}
--model ${models-dir}/GLM5.1-q3/GLM-5.1-UD-Q3_K_XL-00001-of-00008.gguf
"glm4.7":
cmd: |
${base} -m ${models-dir}/GLM-4.7-Q5/GLM-4.7-UD-Q5_K_XL-00001-of-00006.gguf
"glm4.7-flash-q8":
cmd: |
${base} -m ${models-dir}/GLM-4.7-Flash-UD-Q8_K_XL.gguf
"glm4.7-flash-bf16":
cmd: |
${base} -m ${models-dir}/GLM-4.7-Flash-BF16/GLM-4.7-Flash-BF16-00001-of-00002.gguf -c 262144
"glm4.7-flash-aggressive":
cmd: |
${base} -m ${models-dir}/GLM-4.7-Flash-Uncensored-HauhauCS-Aggressive-FP16.gguf -c 262144
"devstral":
cmd: |
${base} -m ${models-dir}/Devstral-2-123B/Devstral-2-123B-Instruct-2512-UD-Q4_K_XL-00001-of-00002.gguf
"phi4-mini":
cmd: |
${base} -m ${models-dir}/Phi-4-mini-instruct.Q8_0.gguf
"gpt-oss-120b":
cmd: |
${base} -m ${models-dir}/gpt-oss-120b/UD-Q8_K_XL/gpt-oss-120b-UD-Q8_K_XL-00001-of-00002.gguf
"mistral4-small":
cmd: |
${base} -m ${models-dir}/Mistral-4-Small-GGUF/UD-Q8_K_XL/Mistral-Small-4-119B-2603-UD-Q8_K_XL-00001-of-00004.gguf
--mmproj ${models-dir}/Mistral-4-Small-GGUF/mmproj-BF16.gguf
"qwen3-coder-next":
cmd: |
${base} -m ${models-dir}/Qwen3-Coder-Next-Q8_0/Qwen3-Coder-Next-Q8_0-00001-of-00004.gguf -c 262144
"gemma4-26b":
cmd: |
${base} -m ${models-dir}/Gemma4-26B/gemma-4-26B-A4B-it-UD-Q8_K_XL.gguf
--mmproj ${models-dir}/Gemma4-26B/mmproj-BF16.gguf
"gemma4-31b":
cmd: |
${base} -c 262144 -m ${models-dir}/Gemma4-31B/gemma-4-31B-it-UD-Q8_K_XL.gguf
--mmproj ${models-dir}/Gemma4-31B/mmproj-BF16.gguf
"qwen3.5-27b-aggressive":
cmd: |
${limited-reasoning} -c 262144 -m ${models-dir}/Qwen3.5/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-BF16.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-27b-aggressive-q8":
cmd: |
${limited-reasoning} -c 262144 -m ${models-dir}/Qwen3.5/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-27b-aggressive-no-reasoning":
cmd: |
${base-no-reasoning} -c 262144 -m ${models-dir}/Qwen3.5/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-BF16.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-27b":
cmd: |
${base} -m ${models-dir}/Qwen3.5/Qwen3.5-27B-UD-Q8_K_XL.gguf
"qwen3.5-35b-aggressive":
cmd: |
${limited-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-35b-aggressive-no-reasoning":
cmd: |
${base-no-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-35b":
cmd: |
${base} -m ${models-dir}/Qwen3.5/Qwen3.5-35B-A3B-UD-Q8_K_XL.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf
"qwen3.5-122b-aggressive":
cmd: |
${base} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-122b-aggressive-q4":
cmd: |
${limited-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf
"qwen3.5-122b-aggressive-q4-nr":
cmd: |
${base-no-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf
"qwen3.5-122b-aggressive-limited":
cmd: |
${limited-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-f16.gguf
"qwen3.5-122b":
cmd: |
${base} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-UD-Q8_K_XL-00001-of-00004.gguf
--mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf
"qwen35-397b-q5":
cmd: |
${base} -m ${models-dir}/Qwen3.5/Qwen3.5-397B-A17B-UD-Q5_K_XL-00001-of-00007.gguf
  # Image Models
  "z-image-turbo":
    cmd: |
      ${base} -m ${models-dir}/z-image-turbo-Q8_0.gguf
  "qwen-image-layered":
    cmd: |
      ${base} -m ${models-dir}/qwen-image-layered-Q8_0.gguf
  # filters:
  #   stripParams: "temperature, top_p, min_p, top_k"
  #   setParams:
  #     temperature: 1.0
  #     top_p: 0.95
  #     min_p: 0.01
  #     top_k: 40
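  # If uncommented under a model, filters would rewrite incoming requests
  # before they reach the server: stripParams drops those client-supplied
  # sampling fields and setParams pins replacement values (key names as used
  # here; check the llama-swap docs for the exact spelling in your version).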
  #
  # Older Models
  #
  # "nemotron-q8":
  #   cmd: |
  #     ${base}
  #     --model ${models-dir}/Nemotron-3-Super-120B-GGUF/UD-Q8_K_XL/NVIDIA-Nemotron-3-Super-120B-A12B-UD-Q8_K_XL-00001-of-00004.gguf
  # "nemotron-q4":
  #   cmd: |
  #     ${base}
  #     --model ${models-dir}/Nemotron-3-Super-120B-GGUF/UD-Q4_K_XL/NVIDIA-Nemotron-3-Super-120B-A12B-UD-Q4_K_XL-00001-of-00003.gguf
  # "glm5-q3-speculative":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/GLM5-q3/UD-Q3_K_XL/GLM-5-UD-Q3_K_XL-00001-of-00008.gguf
  #     --model-draft ${models-dir}/GLM-4.7-Flash-UD-Q3_K_XL.gguf
  #     -ctkd q4_0 -ctvd q4_0 --ctx-size-draft 131072 --device-draft CUDA0
  #     --fit-target 17408,512 --draft-n 3
  # "glm5-q2-speculative":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/GLM5-q2/UD-Q2_K_XL/GLM-5-UD-Q2_K_XL-00001-of-00007.gguf
  #     --model-draft ${models-dir}/GLM-4.7-Flash-UD-Q3_K_XL.gguf
  #     -ctkd q4_0 -ctvd q4_0 --ctx-size-draft 131072 --device-draft CUDA0
  #     --fit-target 17408,512 --draft-n 3
  # "glm5-q2":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/GLM5-q2/UD-Q2_K_XL/GLM-5-UD-Q2_K_XL-00001-of-00007.gguf
  # "glm5-q2-small":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/UD-IQ2_XXS/GLM-5-UD-IQ2_XXS-00001-of-00006.gguf
  # "glm4.7-flash-q3":
  #   cmd: |
  #     ${base} -m ${models-dir}/GLM-4.7-Flash-UD-Q3_K_XL.gguf -c 32768 -ctk q4_0 -ctv q4_0 -np 1
  #     --reasoning-budget 0 --chat-template-kwargs '{"enable_thinking": false}'
  ##    ${base} -m ${models-dir}/GLM-4.7-Flash-UD-IQ2_XXS.gguf -c 32768 -ctk q4_0 -ctv q4_0 -np 1
  ##    123 t/s at the high end.
  # "glm4.7-flash-tiny":
  #   cmd: |
  #     ${base} -m ${models-dir}/GLM-4.7-Flash-UD-IQ1_S.gguf -c 32768 -ctk q4_0 -ctv q4_0 -np 1
  #     --reasoning-budget 0 --chat-template-kwargs '{"enable_thinking": false}'
  # "step3.5-flash":
  #   cmd: |
  #     ${base} -m ${models-dir}/step-3.5-flash-q8/Step-3.5-Flash-Q8.gguf
  # "step3.5-flash-alt":
  #   cmd: |
  #     ${base} -m ${models-dir}/StepFun-3.5-Flash-Q8/step3p5_flash_Q8_0-00001-of-00022.gguf