# A llama-swap configuration file managing a collection of LLM models.
healthCheckTimeout: 666
logLevel: debug
logTimeFormat: "rfc3339"
logToStdout: "both"
metricsMaxInMemory: 1000
captureBuffer: 15
startPort: 10001
sendLoadingState: true
includeAliasesInList: false
# -ot "blk\.([0-9])\..*=CUDA0" -ot "blk\.(1[0-8])\..*=CUDA1" -ot "blk\.(19|[2-9][0-9])\..*=CPU"
# -np 1 -t 32 -cram 32768 -b 3072 -ub 3072 --slot-save-path /home/me/workspace/slots
# --clear-idle --kv-unified
#
# Embedding model stays loaded persistently; chat models auto-fit around it via --fit on.
groups:
  "always-on":
    persistent: true
    swap: false
    exclusive: false
    # members:
    #   - "qwen3-embedding"

hooks:
  # on_startup:
  #   preload:
  #     - "qwen3-embedding"

# --image-min-tokens 2048
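# Usage sketch (hypothetical host/port; not part of this config): llama-swap
# proxies an OpenAI-compatible API, and the "model" field of a request selects
# which entry under `models:` below is started, swapping out models that are
# not in a persistent group:
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "glm4.7-flash-q8", "messages": [{"role": "user", "content": "Hello"}]}'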
macros:
  "parameters": >
    --metrics
    --jinja
    --temp 1.0 -c 0 --min-p 0.01 --top-p 0.95
    --threads-http 8 --mlock --host 0.0.0.0 --port ${PORT}
    --flash-attn on -ctk q8_0 -ctv q8_0
    -np 2 -t 32 -cram 32768 -b 3072 -ub 3072 --slot-save-path /home/me/workspace/slots
  "base": >
    ${env.HOME}/workspace/llama.cpp/build/bin/llama-server
    ${parameters} --prio 2 --mmap --log-timestamps --kv-unified --fit on
  "ik-base": >
    ${env.HOME}/workspace/ik_llama.cpp/build/bin/llama-server
    ${parameters} -fit
  "limited-reasoning": >
    ${base} --reasoning-budget 1024
  "base-no-reasoning": >
    ${base} --reasoning-budget 0 --chat-template-kwargs '{"enable_thinking": false}'
  "models-dir": "${env.HOME}/workspace/models"
models:
  "qwen3-embedding":
    cmd: |
      ${env.HOME}/workspace/llama.cpp/build/bin/llama-server
      --embeddings --pooling last
      --metrics --host 0.0.0.0 --port ${PORT}
      --flash-attn on --fit on -ub 8192 --verbose-prompt
      --mmap --mlock --log-timestamps
      -t 8 -c 32768 -np 1
      --model ${models-dir}/Qwen3-Embedding-8B-Q8_0.gguf
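  # Example embeddings request through the proxy (hypothetical host/port;
  # llama-server exposes an OpenAI-compatible /v1/embeddings endpoint when
  # started with --embeddings):
  #   curl http://localhost:8080/v1/embeddings \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "qwen3-embedding", "input": "text to embed"}'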
| "minimax-m2.7": | |
| cmd: | | |
| ${base} --model ${models-dir}/MiniMax-M2.7/MiniMax-M2.7-UD-Q8_K_XL-00001-of-00006.gguf | |
| --top-k 40 -c 196608 -np 1 | |
| "minimax-m2.7-nr": | |
| cmd: | | |
| ${base-no-reasoning} --model ${models-dir}/MiniMax-M2.7/MiniMax-M2.7-UD-Q8_K_XL-00001-of-00006.gguf | |
| --top-k 40 -c 196608 -np 1 | |
| "glm5-q3": | |
| cmd: | | |
| ${base-no-reasoning} | |
| --model ${models-dir}/GLM5-q3/UD-Q3_K_XL/GLM-5-UD-Q3_K_XL-00001-of-00008.gguf | |
| "glm5.1-q3": | |
| cmd: | | |
| ${base-no-reasoning} | |
| --model ${models-dir}/GLM5.1-q3/GLM-5.1-UD-Q3_K_XL-00001-of-00008.gguf | |
| "glm4.7": | |
| cmd: | | |
| ${base} -m ${models-dir}/GLM-4.7-Q5/GLM-4.7-UD-Q5_K_XL-00001-of-00006.gguf | |
| "glm4.7-flash-q8": | |
| cmd: | | |
| ${base} -m ${models-dir}/GLM-4.7-Flash-UD-Q8_K_XL.gguf | |
| "glm4.7-flash-bf16": | |
| cmd: | | |
| ${base} -m ${models-dir}/GLM-4.7-Flash-BF16/GLM-4.7-Flash-BF16-00001-of-00002.gguf -c 262144 | |
| "glm4.7-flash-aggressive": | |
| cmd: | | |
| ${base} -m ${models-dir}/GLM-4.7-Flash-Uncensored-HauhauCS-Aggressive-FP16.gguf -c 262144 | |
| "devstral": | |
| cmd: | | |
| ${base} -m ${models-dir}/Devstral-2-123B/Devstral-2-123B-Instruct-2512-UD-Q4_K_XL-00001-of-00002.gguf | |
| "phi4-mini": | |
| cmd: | | |
| ${base} -m ${models-dir}/Phi-4-mini-instruct.Q8_0.gguf | |
| "gpt-oss-120b": | |
| cmd: | | |
| ${base} -m ${models-dir}/gpt-oss-120b/UD-Q8_K_XL/gpt-oss-120b-UD-Q8_K_XL-00001-of-00002.gguf | |
| "mistral4-small": | |
| cmd: | | |
| ${base} -m ${models-dir}/Mistral-4-Small-GGUF/UD-Q8_K_XL/Mistral-Small-4-119B-2603-UD-Q8_K_XL-00001-of-00004.gguf | |
| --mmproj ${models-dir}/Mistral-4-Small-GGUF/mmproj-BF16.gguf | |
| "qwen3-coder-next": | |
| cmd: | | |
| ${base} -m ${models-dir}/Qwen3-Coder-Next-Q8_0/Qwen3-Coder-Next-Q8_0-00001-of-00004.gguf -c 262144 | |
| "gemma4-26b": | |
| cmd: | | |
| ${base} -m ${models-dir}/Gemma4-26B/gemma-4-26B-A4B-it-UD-Q8_K_XL.gguf | |
| --mmproj ${models-dir}/Gemma4-26B/mmproj-BF16.gguf | |
| "gemma4-31b": | |
| cmd: | | |
| ${base} -c 262144 -m ${models-dir}/Gemma4-31B/gemma-4-31B-it-UD-Q8_K_XL.gguf | |
| --mmproj ${models-dir}/Gemma4-31B/mmproj-BF16.gguf | |
| "qwen3.5-27b-aggressive": | |
| cmd: | | |
| ${limited-reasoning} -c 262144 -m ${models-dir}/Qwen3.5/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-BF16.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-27b-aggressive-q8": | |
| cmd: | | |
| ${limited-reasoning} -c 262144 -m ${models-dir}/Qwen3.5/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-27b-aggressive-no-reasoning": | |
| cmd: | | |
| ${base-no-reasoning} -c 262144 -m ${models-dir}/Qwen3.5/Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-BF16.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-27B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-27b": | |
| cmd: | | |
| ${base} -m ${models-dir}/Qwen3.5/Qwen3.5-27B-UD-Q8_K_XL.gguf | |
| "qwen3.5-35b-aggressive": | |
| cmd: | | |
| ${limited-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-35b-aggressive-no-reasoning": | |
| cmd: | | |
| ${base-no-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-35B-A3B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-35b": | |
| cmd: | | |
| ${base} -m ${models-dir}/Qwen3.5/Qwen3.5-35B-A3B-UD-Q8_K_XL.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf | |
| "qwen3.5-122b-aggressive": | |
| cmd: | | |
| ${base} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-122b-aggressive-q4": | |
| cmd: | | |
| ${limited-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf | |
| "qwen3.5-122b-aggressive-q4-nr": | |
| cmd: | | |
| ${base-no-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf | |
| "qwen3.5-122b-aggressive-limited": | |
| cmd: | | |
| ${limited-reasoning} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-Qwen3.5-122B-A10B-Uncensored-HauhauCS-Aggressive-f16.gguf | |
| "qwen3.5-122b": | |
| cmd: | | |
| ${base} -m ${models-dir}/Qwen3.5/Qwen3.5-122B-A10B-UD-Q8_K_XL-00001-of-00004.gguf | |
| --mmproj ${models-dir}/Qwen3.5/mmproj-BF16.gguf | |
| "qwen35-397b-q5": | |
| cmd: | | |
| ${base} -m ${models-dir}/Qwen3.5/Qwen3.5-397B-A17B-UD-Q5_K_XL-00001-of-00007.gguf | |
| # Image Models | |
| "z-image-turbo": | |
| cmd: | | |
| ${base} -m ${models-dir}/z-image-turbo-Q8_0.gguf | |
| "qwen-image-layered": | |
| cmd: | | |
| ${base} -m ${models-dir}/qwen-image-layered-Q8_0.gguf | |
  # filters:
  #   stripParams: "temperature, top_p, min_p, top_k"
  #   setParams:
  #     temperature: 1.0
  #     top_p: 0.95
  #     min_p: 0.01
  #     top_k: 40
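  # (If enabled, stripParams would remove those sampler fields from incoming
  # requests before they reach the server, and setParams would pin fixed
  # values; e.g. a client sending "temperature": 0.2 would instead run with
  # temperature 1.0. Behavior inferred from the field names above.)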
  #
  # Older Models
  #
  # "nemotron-q8":
  #   cmd: |
  #     ${base}
  #     --model ${models-dir}/Nemotron-3-Super-120B-GGUF/UD-Q8_K_XL/NVIDIA-Nemotron-3-Super-120B-A12B-UD-Q8_K_XL-00001-of-00004.gguf
  # "nemotron-q4":
  #   cmd: |
  #     ${base}
  #     --model ${models-dir}/Nemotron-3-Super-120B-GGUF/UD-Q4_K_XL/NVIDIA-Nemotron-3-Super-120B-A12B-UD-Q4_K_XL-00001-of-00003.gguf
  # "glm5-q3-speculative":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/GLM5-q3/UD-Q3_K_XL/GLM-5-UD-Q3_K_XL-00001-of-00008.gguf
  #     --model-draft ${models-dir}/GLM-4.7-Flash-UD-Q3_K_XL.gguf
  #     -ctkd q4_0 -ctvd q4_0 --ctx-size-draft 131072 --device-draft CUDA0
  #     --fit-target 17408,512 --draft-n 3
  # "glm5-q2-speculative":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/GLM5-q2/UD-Q2_K_XL/GLM-5-UD-Q2_K_XL-00001-of-00007.gguf
  #     --model-draft ${models-dir}/GLM-4.7-Flash-UD-Q3_K_XL.gguf
  #     -ctkd q4_0 -ctvd q4_0 --ctx-size-draft 131072 --device-draft CUDA0
  #     --fit-target 17408,512 --draft-n 3
  # "glm5-q2":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/GLM5-q2/UD-Q2_K_XL/GLM-5-UD-Q2_K_XL-00001-of-00007.gguf
  # "glm5-q2-small":
  #   cmd: |
  #     ${base-no-reasoning}
  #     --model ${models-dir}/UD-IQ2_XXS/GLM-5-UD-IQ2_XXS-00001-of-00006.gguf
  # "glm4.7-flash-q3":
  #   cmd: |
  #     ${base} -m ${models-dir}/GLM-4.7-Flash-UD-Q3_K_XL.gguf -c 32768 -ctk q4_0 -ctv q4_0 -np 1
  #     --reasoning-budget 0 --chat-template-kwargs '{"enable_thinking": false}'
  ## ${base} -m ${models-dir}/GLM-4.7-Flash-UD-IQ2_XXS.gguf -c 32768 -ctk q4_0 -ctv q4_0 -np 1
  ## 123 t/s at the high end.
  # "glm4.7-flash-tiny":
  #   cmd: |
  #     ${base} -m ${models-dir}/GLM-4.7-Flash-UD-IQ1_S.gguf -c 32768 -ctk q4_0 -ctv q4_0 -np 1
  #     --reasoning-budget 0 --chat-template-kwargs '{"enable_thinking": false}'
  # "step3.5-flash":
  #   cmd: |
  #     ${base} -m ${models-dir}/step-3.5-flash-q8/Step-3.5-Flash-Q8.gguf
  # "step3.5-flash-alt":
  #   cmd: |
  #     ${base} -m ${models-dir}/StepFun-3.5-Flash-Q8/step3p5_flash_Q8_0-00001-of-00022.gguf