@vitorcalvi
Last active February 6, 2025 17:07
## Max Tokens
python -m llama_cpp.server --model DeepSeek-R1-Distill-Qwen-1.5B-Q4_1.gguf --host 0.0.0.0 --n_threads 8 --n_batch 512 --n_gpu_layers 0 --n_ctx 2048 --mul_mat_q 1
## Balanced
python -m llama_cpp.server --model DeepSeek-R1-Distill-Qwen-1.5B-Q4_1.gguf --host 0.0.0.0 --n_threads 8 --n_batch 32 --n_gpu_layers 0 --n_ctx 512 --mul_mat_q 1 --offload_kqv 1
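With either server command above running, llama_cpp.server exposes an OpenAI-compatible HTTP API (port 8000 by default). A minimal client sketch, assuming the default address; the prompt and generation parameters are illustrative, not part of the original gist:

```python
import json
import urllib.request

# Build a payload for the OpenAI-compatible /v1/completions endpoint
# served by llama_cpp.server (assumed default: localhost:8000).
def build_completion_request(prompt, max_tokens=128, temperature=0.7):
    return {
        "prompt": prompt,
        "max_tokens": max_tokens,    # cap on generated tokens
        "temperature": temperature,  # sampling temperature
    }

if __name__ == "__main__":
    payload = build_completion_request("Explain KV-cache offloading in one sentence.")
    req = urllib.request.Request(
        "http://localhost:8000/v1/completions",  # assumed default address
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    # Uncomment once the server is up:
    # with urllib.request.urlopen(req) as resp:
    #     print(json.load(resp)["choices"][0]["text"])
```

The same endpoint works for both the Max Tokens and Balanced configurations; only the server-side context and batch limits differ.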
## VULKAN
./llama-cli -m ../../../models/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf --gpu-layers 24 --ctx-size 2048 --batch-size 512 --threads 4