@surajssd
Created March 5, 2025 21:54
[{"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_01", "GPU": "1xStandard_ND96asr_v4 x 2", "# of req.": 200, "Tput (req/s)": 0.9284057358744006, "Output Tput (tok/s)": 198.24247678126076, "Total Tput (tok/s)": 396.266778214591, "Mean TTFT (ms)": 110.37337160010793, "Median TTFT (ms)": 96.9816950000677, "P99 TTFT (ms)": 230.3005734290491, "Mean TPOT (ms)": 43.72182021034344, "Median TPOT (ms)": 43.54532462942404, "P99 TPOT (ms)": 50.513716590712384, "Mean ITL (ms)": 43.631314270832306, "Median ITL (ms)": 42.27557599915599, "P99 ITL (ms)": 87.99811164881247}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_04", "GPU": "1xStandard_ND96asr_v4 x 2", "# of req.": 200, "Tput (req/s)": 2.521471685463534, "Output Tput (tok/s)": 539.0528242768216, "Total Tput (tok/s)": 1076.8701274277662, "Mean TTFT (ms)": 139.8380736899344, "Median TTFT (ms)": 125.15622350110789, "P99 TTFT (ms)": 332.96458055017825, "Mean TPOT (ms)": 61.62705314762229, "Median TPOT (ms)": 63.49695762410795, "P99 TPOT (ms)": 83.14804765725845, "Mean ITL (ms)": 60.90314970594733, "Median ITL (ms)": 57.82372999965446, "P99 ITL (ms)": 173.3455703589425}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_16", "GPU": "1xStandard_ND96asr_v4 x 2", "# of req.": 200, "Tput (req/s)": 3.697917701235066, "Output Tput (tok/s)": 791.8905861309833, "Total Tput (tok/s)": 1580.6379422159166, "Mean TTFT (ms)": 226.7617405450983, "Median TTFT (ms)": 215.64252300049702, "P99 TTFT (ms)": 479.7536375109121, "Mean TPOT (ms)": 87.57618569686481, "Median TPOT (ms)": 81.37176336238505, "P99 TPOT (ms)": 153.84377854204757, "Mean ITL (ms)": 72.72835843662789, "Median ITL (ms)": 64.36212599874125, "P99 ITL (ms)": 233.4853582404321}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_inf", "GPU": "1xStandard_ND96asr_v4 x 2", "# of req.": 200, "Tput (req/s)": 4.132880392447687, "Output Tput (tok/s)": 880.9027912482622, "Total Tput (tok/s)": 1762.4255145553916, "Mean TTFT (ms)": 2683.6246253499667, "Median TTFT (ms)": 2771.161826000025, "P99 TTFT (ms)": 4838.2172842909495, "Mean TPOT (ms)": 114.09317725452917, "Median TPOT (ms)": 79.11987648951599, "P99 TPOT (ms)": 400.5428015899088, "Mean ITL (ms)": 71.80776212203955, "Median ITL (ms)": 64.29544500133488, "P99 ITL (ms)": 399.1085860384919}]

Latency tests

  • Input length: 32 tokens.
  • Output length: 128 tokens.
  • Batch size: fixed (8).
  • Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  • Evaluation metrics: end-to-end latency (mean, median, p99).
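
The latency tables in this gist are empty, but a run of this shape would typically use vLLM's benchmark_latency.py. The sketch below is only an assumption based on the upstream benchmark suite (flag names can differ between vLLM versions, and the tensor-parallel setting simply mirrors the tp4 used for serving here); it is not a command that was run for this gist:

python3 benchmark_latency.py \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 4 \
  --input-len 32 \
  --output-len 128 \
  --batch-size 8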

Throughput tests

  • Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
  • Output length: the corresponding output length of these 200 prompts.
  • Batch size: dynamically determined by vLLM to achieve maximum throughput.
  • Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  • Evaluation metrics: throughput.
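
The throughput tables in this gist are likewise empty. For reference, a throughput run of this shape would typically use vLLM's benchmark_throughput.py; the sketch below assumes the same ShareGPT dataset path and the --dataset-name/--dataset-path convention seen in the serving command recorded later in this gist, and was not run here:

python3 benchmark_throughput.py \
  --backend vllm \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --dataset-name sharegpt \
  --dataset-path /root/sharegpt.json \
  --num-prompts 200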

Serving tests

  • Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
  • Output length: the corresponding output length of these 200 prompts.
  • Batch size: dynamically determined by vLLM and the arrival pattern of the requests.
  • Average QPS (query per second): 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
  • Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  • We also added a speculative decoding test for llama-3 70B, under QPS 2.
  • Evaluation metrics: throughput, TTFT (time to first token; mean, median and p99), TPOT (time per output token; mean, median and p99), and ITL (inter-token latency; mean, median and p99). The exact benchmark_serving.py invocation behind each row is shown after the results table below.

| Test name | GPU | # of req. | Tput (req/s) | Output Tput (tok/s) | Total Tput (tok/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | Median TPOT (ms) | P99 TPOT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_01 | 1xStandard_ND96asr_v4 x 2 | 200 | 0.928406 | 198.242 | 396.267 | 110.373 | 96.9817 | 230.301 | 43.7218 | 43.5453 | 50.5137 | 43.6313 | 42.2756 | 87.9981 |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_04 | 1xStandard_ND96asr_v4 x 2 | 200 | 2.52147 | 539.053 | 1076.87 | 139.838 | 125.156 | 332.965 | 61.6271 | 63.497 | 83.148 | 60.9031 | 57.8237 | 173.346 |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_16 | 1xStandard_ND96asr_v4 x 2 | 200 | 3.69792 | 791.891 | 1580.64 | 226.762 | 215.643 | 479.754 | 87.5762 | 81.3718 | 153.844 | 72.7284 | 64.3621 | 233.485 |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_inf | 1xStandard_ND96asr_v4 x 2 | 200 | 4.13288 | 880.903 | 1762.43 | 2683.62 | 2771.16 | 4838.22 | 114.093 | 79.1199 | 400.543 | 71.8078 | 64.2954 | 399.109 |
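
Each row above is a separate run of vLLM's benchmark_serving.py against the deployed model. The QPS 1 invocation below is taken verbatim from the client_command recorded in the raw result file (see the excerpt near the end of this gist); for the other rows, presumably only --request-rate and the result filename change:

python3 benchmark_serving.py \
  --save-result \
  --base-url http://llama-3-3-70b-instruct-leader.default:8000 \
  --result-dir /root/vllm/.buildkite/results/ \
  --result-filename serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_01.json \
  --request-rate 01 \
  --model=meta-llama/Llama-3.3-70B-Instruct \
  --backend=vllm \
  --dataset-name=sharegpt \
  --dataset-path=/root/sharegpt.json \
  --num-prompts=200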

JSON version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format. You can load the benchmarking tables into pandas dataframes as follows:

import json
import pandas as pd

# Paste the JSON string given further down in place of the placeholder below.
benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)

# Each top-level key ("latency", "throughput", "serving") becomes its own dataframe.
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])

The JSON string for all benchmarking tables:

{"latency": {}, "throughput": {}, "serving": {"Test name": {"0": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_inf", "1": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_01", "2": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_04", "3": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_16"}, "GPU": {"0": "Standard_ND96asr_v4 x 2", "1": "Standard_ND96asr_v4 x 2", "2": "Standard_ND96asr_v4 x 2", "3": "Standard_ND96asr_v4 x 2"}, "# of req.": {"0": 200, "1": 200, "2": 200, "3": 200}, "Tput (req/s)": {"0": 4.132880392447687, "1": 0.9284057358744006, "2": 2.521471685463534, "3": 3.697917701235066}, "Output Tput (tok/s)": {"0": 880.9027912482622, "1": 198.24247678126076, "2": 539.0528242768216, "3": 791.8905861309833}, "Total Tput (tok/s)": {"0": 1762.4255145553916, "1": 396.266778214591, "2": 1076.8701274277662, "3": 1580.6379422159166}, "Mean TTFT (ms)": {"0": 2683.6246253499667, "1": 110.37337160010793, "2": 139.8380736899344, "3": 226.7617405450983}, "Median TTFT (ms)": {"0": 2771.161826000025, "1": 96.9816950000677, "2": 125.15622350110789, "3": 215.64252300049702}, "P99 TTFT (ms)": {"0": 4838.2172842909495, "1": 230.3005734290491, "2": 332.96458055017825, "3": 479.7536375109121}, "Mean TPOT (ms)": {"0": 114.09317725452917, "1": 43.72182021034344, "2": 61.62705314762229, "3": 87.57618569686481}, "Median TPOT (ms)": {"0": 79.11987648951599, "1": 43.54532462942404, "2": 63.49695762410795, "3": 81.37176336238505}, "P99 TPOT (ms)": {"0": 400.5428015899088, "1": 50.513716590712384, "2": 83.14804765725845, "3": 153.84377854204757}, "Mean ITL (ms)": {"0": 71.80776212203955, "1": 43.631314270832306, "2": 60.90314970594733, "3": 72.72835843662789}, "Median ITL (ms)": {"0": 64.29544500133488, "1": 42.27557599915599, "2": 57.82372999965446, "3": 64.36212599874125}, "P99 ITL (ms)": {"0": 399.1085860384919, "1": 87.99811164881247, "2": 173.3455703589425, "3": 233.4853582404321}}}

You can also check the raw experiment data in the Artifact tab of the Buildkite page.

# Instructions are here: https://github.com/surajssd/llm-k8s/blob/9271454bc5a008a437c7b52c33409b18d6cb2220/configs/llama-3-3-70b-instruct/two-nodes-eight-gpus
git clone https://github.com/surajssd/llm-k8s
cd llm-k8s
git checkout 9271454bc5a008a437c7b52c33409b18d6cb2220
source .env
export VM_SIZE="Standard_ND96asr_v4"
export GPU_NODE_COUNT=2
export AZURE_REGION=southcentralus
./scripts/deploy-aks.sh deploy_aks
./scripts/deploy-aks.sh download_aks_credentials
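# Optional sanity check (not part of the original steps): confirm kubectl now
# points at the new cluster.
kubectl get nodes -o wide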
./scripts/deploy-aks.sh install_kube_prometheus
./scripts/deploy-aks.sh install_lws_controller
./scripts/deploy-aks.sh add_nodepool
./scripts/deploy-aks.sh install_network_operator
./scripts/deploy-aks.sh install_gpu_operator
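# Optional check (an addition to the original steps): once the GPU operator is
# running, each Standard_ND96asr_v4 node should advertise 8 nvidia.com/gpu.
kubectl get nodes -o json | jq '.items[].status.allocatable["nvidia.com/gpu"]'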
export HF_TOKEN=""
kubectl create secret generic hf-token-secret --from-literal token=${HF_TOKEN}
kubectl apply -f configs/llama-3-3-70b-instruct/two-nodes-eight-gpus/k8s/
./configs/llama-3-3-70b-instruct/two-nodes-eight-gpus/fix-svc.sh
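# Optional: watch the leader and worker pods come up; pulling the 70B weights
# can take a while, so wait until everything is Running before port-forwarding.
kubectl get pods -w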
# Test that the model is deployed: port-forward the leader service (this blocks,
# so run it in a separate terminal) and send a chat completion request.
kubectl port-forward svc/llama-3-3-70b-instruct-leader 8000
# If the request below returns a completion, the model is being served.
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "meta-llama/Llama-3.3-70B-Instruct",
    "messages": [
      {
        "role": "user",
        "content": "Explain the origin of Llama the animal?"
      }
    ]
  }' | jq
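# Optional lighter-weight check (not in the original steps): list the models
# exposed by vLLM's OpenAI-compatible API.
curl http://localhost:8000/v1/models | jq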
# Benchmark
# Steps: https://github.com/surajssd/llm-k8s/blob/9271454bc5a008a437c7b52c33409b18d6cb2220/benchmark/vllm_upstream
kubectl create ns vllm-benchmark
kubectl -n vllm-benchmark create configmap benchmark-runner \
  --from-literal=TEST_SERVER_URL="http://llama-3-3-70b-instruct-leader.default:8000" \
  --from-literal=MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" \
  --from-literal=TENSOR_PARALLEL_SIZE=4 \
  --from-literal=PIPELINE_PARALLEL_SIZE="${GPU_NODE_COUNT}" \
  --from-literal=GPU_VM_SKU="${VM_SIZE}" \
  --dry-run=client -o yaml | kubectl apply -f -
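# Optional: inspect the rendered ConfigMap before launching the benchmark.
kubectl -n vllm-benchmark get configmap benchmark-runner -o yaml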
kubectl -n vllm-benchmark create secret generic hf-token-secret --from-literal token=${HF_TOKEN}
kubectl apply -f benchmark/vllm_upstream/k8s/
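# Optional: wait for the benchmark-runner pod to become Ready before exec'ing
# into it (app=benchmark-runner matches the selector used below).
kubectl -n vllm-benchmark wait --for=condition=Ready pod -l app=benchmark-runner --timeout=15m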
POD_NAME=$(kubectl -n vllm-benchmark \
  get pods \
  -l app=benchmark-runner \
  --field-selector=status.phase=Running \
  -o jsonpath='{.items[].metadata.name}')
kubectl -n vllm-benchmark \
  exec -it $POD_NAME \
  -- bash /root/scripts/run_vllm_upstream_benchmark.sh
RESULTS_FILE=$(kubectl -n vllm-benchmark \
  exec -it $POD_NAME \
  -- bash -c "ls /root/results*.tar.gz" | tr -d '\r')
kubectl -n vllm-benchmark \
  cp "${POD_NAME}:${RESULTS_FILE}" "./$(basename ${RESULTS_FILE})"
{
  "client_command": "python3 benchmark_serving.py --save-result --base-url http://llama-3-3-70b-instruct-leader.default:8000 --result-dir /root/vllm/.buildkite/results/ --result-filename serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_01.json --request-rate 01 --model=meta-llama/Llama-3.3-70B-Instruct --backend=vllm --dataset-name=sharegpt --dataset-path=/root/sharegpt.json --num-prompts=200",
  "gpu_type": "Standard_ND96asr_v4 x 2"
}