@surajssd
Last active February 27, 2025 20:02
[{"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_01", "GPU": "1xStandard_NC96ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 0.9279779228604551, "Output Tput (tok/s)": 198.50839736869426, "Total Tput (tok/s)": 396.441448425215, "Mean TTFT (ms)": 154.77130150999983, "Median TTFT (ms)": 128.38760200008892, "P99 TTFT (ms)": 376.5480166300789, "Mean TPOT (ms)": 44.93937090850136, "Median TPOT (ms)": 44.63469464226745, "P99 TPOT (ms)": 58.03939859885578, "Mean ITL (ms)": 44.85155391470774, "Median ITL (ms)": 43.878026000129466, "P99 ITL (ms)": 131.9412263799859}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_04", "GPU": "1xStandard_NC96ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 2.495698100981504, "Output Tput (tok/s)": 532.9313724835904, "Total Tput (tok/s)": 1065.2512989324402, "Mean TTFT (ms)": 246.33752696000101, "Median TTFT (ms)": 219.59910649991343, "P99 TTFT (ms)": 606.8011953101309, "Mean TPOT (ms)": 73.95579696666762, "Median TPOT (ms)": 73.8304143313294, "P99 TPOT (ms)": 129.3536884687555, "Mean ITL (ms)": 69.54965636861299, "Median ITL (ms)": 55.53150799994455, "P99 ITL (ms)": 288.60196543984557}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_16", "GPU": "1xStandard_NC96ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 3.325199159049069, "Output Tput (tok/s)": 710.1295324065192, "Total Tput (tok/s)": 1419.3778870358904, "Mean TTFT (ms)": 2461.8073529899934, "Median TTFT (ms)": 2882.5457675000052, "P99 TTFT (ms)": 3660.7770861201493, "Mean TPOT (ms)": 199.3748304130239, "Median TPOT (ms)": 117.27497017068639, "P99 TPOT (ms)": 704.7451063547737, "Mean ITL (ms)": 96.27992907936581, "Median ITL (ms)": 70.86817749996044, "P99 ITL (ms)": 705.1076254899294}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_inf", "GPU": "1xStandard_NC96ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 3.3334046649708653, "Output Tput (tok/s)": 710.3485341052914, "Total Tput (tok/s)": 1421.3470821202523, "Mean TTFT (ms)": 8390.680319614992, "Median TTFT (ms)": 8344.11360350009, "P99 TTFT (ms)": 15300.305423499995, "Mean TPOT (ms)": 205.38025907747684, "Median TPOT (ms)": 116.90824614373891, "P99 TPOT (ms)": 700.7413872749946, "Mean ITL (ms)": 96.44242881350782, "Median ITL (ms)": 70.6381174999251, "P99 ITL (ms)": 701.5374677201612}]

Latency tests

  • Input length: 32 tokens.
  • Output length: 128 tokens.
  • Batch size: fixed (8).
  • Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  • Evaluation metrics: end-to-end latency (mean, median, p99); see the sketch after this list.
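
For reference, these statistics are plain aggregates over per-request end-to-end latencies. A minimal sketch of how they can be computed, assuming numpy and a hypothetical list of measured latencies (this is not the benchmark's own code):

import numpy as np

# Hypothetical per-request end-to-end latencies in milliseconds.
latencies_ms = [1234.5, 1301.2, 1187.9, 1422.8]

print("mean  :", np.mean(latencies_ms))
print("median:", np.percentile(latencies_ms, 50))
print("p99   :", np.percentile(latencies_ms, 99))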

Throughput tests

  • Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed); see the sampling sketch after this list.
  • Output length: the output length corresponding to each of these 200 prompts.
  • Batch size: dynamically determined by vllm to achieve maximum throughput.
  • Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  • Evaluation metrics: throughput.
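
The seeded sampling described above can be sketched as follows; the file path and dataset shape are assumptions for illustration, not the benchmark's actual code:

import json
import random

random.seed(0)  # fixed seed so the same 200 prompts are drawn on every run
with open("sharegpt.json") as f:  # assumed local copy of the ShareGPT dataset
    conversations = json.load(f)
sampled = random.sample(conversations, 200)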

Serving tests

  • Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
  • Output length: the output length corresponding to each of these 200 prompts.
  • Batch size: dynamically determined by vllm and the arrival pattern of the requests.
  • Average QPS (queries per second): 1, 4, 16 and inf. QPS = inf means all requests arrive at once; for the other QPS values, the arrival time of each request is drawn from a Poisson process with a fixed random seed (see the sketch after this list).
  • Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  • We also added a speculative decoding test for llama-3 70B under QPS 2.
  • Evaluation metrics: throughput, TTFT (time to first token; mean, median and p99), TPOT (time per output token; mean, median and p99) and ITL (inter-token latency; mean, median and p99).
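
A minimal sketch of how such a Poisson arrival schedule can be generated, assuming numpy; the actual benchmark script may implement this differently:

import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0):
    # Inter-arrival gaps of a Poisson process are exponentially distributed
    # with mean 1/qps; a fixed seed keeps the schedule reproducible across runs.
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return gaps.cumsum()

# Arrival offsets (in seconds) for 200 requests at an average of 4 QPS.
print(poisson_arrival_times(200, 4.0)[:5])
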
| Test name | GPU | # of req. | Tput (req/s) | Output Tput (tok/s) | Total Tput (tok/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | Median TPOT (ms) | P99 TPOT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_01 | Standard_NC96ads_A100_v4 x 1 | 200 | 0.927978 | 198.508 | 396.441 | 154.771 | 128.388 | 376.548 | 44.9394 | 44.6347 | 58.0394 | 44.8516 | 43.878 | 131.941 |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_04 | Standard_NC96ads_A100_v4 x 1 | 200 | 2.4957 | 532.931 | 1065.25 | 246.338 | 219.599 | 606.801 | 73.9558 | 73.8304 | 129.354 | 69.5497 | 55.5315 | 288.602 |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_16 | Standard_NC96ads_A100_v4 x 1 | 200 | 3.3252 | 710.13 | 1419.38 | 2461.81 | 2882.55 | 3660.78 | 199.375 | 117.275 | 704.745 | 96.2799 | 70.8682 | 705.108 |
| serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_inf | Standard_NC96ads_A100_v4 x 1 | 200 | 3.3334 | 710.349 | 1421.35 | 8390.68 | 8344.11 | 15300.3 | 205.38 | 116.908 | 700.741 | 96.4424 | 70.6381 | 701.537 |

json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format. You can load the benchmarking tables into pandas dataframes as follows:

import json
import pandas as pd

benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])

The json string for all benchmarking tables:

{"latency": {}, "throughput": {}, "serving": {"Test name": {"0": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_04", "1": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_01", "2": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_inf", "3": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_16"}, "GPU": {"0": "Standard_NC96ads_A100_v4 x 1", "1": "Standard_NC96ads_A100_v4 x 1", "2": "Standard_NC96ads_A100_v4 x 1", "3": "Standard_NC96ads_A100_v4 x 1"}, "# of req.": {"0": 200, "1": 200, "2": 200, "3": 200}, "Tput (req/s)": {"0": 2.495698100981504, "1": 0.9279779228604551, "2": 3.3334046649708653, "3": 3.325199159049069}, "Output Tput (tok/s)": {"0": 532.9313724835904, "1": 198.50839736869426, "2": 710.3485341052914, "3": 710.1295324065192}, "Total Tput (tok/s)": {"0": 1065.2512989324402, "1": 396.441448425215, "2": 1421.3470821202523, "3": 1419.3778870358904}, "Mean TTFT (ms)": {"0": 246.33752696000101, "1": 154.77130150999983, "2": 8390.680319614992, "3": 2461.8073529899934}, "Median TTFT (ms)": {"0": 219.59910649991343, "1": 128.38760200008892, "2": 8344.11360350009, "3": 2882.5457675000052}, "P99 TTFT (ms)": {"0": 606.8011953101309, "1": 376.5480166300789, "2": 15300.305423499995, "3": 3660.7770861201493}, "Mean TPOT (ms)": {"0": 73.95579696666762, "1": 44.93937090850136, "2": 205.38025907747684, "3": 199.3748304130239}, "Median TPOT (ms)": {"0": 73.8304143313294, "1": 44.63469464226745, "2": 116.90824614373891, "3": 117.27497017068639}, "P99 TPOT (ms)": {"0": 129.3536884687555, "1": 58.03939859885578, "2": 700.7413872749946, "3": 704.7451063547737}, "Mean ITL (ms)": {"0": 69.54965636861299, "1": 44.85155391470774, "2": 96.44242881350782, "3": 96.27992907936581}, "Median ITL (ms)": {"0": 55.53150799994455, "1": 43.878026000129466, "2": 70.6381174999251, "3": 70.86817749996044}, "P99 ITL (ms)": {"0": 288.60196543984557, "1": 131.9412263799859, "2": 701.5374677201612, "3": 705.1076254899294}}}

You can also check the raw experiment data in the Artifact tab of the Buildkite page.

# Reproduce the setup: clone the repo and check out the exact commit used for this benchmark
git clone https://github.com/surajssd/llm-k8s
cd llm-k8s
git checkout 0a6d436f27034f6f1bf2c2bd12c2da334a65cf7b
# Deploy a single node to host the llama 3.3 70b instruct model
# Steps: https://github.com/surajssd/llm-k8s/blob/0a6d436f27034f6f1bf2c2bd12c2da334a65cf7b/configs/llama-3-3-70b-instruct/one-node-four-gpus
source .env
export VM_SIZE="Standard_NC96ads_A100_v4"
export GPU_NODE_COUNT=1
./scripts/deploy-aks.sh
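# Put your Hugging Face access token here; it is needed to pull the Llama 3.3 model weights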
export HF_TOKEN=""
kubectl create secret generic hf-token-secret --from-literal token=${HF_TOKEN}
kubectl apply -f configs/llama-3-3-70b-instruct/one-node-four-gpus/k8s/
kubectl port-forward svc/llama-3-3-70b-instruct 8000
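# Smoke-test the OpenAI-compatible chat completions endpoint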
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "meta-llama/Llama-3.3-70B-Instruct",
    "messages": [
      {
        "role": "user",
        "content": "Explain the origin of Llama the animal?"
      }
    ]
  }' | jq
# Benchmark
# Steps: https://github.com/surajssd/llm-k8s/blob/0a6d436f27034f6f1bf2c2bd12c2da334a65cf7b/benchmark/vllm_upstream/README.md
kubectl create ns vllm-benchmark
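# Point the benchmark runner at the serving endpoint and record the deployment parameters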
kubectl -n vllm-benchmark create configmap benchmark-runner \
  --from-literal=TEST_SERVER_URL="http://llama-3-3-70b-instruct.default:8000" \
  --from-literal=MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" \
  --from-literal=TENSOR_PARALLEL_SIZE=4 \
  --from-literal=PIPELINE_PARALLEL_SIZE="${GPU_NODE_COUNT}" \
  --from-literal=GPU_VM_SKU="${VM_SIZE}"
kubectl -n vllm-benchmark create secret generic hf-token-secret --from-literal token=${HF_TOKEN}
kubectl apply -f benchmark/vllm_upstream/k8s/
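# Find the running benchmark-runner pod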
POD_NAME=$(kubectl -n vllm-benchmark \
  get pods \
  -l app=benchmark-runner \
  --field-selector=status.phase=Running \
  -o jsonpath='{.items[].metadata.name}')
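# Run the upstream vLLM serving benchmark inside the pod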
kubectl -n vllm-benchmark \
  exec -it $POD_NAME \
  -- bash /root/scripts/run_vllm_upstream_benchmark.sh
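# Locate the results tarball produced by the benchmark run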
RESULTS_FILE=$(kubectl -n vllm-benchmark \
  exec -it $POD_NAME \
  -- bash -c "ls /root/results*.tar.gz" | tr -d '\r')
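# Copy the results tarball from the pod to the local machine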
kubectl -n vllm-benchmark \
  cp "${POD_NAME}:${RESULTS_FILE}" "./$(basename ${RESULTS_FILE})"
{
  "client_command": "python3 benchmark_serving.py --save-result --base-url http://llama-3-3-70b-instruct.default:8000 --result-dir /root/vllm/.buildkite/results/ --result-filename serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_01.json --request-rate 01 --model=meta-llama/Llama-3.3-70B-Instruct --backend=vllm --dataset-name=sharegpt --dataset-path=/root/sharegpt.json --num-prompts=200",
  "gpu_type": "Standard_NC96ads_A100_v4 x 1"
}