$ ENV_METADATA_GPU="4xNVIDIA_L40S" \
./e2e-bench-control.sh --4xgpu-minikube --model meta-llama/Llama-3.2-3B-Instruct
🌟 LLM Deployment and Benchmark Orchestrator 🌟
-------------------------------------------------
--- Configuration Summary ---
Minikube Start Args (Hardcoded): --driver docker --container-runtime docker --gpus all --memory no-limit --cpus no-limit
LLMD Installer Script (Hardcoded): ./llmd-installer.sh
Test Request Script (Hardcoded): ./test-request.sh (Args: --minikube, Retry: 30s)
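For reference, the hardcoded Minikube arguments above amount to a cluster start along these lines (a sketch of what the orchestrator drives internally; the nvidia-smi check simply confirms the 4x L40S GPUs advertised in ENV_METADATA_GPU are visible on the host):

nvidia-smi -L                               # expect four NVIDIA L40S entries
minikube start --driver docker --container-runtime docker --gpus all \
    --memory no-limit --cpus no-limit
minikube kubectl -- get nodes               # sanity check that the node is Ready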
ubuntu@ip-172-31-16-33:~/secret-llm-d-deployer/project$ kubectl logs -n kgateway-system kgateway-7c58ddd989-nw5wc -c kgateway --previous --tail=200
{"level":"info","ts":"2025-05-17T18:01:08.979Z","caller":"probes/probes.go:57","msg":"probe server starting at :8765 listening for /healthz"}
{"level":"info","ts":"2025-05-17T18:01:08.979Z","caller":"setup/setup.go:69","msg":"got settings from env: {DnsLookupFamily:V4_PREFERRED EnableIstioIntegration:false EnableIstioAutoMtls:false IstioNamespace:istio-system XdsServiceName:kgateway XdsServicePort:9977 UseRustFormations:false EnableInferExt:true InferExtAutoProvision:false DefaultImageRegistry:cr.kgateway.dev/kgateway-dev DefaultImageTag:v2.0.0 DefaultImagePullPolicy:IfNotPresent}"}
{"level":"info","ts":"2025-05-17T18:01:08.980Z","logger":"k8s","caller":"setup/setup.go:110","msg":"starting kgateway"}
{"level":"info","ts":"2025-05-17T18:01:08.984Z","logger":"k8s","caller":"setup/setup.go:117","msg":"creating krt collections"}
{"level":"info","ts":"2025-05-17T18:01
#!/usr/bin/env bash
# -*- indent-tabs-mode: nil; tab-width: 4; sh-indentation: 4; -*-
set -euo pipefail
### GLOBALS ###
NAMESPACE="llm-d"
PROVISION_MINIKUBE=false
PROVISION_MINIKUBE_GPU=false
STORAGE_SIZE="15Gi"
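The excerpt stops at the globals. A minimal sketch of how command-line flags could toggle them is below; the flag names and parsing are hypothetical, not the installer's actual code:

while [[ $# -gt 0 ]]; do
    case "$1" in
        --minikube)      PROVISION_MINIKUBE=true ;;      # provision a CPU-only Minikube cluster
        --minikube-gpu)  PROVISION_MINIKUBE_GPU=true ;;   # provision Minikube with GPU passthrough
        --namespace)     NAMESPACE="$2"; shift ;;         # override the default llm-d namespace
        --storage-size)  STORAGE_SIZE="$2"; shift ;;      # PVC size for model storage
        *) echo "Unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done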
#!/usr/bin/env python3
"""
transcribe_video_to_srt.py
Transcribe a video or audio file into SRT subtitles using OpenAI Whisper.
Dependencies & Install:
------------------------------------
# 1. Create & activate a virtual environment (optional but recommended):
# python3 -m venv venv
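The docstring's install steps are truncated here; assuming the script uses the openai-whisper package (which needs ffmpeg to decode media), the environment setup would look roughly like this:

python3 -m venv venv
source venv/bin/activate
pip install -U openai-whisper        # pulls in torch; a CUDA-enabled torch speeds up transcription
sudo apt-get -y install ffmpeg       # Whisper shells out to ffmpeg for audio/video decoding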
$ helm template llm-d . --debug --namespace default --values values.yaml
install.go:225: 2025-05-07 17:20:53.000638786 +0000 UTC m=+0.031145623 [debug] Original chart version: ""
install.go:242: 2025-05-07 17:20:53.000679067 +0000 UTC m=+0.031185914 [debug] CHART PATH: /home/ubuntu/tmp/llm-d-deployer/charts/llm-d
--- | |
# Source: llm-d/charts/redis/templates/master/serviceaccount.yaml | |
apiVersion: v1 | |
kind: ServiceAccount | |
automountServiceAccountToken: false
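When the full rendered output is too long to scan, helm template can be limited to a single manifest with --show-only, e.g. for the Redis ServiceAccount shown above:

helm template llm-d . --namespace default --values values.yaml \
    --show-only charts/redis/templates/master/serviceaccount.yaml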
Events: <none>
ubuntu@ip-172-31-46-4:~/v2/vllm-d-deployer$ kubectl describe inferencepools.inference.networking.x-k8s.io llama-3-2-3b-instruct -n llm-d^C | |
ubuntu@ip-172-31-46-4:~/v2/vllm-d-deployer$ k logs llama-3.2-3b-instruct-epp-65c87574f5-wtxrc
{"level":"info","ts":"2025-05-03T17:09:29Z","logger":"setup","caller":"epp/main.go:135","msg":"Flags processed","flags":{"DestinationEndpointHintMetadataNamespace":"envoy.lb","certPath":"","destinationEndpointHintKey":"x-gateway-destination-endpoint","grpcHealthPort":9003,"grpcPort":9002,"kubeconfig":"","kvCacheUsagePercentageMetric":"vllm:gpu_cache_usage_perc","loraInfoMetric":"vllm:lora_requests_info","metricsPort":9090,"poolName":"llama-3.2-3b-instruct","poolNamespace":"llm-d","refreshMetricsInterval":50000000,"refreshPrometheusMetricsInterval":5000000000,"secureServing":true,"totalQueuedRequestsMetric":"vllm:num_requests_waiting","v":4,"zap-devel":true,"zap-encoder":{},"zap-log-level":{},"zap-stacktrace-level":{},"zap-time-encoding":{}}} |
$ k logs llama-3.2-3b-instruct-decode-6dcb767b75-4c8c8 -c vllm
INFO 05-03 16:00:29 [__init__.py:239] Automatically detected platform cuda.
INFO 05-03 16:00:32 [api_server.py:1042] vLLM API server version 0.1.dev1+g9b70e2b
INFO 05-03 16:00:32 [api_server.py:1043] args: Namespace(host=None, port=8200, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='meta-llama/Llama-3.2-3B-Instruct', task='auto', tokenizer=None, tokenizer_mode='auto', trust_remote_code=Fals
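With the vLLM server listening on port 8200 (from the args above), a direct smoke test against its OpenAI-compatible API, bypassing the gateway entirely, would look roughly like this (pod name from the session above; assumes the current context namespace matches the deployment):

kubectl port-forward llama-3.2-3b-instruct-decode-6dcb767b75-4c8c8 8200:8200 &
curl -s http://localhost:8200/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-3.2-3B-Instruct",
         "messages": [{"role": "user", "content": "Say hello in one sentence."}],
         "max_tokens": 32}'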
sudo apt-get update
sudo apt-get -y install jq gh
sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq
sudo chmod +x /usr/local/bin/yq
sudo apt-get update
sudo apt-get -y install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
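The snippet ends after fetching Docker's GPG key; the usual remaining steps from Docker's Ubuntu install documentation are:

sudo chmod a+r /etc/apt/keyrings/docker.asc
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] \
  https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo usermod -aG docker "$USER"   # log out and back in (or run newgrp docker) for the group change to apply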
fedora@ip-172-31-37-101:~$ git clone https://github.com/neuralmagic/gateway-api-inference-extension.git
cd gateway-api-inference-extension
Cloning into 'gateway-api-inference-extension'...
remote: Enumerating objects: 5757, done.
remote: Counting objects: 100% (1395/1395), done.
remote: Compressing objects: 100% (318/318), done.
remote: Total 5757 (delta 1188), reused 1078 (delta 1077), pack-reused 4362 (from 3)
Receiving objects: 100% (5757/5757), 7.04 MiB | 38.35 MiB/s, done.
Resolving deltas: 100% (3112/3112), done.