Last active
May 4, 2026 21:27
-
-
Save davivcgarcia/907e1bb1e5d1eeef4be26013e518d5fb to your computer and use it in GitHub Desktop.
Partner Tutorial: Production LLM Inference on Kubernetes with vLLM (Extra Lab) @ Data Maker Fest 2026
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # | |
| # Deploys Google's Gemma 4 26B-A4B (Mixture-of-Experts, vision-capable) on | |
| # vLLM running on a single g6e.2xlarge node (1x NVIDIA L40S, 48 GB, SM89), | |
| # and exposes it through an Open WebUI frontend fronted by an AWS NLB so | |
| # workshop participants can chat with the model (including image uploads) | |
| # from a browser. | |
| # | |
| # Prereqs: https://genai.eksworkshop.com | |
| # | |
| # Copyright 2026 Davi Garcia | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # Create manifests for running Gemma 4 26B-A4B (MoE) on vLLM on a g6e.2xlarge (1x L40S, 48 GB, SM89). | |
| cat <<EOF > vllm-gemma.yaml | |
| # HF token for pulling gated Gemma weights at pod startup. | |
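| # Set your own token below to avoid HF throttling of anonymous downloads. | |
| # Keep the Secret itself: the Deployment reads HF_TOKEN from it through a | |
| # non-optional secretKeyRef, so the pod cannot start if the Secret is missing. | |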
| apiVersion: v1 | |
| kind: Secret | |
| metadata: | |
| name: hf-token | |
| namespace: default | |
| type: Opaque | |
| stringData: | |
| HF_TOKEN: <YOUR TOKEN HERE> | |
| --- | |
| # ClusterIP fronts the OpenAI-compatible API; expose externally via Ingress/LB as needed. | |
| apiVersion: v1 | |
| kind: Service | |
| metadata: | |
| name: vllm-gemma | |
| namespace: default | |
| spec: | |
| type: ClusterIP | |
| selector: | |
| app: vllm-gemma | |
| ports: | |
| - name: http | |
| port: 8000 | |
| targetPort: 8000 | |
| --- | |
| apiVersion: apps/v1 | |
| kind: Deployment | |
| metadata: | |
| name: vllm-gemma | |
| namespace: default | |
| labels: | |
| app: vllm-gemma | |
| spec: | |
| replicas: 1 | |
| # One GPU per node means two pods can't coexist during a rollout. | |
| strategy: | |
| type: Recreate | |
| selector: | |
| matchLabels: | |
| app: vllm-gemma | |
| template: | |
| metadata: | |
| labels: | |
| app: vllm-gemma | |
| spec: | |
| # g6e.2xlarge = 1x NVIDIA L40S (48 GB, Ada Lovelace SM89, native FP8). | |
| nodeSelector: | |
| node.kubernetes.io/instance-type: g6e.2xlarge | |
| tolerations: | |
| - key: nvidia.com/gpu | |
| operator: Exists | |
| effect: NoSchedule | |
| # Give vLLM time to drain in-flight requests on shutdown. | |
| terminationGracePeriodSeconds: 60 | |
| containers: | |
| - name: vllm | |
| # AWS Deep Learning Container: vLLM 0.20 on CUDA/py312, EC2 flavor. | |
| # SOCI = Seekable OCI: lazy-load index baked in, so containerd with | |
| # the SOCI snapshotter starts the pod before the full image is pulled. | |
| # Falls back to a normal pull on nodes without the snapshotter. | |
| image: public.ecr.aws/deep-learning-containers/vllm:0.20-gpu-py312-ec2-soci | |
| imagePullPolicy: IfNotPresent | |
| args: | |
| # Model: 25.2B total / 3.8B active MoE (8/128 experts + 1 shared), BF16 native. | |
| - "--model=google/gemma-4-26b-a4b-it" | |
| # Explicit dtype avoids 'auto' picking fp16 on some configs. | |
| - "--dtype=bfloat16" | |
| # 25.2B x BF16 ~ 50 GB won't fit on 48 GB — FP8 weights bring it into range. | |
| # Alternative for better accuracy/stability: pre-quantized checkpoint, e.g. | |
| # --model=RedHatAI/gemma-4-26b-a4b-it-FP8-dynamic | |
| # --quantization=compressed-tensors | |
| - "--quantization=fp8" | |
| # Must match the weight FP8 format (E4M3): vLLM rejects fp8_e5m2 KV | |
| # when the model is loaded via the fp8 quant path. | |
| - "--kv-cache-dtype=fp8_e4m3" | |
| # Single GPU: no sharding, no expert parallelism. | |
| - "--tensor-parallel-size=1" | |
| # Model supports 256K; 64K is comfortable on 48 GB with the 1024-token | |
| # sliding window keeping per-layer KV bounded. | |
| - "--max-model-len=65536" | |
| # Scheduler token budget per step. Higher = more throughput, higher TTFT. | |
| - "--max-num-batched-tokens=16384" | |
| # Concurrent sequences the engine will interleave; sized to keep an | |
| # L40S busy given 3.8B active params per token. | |
| - "--max-num-seqs=64" | |
| # Fraction of VRAM for weights + KV cache + activations; ~4 GB left for | |
| # CUDA / driver / nccl overhead. | |
| - "--gpu-memory-utilization=0.92" | |
| # Reuse KV for repeated prompt prefixes (system prompts, tool schemas). | |
| - "--enable-prefix-caching" | |
| # Interleave prefill chunks with decode steps to smooth TTFT under load. | |
| - "--enable-chunked-prefill" | |
| # Optimization level 3 enables full torch.compile + CUDA graphs. | |
| - "-O3" | |
| # Vision enabled: up to 2 images per prompt. The ~550M ViT encoder | |
| # stays in BF16 — do NOT enable --mm-encoder-attn-dtype=fp8 here, | |
| # FP8 ViT attention requires SM90 (Hopper) and L40S is SM89. | |
| - "--limit-mm-per-prompt.image=2" | |
| # Enable for function calling (uses vLLM's dedicated Gemma 4 parser): | |
| # - "--enable-auto-tool-choice" | |
| # - "--tool-call-parser=gemma4" | |
| - "--host=0.0.0.0" | |
| - "--port=8000" | |
| env: | |
| - name: HF_TOKEN | |
| valueFrom: | |
| secretKeyRef: | |
| name: hf-token | |
| key: HF_TOKEN | |
| # Keep the HF cache on our writable volume, not the container's rootfs. | |
| - name: HF_HOME | |
| value: /cache/huggingface | |
| # Parallel chunked downloads; big speedup on the 25-50 GB cold pull. | |
| - name: HF_HUB_ENABLE_HF_TRANSFER | |
| value: "1" | |
| # Avoid fork() issues when the parent has already initialized CUDA. | |
| - name: VLLM_WORKER_MULTIPROC_METHOD | |
| value: "spawn" | |
| # Pin the V1 engine explicitly to avoid surprise regressions. | |
| - name: VLLM_USE_V1 | |
| value: "1" | |
| # Let vLLM auto-select the attention backend on L40S unless a specific | |
| # issue forces a choice (both FLASHINFER and FLASH_ATTN work on SM89). | |
| # - name: VLLM_ATTENTION_BACKEND | |
| # value: "FLASH_ATTN" | |
| ports: | |
| - name: http | |
| containerPort: 8000 | |
| resources: | |
| # HF downloads + weight mmap peak well above steady-state during cold start. | |
| limits: | |
| nvidia.com/gpu: 1 | |
| cpu: "8" | |
| memory: 40Gi | |
| requests: | |
| nvidia.com/gpu: 1 | |
| cpu: "4" | |
| memory: 32Gi | |
| volumeMounts: | |
| - name: hf-cache | |
| mountPath: /cache/huggingface | |
| - name: shm | |
| mountPath: /dev/shm | |
| # 120 * 15s = 30 min cold-start budget (weight download + torch.compile). | |
| startupProbe: | |
| httpGet: | |
| path: /health | |
| port: http | |
| periodSeconds: 15 | |
| failureThreshold: 120 | |
| readinessProbe: | |
| httpGet: | |
| path: /health | |
| port: http | |
| periodSeconds: 10 | |
| livenessProbe: | |
| httpGet: | |
| path: /health | |
| port: http | |
| periodSeconds: 30 | |
| failureThreshold: 5 | |
| volumes: | |
| # emptyDir means a re-download on every pod reschedule (~25-50 GB). For | |
| # production, back this with a PVC or hostPath on the 450 GB NVMe | |
| # instance store so weights persist across restarts. | |
| - name: hf-cache | |
| emptyDir: | |
| sizeLimit: 60Gi | |
| # Container /dev/shm defaults to 64 MiB — too small for vLLM's | |
| # multiprocess workers. Memory-backed emptyDir gives real RAM. | |
| - name: shm | |
| emptyDir: | |
| medium: Memory | |
| sizeLimit: 8Gi | |
| EOF | |
| # Apply the vLLM manifest generated above | |
| kubectl apply --filename vllm-gemma.yaml | |
| # Watch pod status across all namespaces until the vLLM Pod is Running (Ctrl+C to stop) | |
| kubectl get pods --all-namespaces --watch | |
| # Follow the vLLM logs to confirm Gemma is downloaded from Hugging Face and loaded into VRAM | |
| kubectl logs --selector app=vllm-gemma --follow | |
| # Option A — direct API access for cURL / SDK testing | |
| kubectl port-forward service/vllm-gemma 8000:8000 | |
| # From a second terminal, send a chat completion request with cURL. | |
| # Sampling parameters follow the model card: temperature=1.0, top_p=0.95, top_k=64. | |
| curl --header "Content-Type: application/json" \ | |
|   --data '{"model":"google/gemma-4-26b-a4b-it","messages":[{"role":"user","content":"Why Kubernetes is amazing?"}],"max_tokens":800,"temperature":1.0,"top_p":0.95,"top_k":64}' \ | |
|   http://localhost:8000/v1/chat/completions | |
| # Option B — Open WebUI exposed via an AWS NLB (public URL). | |
| # Create the manifest to run Open-WebUI | |
| cat <<EOF > open-webui-gemma.yaml | |
| --- | |
| # Open WebUI — browser-based chat UI that speaks the OpenAI API. It will | |
| # connect to the vLLM service above and let you chat (with image uploads, | |
| # since vision is enabled on the backend). | |
| # Data (user accounts, chat history, settings) is on emptyDir, so it's wiped | |
| # on pod reschedule — fine for this workshop, swap to a PVC for persistence. | |
| apiVersion: v1 | |
| kind: Service | |
| metadata: | |
| name: open-webui-gemma | |
| namespace: default | |
| # AWS Load Balancer Controller annotations: provisions an internet-facing | |
| # NLB that forwards directly to pod IPs (VPC CNI / ip target type). | |
| # Requires the aws-load-balancer-controller to be installed in the cluster. | |
| annotations: | |
| service.beta.kubernetes.io/aws-load-balancer-type: "external" | |
| service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip" | |
| service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" | |
| spec: | |
| type: LoadBalancer | |
| selector: | |
| app: open-webui-gemma | |
| ports: | |
| - name: http | |
| port: 80 | |
| targetPort: 8080 | |
| --- | |
| apiVersion: apps/v1 | |
| kind: Deployment | |
| metadata: | |
| name: open-webui-gemma | |
| namespace: default | |
| labels: | |
| app: open-webui-gemma | |
| spec: | |
| replicas: 1 | |
| strategy: | |
| type: Recreate | |
| selector: | |
| matchLabels: | |
| app: open-webui-gemma | |
| template: | |
| metadata: | |
| labels: | |
| app: open-webui-gemma | |
| spec: | |
| containers: | |
| - name: open-webui | |
| image: ghcr.io/open-webui/open-webui:main | |
| imagePullPolicy: IfNotPresent | |
| ports: | |
| - name: http | |
| containerPort: 8080 | |
| env: | |
| # Point Open WebUI at the in-cluster vLLM service. OpenAI-compatible | |
| # endpoint — the base URL must include the /v1 prefix; Open WebUI does not add it. | |
| - name: OPENAI_API_BASE_URL | |
| value: "http://vllm-gemma.default.svc.cluster.local:8000/v1" | |
| # vLLM doesn't enforce a key by default; set any non-empty value to | |
| # satisfy Open WebUI's client. | |
| - name: OPENAI_API_KEY | |
| value: "not-needed" | |
| # Disable Ollama integration — we're only using the OpenAI backend. | |
| - name: ENABLE_OLLAMA_API | |
| value: "false" | |
| # Reuse the vLLM token so Open WebUI's embedding / reranker / STT | |
| # downloads go out authenticated — avoids anonymous HF rate limits. | |
| - name: HF_TOKEN | |
| valueFrom: | |
| secretKeyRef: | |
| name: hf-token | |
| key: HF_TOKEN | |
| # Disable login entirely — anyone with the URL is dropped straight | |
| # into the chat UI under a single shared account. Fine for a | |
| # workshop behind a short-lived NLB; DO NOT use in production. | |
| - name: WEBUI_AUTH | |
| value: "false" | |
| resources: | |
| limits: | |
| cpu: "2" | |
| memory: 4Gi | |
| requests: | |
| cpu: "500m" | |
| memory: 1Gi | |
| volumeMounts: | |
| - name: data | |
| mountPath: /app/backend/data | |
| readinessProbe: | |
| httpGet: | |
| path: /health | |
| port: http | |
| periodSeconds: 10 | |
| livenessProbe: | |
| httpGet: | |
| path: /health | |
| port: http | |
| periodSeconds: 30 | |
| failureThreshold: 5 | |
| volumes: | |
| - name: data | |
| emptyDir: | |
| sizeLimit: 10Gi | |
| EOF | |
| # Apply the Open WebUI manifest generated above | |
| kubectl apply -f open-webui-gemma.yaml | |
| # Wait for the AWS Load Balancer Controller to write the NLB hostname onto the | |
| # Service status (kubectl watches the field; no polling loop needed). | |
| kubectl wait --for=jsonpath='{.status.loadBalancer.ingress[0].hostname}' \ | |
|   svc/open-webui-gemma -n default --timeout=5m | |
| OUI_HOST=$(kubectl get svc open-webui-gemma -n default \ | |
|   -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') | |
| # Resolve the NLB ARN from its DNS name. | |
| OUI_ARN=$(aws elbv2 describe-load-balancers \ | |
|   --query "LoadBalancers[?DNSName=='$OUI_HOST'].LoadBalancerArn" \ | |
|   --output text) | |
| # Let the AWS waiter block until the load balancer state is 'active'. If the | |
| # ARN lookup came back empty (e.g. wrong AWS region or credentials), warn | |
| # instead of handing the waiter an empty ARN. No 'exit' here on purpose: these | |
| # commands are meant to be pasted into an interactive shell. | |
| if [ -n "$OUI_ARN" ] && [ "$OUI_ARN" != "None" ]; then | |
|   aws elbv2 wait load-balancer-available --load-balancer-arns "$OUI_ARN" | |
| else | |
|   echo "WARNING: no NLB found for '$OUI_HOST'; skipping the wait for 'active' state" >&2 | |
| fi | |
| # Print the URL of the UI | |
| echo "Open WebUI URL: http://$OUI_HOST" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment