davivcgarcia · May 4, 2026 21:27
diff --git a/extra-lab.sh b/extra-lab.sh
 #!/usr/bin/env bash
 #
 # Deploys Google's Gemma 4 26B-A4B (Mixture-of-Experts, vision-capable) on
 # vLLM running on a single g6e.2xlarge node (1x NVIDIA L40S, 48 GB, SM89),
 # and exposes it through an Open WebUI frontend fronted by an AWS NLB so
 # workshop participants can chat with the model (including image uploads)
 # from a browser.
 #
 # Prereqs: https://genai.eksworkshop.com
 # 
 # Copyright 2026 Davi Garcia
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # Create manifests for running Gemma 4 26B-A4B (MoE) on vLLM on a g6e.2xlarge (1x L40S, 48 GB, SM89).
 # The heredoc delimiter is quoted so the shell writes the manifest verbatim
 # (no parameter, command, or backslash expansion inside the YAML body).
 cat <<'EOF' > vllm-gemma.yaml
 # HF token used to pull the Gemma weights at pod startup. Replace the
 # placeholder below with your own token before applying.
 # The author notes the pull also works without a token, at the risk of being
 # throttled by HF. The vLLM Deployment in this file reads this Secret through
 # a secretKeyRef, so review that reference if you decide to drop the Secret.
 apiVersion: v1
 kind: Secret
 metadata:
  name: hf-token
  namespace: default
 type: Opaque
 stringData:
  HF_TOKEN: "<YOUR TOKEN HERE>"
 ---
 # In-cluster entry point for the OpenAI-compatible API served on port 8000.
 # The type is ClusterIP, so reach it with a port-forward or put an Ingress/LB
 # in front of it when external access is needed.
 kind: Service
 apiVersion: v1
 metadata:
  namespace: default
  name: vllm-gemma
 spec:
  type: ClusterIP
  ports:
    - name: http
      targetPort: 8000
      port: 8000
  selector:
    app: vllm-gemma
 ---
 # vLLM serving Gemma 4 26B-A4B: a single replica pinned to a g6e.2xlarge node,
 # exposing the OpenAI-compatible API on container port 8000.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: vllm-gemma
  namespace: default
  labels:
    app: vllm-gemma
 spec:
  replicas: 1
  # One GPU per node means two pods can't coexist during a rollout.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: vllm-gemma
  template:
    metadata:
      labels:
        app: vllm-gemma
    spec:
      # g6e.2xlarge = 1x NVIDIA L40S (48 GB, Ada Lovelace SM89, native FP8).
      nodeSelector:
        node.kubernetes.io/instance-type: g6e.2xlarge
      # Allow scheduling onto GPU nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Give vLLM time to drain in-flight requests on shutdown.
      terminationGracePeriodSeconds: 60
      containers:
        - name: vllm
          # AWS Deep Learning Container: vLLM 0.20 on CUDA/py312, EC2 flavor.
          # SOCI = Seekable OCI: lazy-load index baked in, so containerd with
          # the SOCI snapshotter starts the pod before the full image is pulled.
          # Falls back to a normal pull on nodes without the snapshotter.
          image: public.ecr.aws/deep-learning-containers/vllm:0.20-gpu-py312-ec2-soci
          imagePullPolicy: IfNotPresent
          args:
            # Model: 25.2B total / 3.8B active MoE (8/128 experts + 1 shared), BF16 native.
            - "--model=google/gemma-4-26b-a4b-it"

            # Explicit dtype avoids 'auto' picking fp16 on some configs.
            - "--dtype=bfloat16"

            # 25.2B x BF16 ~ 50 GB won't fit on 48 GB — FP8 weights bring it into range.
            # Alternative for better accuracy/stability: pre-quantized checkpoint, e.g.
            #   --model=RedHatAI/gemma-4-26b-a4b-it-FP8-dynamic
            #   --quantization=compressed-tensors
            - "--quantization=fp8"

            # Must match the weight FP8 format (E4M3): vLLM rejects fp8_e5m2 KV
            # when the model is loaded via the fp8 quant path.
            - "--kv-cache-dtype=fp8_e4m3"

            # Single GPU: no sharding, no expert parallelism.
            - "--tensor-parallel-size=1"

            # Model supports 256K; 64K is comfortable on 48 GB with the 1024-token
            # sliding window keeping per-layer KV bounded.
            - "--max-model-len=65536"

            # Scheduler token budget per step. Higher = more throughput, higher TTFT.
            - "--max-num-batched-tokens=16384"

            # Concurrent sequences the engine will interleave; sized to keep an
            # L40S busy given 3.8B active params per token.
            - "--max-num-seqs=64"

            # Fraction of VRAM for weights + KV cache + activations; ~4 GB left for
            # CUDA / driver / nccl overhead.
            - "--gpu-memory-utilization=0.92"

            # Reuse KV for repeated prompt prefixes (system prompts, tool schemas).
            - "--enable-prefix-caching"

            # Interleave prefill chunks with decode steps to smooth TTFT under load.
            - "--enable-chunked-prefill"

            # Optimization level 3 enables full torch.compile + CUDA graphs.
            - "-O3"

            # Vision enabled: up to 2 images per prompt. The ~550M ViT encoder
            # stays in BF16 — do NOT enable --mm-encoder-attn-dtype=fp8 here,
            # FP8 ViT attention requires SM90 (Hopper) and L40S is SM89.
            - "--limit-mm-per-prompt.image=2"

            # Enable for function calling (uses vLLM's dedicated Gemma 4 parser):
            # - "--enable-auto-tool-choice"
            # - "--tool-call-parser=gemma4"

            # Listen on all interfaces so the Service and the probes can reach the API.
            - "--host=0.0.0.0"
            - "--port=8000"
          env:
            # Token comes from the hf-token Secret. optional: true lets the pod
            # start even when the Secret (or its HF_TOKEN key) is absent; the
            # variable is then left unset and the download is anonymous.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: HF_TOKEN
                  optional: true
            # Keep the HF cache on our writable volume, not the container's rootfs.
            - name: HF_HOME
              value: /cache/huggingface
            # Parallel chunked downloads; big speedup on the 25-50 GB cold pull.
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
            # Avoid fork() issues when the parent has already initialized CUDA.
            - name: VLLM_WORKER_MULTIPROC_METHOD
              value: "spawn"
            # Pin the V1 engine explicitly to avoid surprise regressions.
            - name: VLLM_USE_V1
              value: "1"
            # Let vLLM auto-select the attention backend on L40S unless a specific
            # issue forces a choice (both FLASHINFER and FLASH_ATTN work on SM89).
            # - name: VLLM_ATTENTION_BACKEND
            #   value: "FLASH_ATTN"
          ports:
            - name: http
              containerPort: 8000
          resources:
            # HF downloads + weight mmap peak well above steady-state during cold start.
            limits:
              nvidia.com/gpu: 1
              cpu: "8"
              memory: 40Gi
            requests:
              nvidia.com/gpu: 1
              cpu: "4"
              memory: 32Gi
          volumeMounts:
            - name: hf-cache
              mountPath: /cache/huggingface
            - name: shm
              mountPath: /dev/shm
          # 120 * 15s = 30 min cold-start budget (weight download + torch.compile).
          startupProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 15
            failureThreshold: 120
          # Gate Service traffic on the /health endpoint once startup has passed.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
          # Restart the container after 5 consecutive failed checks (5 * 30s).
          livenessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 30
            failureThreshold: 5
      volumes:
        # emptyDir means a re-download on every pod reschedule (~25-50 GB). For
        # production, back this with a PVC or hostPath on the 450 GB NVMe
        # instance store so weights persist across restarts.
        - name: hf-cache
          emptyDir:
            sizeLimit: 60Gi
        # Container /dev/shm defaults to 64 MiB — too small for vLLM's
        # multiprocess workers. Memory-backed emptyDir gives real RAM.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 8Gi
 EOF

 # Apply the vLLM manifest (Secret, Service and Deployment) to the cluster
 kubectl apply --filename vllm-gemma.yaml

 # Watch pods in all namespaces to see the vLLM pod come up (Ctrl-C to stop)
 kubectl get pods --all-namespaces --watch

 # Follow the vLLM logs to confirm it downloads Gemma from Hugging Face and loads it into VRAM
 kubectl logs --selector=app=vllm-gemma --follow

 # Option A — direct API access for cURL / SDK testing
 kubectl port-forward service/vllm-gemma 8000:8000

 # While the port-forward is running, send a prompt with cURL from another terminal.
 # Sampling params follow the model card: temperature=1.0, top_p=0.95, top_k=64.
 curl --header "Content-Type: application/json" \
     --data '{"model":"google/gemma-4-26b-a4b-it","messages":[{"role":"user","content":"Why Kubernetes is amazing?"}],"max_tokens":800,"temperature":1.0,"top_p":0.95,"top_k":64}' \
     http://localhost:8000/v1/chat/completions

 # Option B — Open WebUI exposed via an AWS NLB (public URL).

 # Create the manifest to run Open-WebUI.
 # The heredoc delimiter is quoted so the shell writes the manifest verbatim
 # (no parameter, command, or backslash expansion inside the YAML body).
 cat <<'EOF' > open-webui-gemma.yaml
 ---
 # Open WebUI — browser-based chat UI that speaks the OpenAI API. It will
 # connect to the vLLM service above and let you chat (with image uploads,
 # since vision is enabled on the backend).
 # Data (user accounts, chat history, settings) is on emptyDir, so it's wiped
 # on pod reschedule — fine for this workshop, swap to a PVC for persistence.
 apiVersion: v1
 kind: Service
 metadata:
  name: open-webui-gemma
  namespace: default
  # AWS Load Balancer Controller annotations: provisions an internet-facing
  # NLB that forwards directly to pod IPs (VPC CNI / ip target type).
  # Requires the aws-load-balancer-controller to be installed in the cluster.
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: "external"
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip"
    service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing"
 spec:
  type: LoadBalancer
  selector:
    app: open-webui-gemma
  # The NLB listens on 80 and forwards to the container's port 8080.
  ports:
    - name: http
      port: 80
      targetPort: 8080
 ---
 # Open WebUI frontend: a single replica that talks to the in-cluster vLLM
 # Service over its OpenAI-compatible API.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: open-webui-gemma
  namespace: default
  labels:
    app: open-webui-gemma
 spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: open-webui-gemma
  template:
    metadata:
      labels:
        app: open-webui-gemma
    spec:
      containers:
        - name: open-webui
          # NOTE(review): ':main' is a moving tag. With IfNotPresent, a node that
          # already cached it keeps running the old image; pin a release tag if
          # the lab must be reproducible.
          image: ghcr.io/open-webui/open-webui:main
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 8080
          env:
            # Point Open WebUI at the in-cluster vLLM service. This is the full
            # OpenAI-compatible base URL, so it must include the /v1 prefix.
            - name: OPENAI_API_BASE_URL
              value: "http://vllm-gemma.default.svc.cluster.local:8000/v1"
            # vLLM doesn't enforce a key by default; set any non-empty value to
            # satisfy Open WebUI's client.
            - name: OPENAI_API_KEY
              value: "not-needed"
            # Disable Ollama integration — we're only using the OpenAI backend.
            - name: ENABLE_OLLAMA_API
              value: "false"
            # Reuse the vLLM token so Open WebUI's embedding / reranker / STT
            # downloads go out authenticated — avoids anonymous HF rate limits.
            # optional: true lets the pod start even when the hf-token Secret
            # (created by the vLLM manifest) is absent; HF_TOKEN is then unset.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: HF_TOKEN
                  optional: true
            # Disable login entirely — anyone with the URL is dropped straight
            # into the chat UI under a single shared account. Fine for a
            # workshop behind a short-lived NLB; DO NOT use in production.
            - name: WEBUI_AUTH
              value: "false"
          resources:
            limits:
              cpu: "2"
              memory: 4Gi
            requests:
              cpu: "500m"
              memory: 1Gi
          volumeMounts:
            - name: data
              mountPath: /app/backend/data
          # Gate NLB traffic on the /health endpoint.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
          # Restart the container after 5 consecutive failed checks (5 * 30s).
          livenessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 30
            failureThreshold: 5
      volumes:
        # Ephemeral storage for Open WebUI state; wiped when the pod is rescheduled.
        - name: data
          emptyDir:
            sizeLimit: 10Gi
 EOF

 # Deploy the manifest
 kubectl apply -f open-webui-gemma.yaml

 # Wait for the AWS Load Balancer Controller to write the NLB hostname onto the
 # Service status (kubectl watches the field; no polling loop needed).
 kubectl wait --for=jsonpath='{.status.loadBalancer.ingress[0].hostname}' \
  svc/open-webui-gemma -n default --timeout=5m

 OUI_HOST=$(kubectl get svc open-webui-gemma -n default \
  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')

 # Resolve the NLB ARN from its DNS name. The lookup runs against whatever
 # region/credentials the AWS CLI is currently configured with.
 OUI_ARN=$(aws elbv2 describe-load-balancers \
  --query "LoadBalancers[?DNSName=='$OUI_HOST'].LoadBalancerArn" \
  --output text)

 # Let the AWS waiter block until the load balancer state is 'active'. Skip the
 # wait (with a warning) when the lookup found nothing, instead of handing the
 # waiter an empty ARN. No 'exit' here: these steps are often pasted into an
 # interactive shell.
 if [ -n "$OUI_ARN" ] && [ "$OUI_ARN" != "None" ]; then
  aws elbv2 wait load-balancer-available --load-balancer-arns "$OUI_ARN"
 else
  echo "WARNING: no load balancer found for '$OUI_HOST'; skipping the 'active' wait (check AWS region/credentials)." >&2
 fi

 # Print the URL of the UI
 echo "Open WebUI URL: http://$OUI_HOST"
	#!/usr/bin/env bash
	#
	# Deploys Google's Gemma 4 26B-A4B (Mixture-of-Experts, vision-capable) on
	# vLLM running on a single g6e.2xlarge node (1x NVIDIA L40S, 48 GB, SM89),
	# and exposes it through an Open WebUI frontend fronted by an AWS NLB so
	# workshop participants can chat with the model (including image uploads)
	# from a browser.
	#
	# Prereqs: https://genai.eksworkshop.com
	#
	# Copyright 2026 Davi Garcia
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Create manifests for running Gemma 4 26B-A4B (MoE) on vLLM on a g6e.2xlarge (1x L40S, 48 GB, SM89).
	cat <<EOF > vllm-gemma.yaml
	# HF token for pulling gated Gemma weights at pod startup. Replace the placeholder below with your own token.
	# The author notes the pull also works without a token, but anonymous requests may be throttled by HF.
	apiVersion: v1
	kind: Secret
	metadata:
	name: hf-token
	namespace: default
	type: Opaque
	stringData:
	HF_TOKEN: <YOUR TOKEN HERE>
	---
	# ClusterIP fronts the OpenAI-compatible API; expose externally via Ingress/LB as needed.
	apiVersion: v1
	kind: Service
	metadata:
	name: vllm-gemma
	namespace: default
	spec:
	type: ClusterIP
	selector:
	app: vllm-gemma
	ports:
	- name: http
	port: 8000
	targetPort: 8000
	---
	apiVersion: apps/v1
	kind: Deployment
	metadata:
	name: vllm-gemma
	namespace: default
	labels:
	app: vllm-gemma
	spec:
	replicas: 1
	# One GPU per node means two pods can't coexist during a rollout.
	strategy:
	type: Recreate
	selector:
	matchLabels:
	app: vllm-gemma
	template:
	metadata:
	labels:
	app: vllm-gemma
	spec:
	# g6e.2xlarge = 1x NVIDIA L40S (48 GB, Ada Lovelace SM89, native FP8).
	nodeSelector:
	node.kubernetes.io/instance-type: g6e.2xlarge
	tolerations:
	- key: nvidia.com/gpu
	operator: Exists
	effect: NoSchedule
	# Give vLLM time to drain in-flight requests on shutdown.
	terminationGracePeriodSeconds: 60
	containers:
	- name: vllm
	# AWS Deep Learning Container: vLLM 0.20 on CUDA/py312, EC2 flavor.
	# SOCI = Seekable OCI: lazy-load index baked in, so containerd with
	# the SOCI snapshotter starts the pod before the full image is pulled.
	# Falls back to a normal pull on nodes without the snapshotter.
	image: public.ecr.aws/deep-learning-containers/vllm:0.20-gpu-py312-ec2-soci
	imagePullPolicy: IfNotPresent
	args:
	# Model: 25.2B total / 3.8B active MoE (8/128 experts + 1 shared), BF16 native.
	- "--model=google/gemma-4-26b-a4b-it"

	# Explicit dtype avoids 'auto' picking fp16 on some configs.
	- "--dtype=bfloat16"

	# 25.2B x BF16 ~ 50 GB won't fit on 48 GB — FP8 weights bring it into range.
	# Alternative for better accuracy/stability: pre-quantized checkpoint, e.g.
	# --model=RedHatAI/gemma-4-26b-a4b-it-FP8-dynamic
	# --quantization=compressed-tensors
	- "--quantization=fp8"

	# Must match the weight FP8 format (E4M3): vLLM rejects fp8_e5m2 KV
	# when the model is loaded via the fp8 quant path.
	- "--kv-cache-dtype=fp8_e4m3"

	# Single GPU: no sharding, no expert parallelism.
	- "--tensor-parallel-size=1"

	# Model supports 256K; 64K is comfortable on 48 GB with the 1024-token
	# sliding window keeping per-layer KV bounded.
	- "--max-model-len=65536"

	# Scheduler token budget per step. Higher = more throughput, higher TTFT.
	- "--max-num-batched-tokens=16384"

	# Concurrent sequences the engine will interleave; sized to keep an
	# L40S busy given 3.8B active params per token.
	- "--max-num-seqs=64"

	# Fraction of VRAM for weights + KV cache + activations; ~4 GB left for
	# CUDA / driver / nccl overhead.
	- "--gpu-memory-utilization=0.92"

	# Reuse KV for repeated prompt prefixes (system prompts, tool schemas).
	- "--enable-prefix-caching"

	# Interleave prefill chunks with decode steps to smooth TTFT under load.
	- "--enable-chunked-prefill"

	# Optimization level 3 enables full torch.compile + CUDA graphs.
	- "-O3"

	# Vision enabled: up to 2 images per prompt. The ~550M ViT encoder
	# stays in BF16 — do NOT enable --mm-encoder-attn-dtype=fp8 here,
	# FP8 ViT attention requires SM90 (Hopper) and L40S is SM89.
	- "--limit-mm-per-prompt.image=2"

	# Enable for function calling (uses vLLM's dedicated Gemma 4 parser):
	# - "--enable-auto-tool-choice"
	# - "--tool-call-parser=gemma4"

	- "--host=0.0.0.0"
	- "--port=8000"
	env:
	- name: HF_TOKEN
	valueFrom:
	secretKeyRef:
	name: hf-token
	key: HF_TOKEN
	# Keep the HF cache on our writable volume, not the container's rootfs.
	- name: HF_HOME
	value: /cache/huggingface
	# Parallel chunked downloads; big speedup on the 25-50 GB cold pull.
	- name: HF_HUB_ENABLE_HF_TRANSFER
	value: "1"
	# Avoid fork() issues when the parent has already initialized CUDA.
	- name: VLLM_WORKER_MULTIPROC_METHOD
	value: "spawn"
	# Pin the V1 engine explicitly to avoid surprise regressions.
	- name: VLLM_USE_V1
	value: "1"
	# Let vLLM auto-select the attention backend on L40S unless a specific
	# issue forces a choice (both FLASHINFER and FLASH_ATTN work on SM89).
	# - name: VLLM_ATTENTION_BACKEND
	# value: "FLASH_ATTN"
	ports:
	- name: http
	containerPort: 8000
	resources:
	# HF downloads + weight mmap peak well above steady-state during cold start.
	limits:
	nvidia.com/gpu: 1
	cpu: "8"
	memory: 40Gi
	requests:
	nvidia.com/gpu: 1
	cpu: "4"
	memory: 32Gi
	volumeMounts:
	- name: hf-cache
	mountPath: /cache/huggingface
	- name: shm
	mountPath: /dev/shm
	# 120 * 15s = 30 min cold-start budget (weight download + torch.compile).
	startupProbe:
	httpGet:
	path: /health
	port: http
	periodSeconds: 15
	failureThreshold: 120
	readinessProbe:
	httpGet:
	path: /health
	port: http
	periodSeconds: 10
	livenessProbe:
	httpGet:
	path: /health
	port: http
	periodSeconds: 30
	failureThreshold: 5
	volumes:
	# emptyDir means a re-download on every pod reschedule (~25-50 GB). For
	# production, back this with a PVC or hostPath on the 450 GB NVMe
	# instance store so weights persist across restarts.
	- name: hf-cache
	emptyDir:
	sizeLimit: 60Gi
	# Container /dev/shm defaults to 64 MiB — too small for vLLM's
	# multiprocess workers. Memory-backed emptyDir gives real RAM.
	- name: shm
	emptyDir:
	medium: Memory
	sizeLimit: 8Gi
	EOF

	# Apply the vLLM manifest (Secret, Service and Deployment) to the cluster
	kubectl apply --filename vllm-gemma.yaml

	# Watch pods in all namespaces to see the vLLM pod come up (Ctrl-C to stop)
	kubectl get pods --all-namespaces --watch

	# Follow the vLLM logs to confirm it downloads Gemma from Hugging Face and loads it into VRAM
	kubectl logs --selector=app=vllm-gemma --follow

	# Option A — direct API access for cURL / SDK testing
	kubectl port-forward service/vllm-gemma 8000:8000

	# While the port-forward is running, send a prompt with cURL from another terminal.
	# Sampling params follow the model card: temperature=1.0, top_p=0.95, top_k=64.
	curl --header "Content-Type: application/json" \
	    --data '{"model":"google/gemma-4-26b-a4b-it","messages":[{"role":"user","content":"Why Kubernetes is amazing?"}],"max_tokens":800,"temperature":1.0,"top_p":0.95,"top_k":64}' \
	    http://localhost:8000/v1/chat/completions

	# Option B — Open WebUI exposed via an AWS NLB (public URL).

	# Create the manifest to run Open-WebUI
	cat <<EOF > open-webui-gemma.yaml
	---
	# Open WebUI — browser-based chat UI that speaks the OpenAI API. It will
	# connect to the vLLM service above and let you chat (with image uploads,
	# since vision is enabled on the backend).
	# Data (user accounts, chat history, settings) is on emptyDir, so it's wiped
	# on pod reschedule — fine for this workshop, swap to a PVC for persistence.
	apiVersion: v1
	kind: Service
	metadata:
	name: open-webui-gemma
	namespace: default
	# AWS Load Balancer Controller annotations: provisions an internet-facing
	# NLB that forwards directly to pod IPs (VPC CNI / ip target type).
	# Requires the aws-load-balancer-controller to be installed in the cluster.
	annotations:
	service.beta.kubernetes.io/aws-load-balancer-type: "external"
	service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip"
	service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing"
	spec:
	type: LoadBalancer
	selector:
	app: open-webui-gemma
	ports:
	- name: http
	port: 80
	targetPort: 8080
	---
	apiVersion: apps/v1
	kind: Deployment
	metadata:
	name: open-webui-gemma
	namespace: default
	labels:
	app: open-webui-gemma
	spec:
	replicas: 1
	strategy:
	type: Recreate
	selector:
	matchLabels:
	app: open-webui-gemma
	template:
	metadata:
	labels:
	app: open-webui-gemma
	spec:
	containers:
	- name: open-webui
	image: ghcr.io/open-webui/open-webui:main
	imagePullPolicy: IfNotPresent
	ports:
	- name: http
	containerPort: 8080
	env:
	# Point Open WebUI at the in-cluster vLLM service. This is the full
	# OpenAI-compatible base URL, so it must include the /v1 prefix.
	- name: OPENAI_API_BASE_URL
	value: "http://vllm-gemma.default.svc.cluster.local:8000/v1"
	# vLLM doesn't enforce a key by default; set any non-empty value to
	# satisfy Open WebUI's client.
	- name: OPENAI_API_KEY
	value: "not-needed"
	# Disable Ollama integration — we're only using the OpenAI backend.
	- name: ENABLE_OLLAMA_API
	value: "false"
	# Reuse the vLLM token so Open WebUI's embedding / reranker / STT
	# downloads go out authenticated — avoids anonymous HF rate limits.
	- name: HF_TOKEN
	valueFrom:
	secretKeyRef:
	name: hf-token
	key: HF_TOKEN
	# Disable login entirely — anyone with the URL is dropped straight
	# into the chat UI under a single shared account. Fine for a
	# workshop behind a short-lived NLB; DO NOT use in production.
	- name: WEBUI_AUTH
	value: "false"
	resources:
	limits:
	cpu: "2"
	memory: 4Gi
	requests:
	cpu: "500m"
	memory: 1Gi
	volumeMounts:
	- name: data
	mountPath: /app/backend/data
	readinessProbe:
	httpGet:
	path: /health
	port: http
	periodSeconds: 10
	livenessProbe:
	httpGet:
	path: /health
	port: http
	periodSeconds: 30
	failureThreshold: 5
	volumes:
	- name: data
	emptyDir:
	sizeLimit: 10Gi
	EOF

	# Deploy the manifest
	kubectl apply -f open-webui-gemma.yaml

	# Wait for the AWS Load Balancer Controller to write the NLB hostname onto the
	# Service status (kubectl watches the field; no polling loop needed).
	kubectl wait --for=jsonpath='{.status.loadBalancer.ingress[0].hostname}' \
	  svc/open-webui-gemma -n default --timeout=5m

	OUI_HOST=$(kubectl get svc open-webui-gemma -n default \
	  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')

	# Resolve the NLB ARN from its DNS name. The lookup runs against whatever
	# region/credentials the AWS CLI is currently configured with.
	OUI_ARN=$(aws elbv2 describe-load-balancers \
	  --query "LoadBalancers[?DNSName=='$OUI_HOST'].LoadBalancerArn" \
	  --output text)

	# Let the AWS waiter block until the load balancer state is 'active'. Skip the
	# wait (with a warning) when the lookup found nothing, instead of handing the
	# waiter an empty ARN. No 'exit' here: these steps are often pasted into an
	# interactive shell.
	if [ -n "$OUI_ARN" ] && [ "$OUI_ARN" != "None" ]; then
	  aws elbv2 wait load-balancer-available --load-balancer-arns "$OUI_ARN"
	else
	  echo "WARNING: no load balancer found for '$OUI_HOST'; skipping the 'active' wait (check AWS region/credentials)." >&2
	fi

	# Print the URL of the UI
	echo "Open WebUI URL: http://$OUI_HOST"
No results found