Requires an AWS g6e.12xlarge instance, or at least 2x NVIDIA L40S GPUs.
To tear everything down, run `minikube delete`; or, to remove only the Kubernetes components:
./llmd-installer-minikube.sh --uninstall --namespace e2e-helm
--- a/charts/llm-d/templates/modelservice/deployment.yaml
+++ b/charts/llm-d/templates/modelservice/deployment.yaml
@@ -46,11 +46,11 @@ spec:
- --epp-cluster-role
- endpoint-picker-clusterrole
- --epp-pull-secrets
- - endpoint-picker-pull-secret
+ - llm-d-pull-secret
- --pd-cluster-role
- - prefill-decode-clusterrole
+ - llm-d-pull-secret
- --pd-pull-secrets
- - prefill-decode-pull-secret
+ - llm-d-pull-secret
# MSV2 HACK END
command:
- /manager
@@ -95,3 +95,4 @@ spec:
type: RuntimeDefault
serviceAccountName: {{ include "modelservice.serviceAccountName" . }}
{{- end}}
export HF_TOKEN=12345
./llmd-installer-minikube.sh --provision-minikube-gpu --namespace e2e-helm
Updated quickstart/llmd-installer-minikube.sh script:
#!/usr/bin/env bash
# -*- indent-tabs-mode: nil; tab-width: 4; sh-indentation: 4; -*-
# Installer for the llm-d stack on Minikube (or an existing cluster).
# Must be run from the repo's quickstart/ directory; see print_help for options.
set -euo pipefail

### GLOBALS ###
NAMESPACE="llm-d"                    # target k8s namespace (override: --namespace)
PROVISION_MINIKUBE=false             # --provision-minikube: start Minikube without GPUs
PROVISION_MINIKUBE_GPU=false         # --provision-minikube-gpu: start Minikube with GPUs
USE_MINIKUBE_STORAGE=false           # use hostPath PV/PVC instead of a storage class
STORAGE_SIZE="15Gi"                  # model PVC size (override: --storage-size)
STORAGE_CLASS="efs-sc"               # storage class for the non-Minikube PVC (--storage-class)
DELETE_MINIKUBE=false                # --delete-minikube: tear down Minikube and exit
ACTION="install"                     # "install" or "uninstall" (--uninstall)
HF_TOKEN_CLI=""                      # Hugging Face token from --hf-token (falls back to $HF_TOKEN)
AUTH_FILE_CLI=""                     # registry auth file from --auth-file
PULL_SECRET_NAME="llm-d-pull-secret" # name of the image pull secret created in NAMESPACE
SCRIPT_DIR=""                        # set by setup_env (current working directory)
REPO_ROOT=""                         # set by setup_env (git toplevel)
INSTALL_DIR=""                       # set by setup_env (quickstart/; must equal SCRIPT_DIR)
CHART_DIR=""                         # set by setup_env (charts/llm-d)
HF_NAME=""                           # HF secret name, read from values.yaml in install()
HF_KEY=""                            # HF secret key, read from values.yaml in install()
PROXY_UID=""                         # derived from OpenShift SCC annotation, else 0
AUTH_FILE=""                         # resolved container auth file path
# Host directory backing the Minikube hostPath volumes (overridable via env).
HOSTPATH_DIR=${HOSTPATH_DIR:="/mnt/data/llama-model-storage"}
VALUES_FILE="values.yaml"            # Helm values file (override: --values-file)
DEBUG=""                             # set to "--debug" by --debug (passed through to helm)
### HELP & LOGGING ###
# Print CLI usage to stdout.
# Fixed: the --storage-size default shown here said 7Gi, but the actual
# default (STORAGE_SIZE) is 15Gi.
print_help() {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Options:
  --hf-token TOKEN           Hugging Face token (or set HF_TOKEN env var)
  --auth-file PATH           Path to containers auth.json
  --provision-minikube       Provision a local Minikube cluster without GPU support (p/d pods will stay pending)
  --provision-minikube-gpu   Provision a local Minikube cluster with GPU support
  --delete-minikube          Delete local Minikube cluster
  --minikube-storage         Use Minikube-specific PVC manifest for storage
  --storage-size SIZE        Size of storage volume (default: 15Gi)
  --storage-class CLASS      Storage class to use (default: efs-sc)
  --namespace NAME           K8s namespace (default: llm-d)
  --values-file PATH         Path to Helm values.yaml file (default: values.yaml)
  --uninstall                Uninstall the llm-d components from the current cluster
  --debug                    Add debug mode to the helm install
  -h, --help                 Show this help and exit
EOF
}
# Logging helpers: info/success go to stdout, errors to stderr.
# echo -e interprets backslash escapes in the message.
log_info() { echo -e "$*"; }
log_success() { echo -e "$*"; }
# NOTE(review): the "β" prefix appears to be a mis-encoded emoji — confirm
# against the original file encoding.
log_error() { echo -e "β $*" >&2; }
# Log an error message and abort the whole script with exit status 1.
die() { log_error "$*"; exit 1; }
### UTILITIES ###
# Abort the script unless the given command is available on PATH.
check_cmd() {
    if ! command -v "$1" &>/dev/null; then
        die "Required command not found: $1"
    fi
}
# Verify every CLI tool the installer shells out to is installed,
# dying on the first one that is missing.
check_dependencies() {
    local tool
    for tool in git yq jq helm kubectl kustomize make; do
        check_cmd "$tool"
    done
}
# Die unless kubectl can talk to a live Kubernetes cluster.
check_cluster_reachability() {
    if ! kubectl cluster-info &>/dev/null; then
        die "kubectl cannot reach any running Kubernetes cluster."
    fi
    log_info "kubectl can reach to a running Kubernetes cluster."
}
# Derive an OpenShift PROXY_UID; default to 0 if not available
fetch_proxy_uid() {
    log_info "Fetching OCP proxy UID..."
    local uid_range
    # OpenShift annotates namespaces with an SCC uid-range (e.g. "1000700000/10000");
    # errors are swallowed so this is a no-op on vanilla Kubernetes.
    uid_range=$(kubectl get namespace "${NAMESPACE}" -o jsonpath='{.metadata.annotations.openshift\.io/sa\.scc\.uid-range}' 2>/dev/null || true)
    if [[ -n "$uid_range" ]]; then
        # Take the range start and add one — NOTE(review): presumably to skip
        # the range's first UID; confirm against the chart's expectations.
        PROXY_UID=$(echo "$uid_range" | awk -F'/' '{print $1 + 1}')
        log_success "Derived PROXY_UID=${PROXY_UID}"
    else
        PROXY_UID=0
        log_info "No OpenShift SCC annotation found; defaulting PROXY_UID=${PROXY_UID}"
    fi
}
# Parse command-line flags into the script's globals.
# Value-taking options consume their argument in the case arm; the single
# trailing shift consumes the option itself.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --hf-token)               HF_TOKEN_CLI="$2"; shift ;;
            --auth-file)              AUTH_FILE_CLI="$2"; shift ;;
            --storage-size)           STORAGE_SIZE="$2"; shift ;;
            --storage-class)          STORAGE_CLASS="$2"; shift ;;
            --namespace)              NAMESPACE="$2"; shift ;;
            --values-file)            VALUES_FILE="$2"; shift ;;
            --provision-minikube)     PROVISION_MINIKUBE=true; USE_MINIKUBE_STORAGE=true ;;
            --provision-minikube-gpu) PROVISION_MINIKUBE_GPU=true; USE_MINIKUBE_STORAGE=true ;;
            --delete-minikube)        DELETE_MINIKUBE=true ;;
            --minikube-storage)       USE_MINIKUBE_STORAGE=true ;;
            --uninstall)              ACTION="uninstall" ;;
            --debug)                  DEBUG="--debug" ;;
            -h|--help)                print_help; exit 0 ;;
            *)                        die "Unknown option: $1" ;;
        esac
        shift
    done
}
### ENV & PATH SETUP ###
# Resolve key repo paths and enforce that the script is invoked from the
# quickstart/ directory.
setup_env() {
    log_info "π Setting up script environment..."
    # SCRIPT_DIR is the *current* directory, not the script's own location;
    # combined with the check below this pins the working directory.
    SCRIPT_DIR=$(realpath "$(pwd)")
    REPO_ROOT=$(git rev-parse --show-toplevel)
    INSTALL_DIR=$(realpath "${REPO_ROOT}/quickstart")
    CHART_DIR=$(realpath "${REPO_ROOT}/charts/llm-d")
    if [[ "$SCRIPT_DIR" != "$INSTALL_DIR" ]]; then
        die "Script must be run from ${INSTALL_DIR}"
    fi
}
# Resolve AUTH_FILE from --auth-file (if it exists) or the default
# containers-config locations; print login instructions and exit 1
# when no auth file can be found.
locate_auth_file() {
    log_info "π Locating container auth file..."
    local candidate
    AUTH_FILE=""
    for candidate in \
        "$AUTH_FILE_CLI" \
        "$HOME/.config/containers/auth.json" \
        "$HOME/.config/containers/config.json"; do
        if [[ -n "$candidate" && -f "$candidate" ]]; then
            AUTH_FILE="$candidate"
            break
        fi
    done
    if [[ -z "$AUTH_FILE" ]]; then
        echo "No auth file found in ~/.config/containers/"
        echo "Please authenticate with either:"
        echo
        echo "# Docker"
        echo "docker --config ~/.config/containers/ login quay.io"
        echo "docker --config ~/.config/containers/ login registry.redhat.io"
        echo
        echo "# Podman"
        echo "podman login quay.io --authfile ~/.config/containers/auth.json"
        echo "podman login registry.redhat.io --authfile ~/.config/containers/auth.json"
        exit 1
    fi
    log_success "β
Auth file: ${AUTH_FILE}"
}
# Ensure a Hugging Face token is available when installing; the --hf-token
# flag wins over the HF_TOKEN environment variable. No-op on uninstall.
validate_hf_token() {
    [[ "$ACTION" == "install" ]] || return 0
    log_info "π€ Validating Hugging Face token..."
    HF_TOKEN="${HF_TOKEN_CLI:-${HF_TOKEN:-}}"
    [[ -n "$HF_TOKEN" ]] || die "HF_TOKEN not set."
    log_success "β
HF_TOKEN validated"
}
### MINIKUBE HANDLERS ###
# Start a default (CPU-only) Minikube cluster; without GPUs the
# prefill/decode pods will stay Pending.
provision_minikube() {
    log_info "π± Provisioning Minikube cluster..."
    minikube start
    log_success "π Minikube started."
}
# Start Minikube with the docker driver/runtime and all host GPUs attached.
provision_minikube_gpu() {
    log_info "π± Provisioning Minikube GPU clusterβ¦"
    minikube start \
        --driver docker \
        --container-runtime docker \
        --gpus all
    log_success "π Minikube GPU cluster started."
}
# Delete the local Minikube cluster.
delete_minikube() {
    log_info "ποΈ Deleting Minikube cluster..."
    minikube delete
    log_success "π Minikube deleted."
}
# Install the full llm-d stack into ${NAMESPACE}: GAIE infrastructure,
# pull/HF secrets, the ModelService CRD, model storage plus download job,
# and finally the Helm chart. Steps are strictly ordered.
install() {
    log_info "ποΈ Installing GAIE Kubernetes infrastructureβ¦"
    # Clone the GAIE repo just long enough to run its infra make target.
    clone_gaie_repo
    pushd gateway-api-inference-extension >/dev/null
    INFRASTRUCTURE_OVERRIDE=true make environment.dev.kubernetes.infrastructure
    popd >/dev/null
    rm -rf gateway-api-inference-extension
    log_success "β
GAIE infra applied"

    log_info "π¦ Creating namespace ${NAMESPACE}..."
    # dry-run + apply makes namespace creation idempotent.
    kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
    kubectl config set-context --current --namespace="${NAMESPACE}"
    log_success "β
Namespace ready"

    log_info "π Creating pull secret ${PULL_SECRET_NAME}..."
    kubectl create secret generic "${PULL_SECRET_NAME}" \
        -n "${NAMESPACE}" \
        --from-file=.dockerconfigjson="${AUTH_FILE}" \
        --type=kubernetes.io/dockerconfigjson \
        --dry-run=client -o yaml | kubectl apply -f -
    log_success "β
Pull secret created"

    log_info "π§ Patching default ServiceAccount..."
    kubectl patch serviceaccount default \
        -n "${NAMESPACE}" \
        --type merge \
        --patch '{"imagePullSecrets":[{"name":"'"${PULL_SECRET_NAME}"'"}]}'
    log_success "β
ServiceAccount patched"

    cd "${CHART_DIR}"
    # Resolve which values.yaml to use:
    # - If the user passed --values-file (i.e. $VALUES_FILE != "values.yaml"), treat it as
    #   either relative or absolute path and require it to exist.
    # - Otherwise default to $CHART_DIR/values.yaml.
    if [[ "$VALUES_FILE" != "values.yaml" ]]; then
        if [[ -f "$VALUES_FILE" ]]; then
            VALUES_PATH=$(realpath "$VALUES_FILE")
            log_info "β
Using custom values file: ${VALUES_PATH}"
        else
            die "Custom values file not found: $VALUES_FILE"
        fi
    else
        VALUES_PATH="${CHART_DIR}/values.yaml"
    fi

    # Create the HF token secret only when the chart values enable it.
    if [[ "$(yq -r .auth.hfToken.enabled "${VALUES_PATH}")" == "true" ]]; then
        log_info "π Creating HF token secret (from ${VALUES_PATH})..."
        HF_NAME=$(yq -r .auth.hfToken.name "${VALUES_PATH}")
        HF_KEY=$(yq -r .auth.hfToken.key "${VALUES_PATH}")
        kubectl create secret generic "${HF_NAME}" \
            --from-literal="${HF_KEY}=${HF_TOKEN}" \
            --dry-run=client -o yaml | kubectl apply -f -
        log_success "β
HF token secret created"
    fi

    fetch_proxy_uid

    log_info "π Applying modelservice CRD..."
    kubectl apply -f crds/modelservice-crd.yaml
    log_success "β
ModelService CRD applied"

    log_info "π Patching load-model job manifest with HF secret name='${HF_NAME}', key='${HF_KEY}'"
    # Try the mikefarah (brew) yq flag form first; if that fails, fall back to
    # the python-yq (linux pkg) syntax, which needs -y for YAML output.
    if ! yq -i ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.name = \"${HF_NAME}\"" "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"; then
        yq -i -y ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.name = \"${HF_NAME}\"" "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"
    fi
    if ! yq -i ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.key = \"${HF_KEY}\"" "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"; then
        yq -i -y ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.key = \"${HF_KEY}\"" "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"
    fi
    log_success "β
Job manifest patched"

    log_info "πΎ Provisioning model storageβ¦"
    if [[ "${USE_MINIKUBE_STORAGE}" == "true" ]]; then
        # this creates both the hostPath PV and the matching PVC
        setup_minikube_storage
        log_success "β
PVC created from model-storage-rwx-pvc-minikube.yaml"
    else
        # NOTE(review): eval-based shell templating of the PVC manifest —
        # any shell metacharacters in the template or in STORAGE_* values
        # would be executed; consider envsubst instead.
        eval "echo \"$(cat ${REPO_ROOT}/helpers/k8s/model-storage-rwx-pvc-template.yaml)\"" \
            | kubectl apply -n "${NAMESPACE}" -f -
        log_success "β
PVC created with storageClassName ${STORAGE_CLASS} and size ${STORAGE_SIZE}"
    fi

    log_info "π Launching model download job..."
    kubectl apply -f "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml" -n "${NAMESPACE}"
    log_info "β³ Waiting up to 3m for model download job to complete; this may take a while depending on connection speed and model size..."
    # On timeout/failure, dump job and pod logs before bailing out.
    kubectl wait --for=condition=complete --timeout=180s job/download-model -n "${NAMESPACE}" || {
        log_error "π Model download job failed or timed out";
        kubectl logs job/download-model -n "${NAMESPACE}";
        kubectl logs -l job-name=download-model -n "${NAMESPACE}";
        exit 1;
    }
    log_success "β
Model downloaded"

    # Chart dependencies (e.g. redis) come from bitnami.
    helm repo add bitnami https://charts.bitnami.com/bitnami
    log_info "π οΈ Building Helm chart dependencies..."
    helm dependency build .
    log_success "β
Dependencies built"

    log_info "π Deploying llm-d chart with ${VALUES_PATH}..."
    # ${DEBUG} is intentionally unquoted: empty expands to nothing.
    helm upgrade -i llm-d . \
        ${DEBUG} \
        --namespace "${NAMESPACE}" \
        --values "${VALUES_PATH}" \
        --set gateway.parameters.proxyUID="${PROXY_UID}"
    log_success "β
llm-d deployed"

    log_info "π Patching all ServiceAccounts with pull-secret..."
    # Collect every serviceAccountName referenced by deployments and patch
    # each (plus "default") with the pull secret.
    patch='{"imagePullSecrets":[{"name":"'"${PULL_SECRET_NAME}"'"}]}'
    kubectl get deployments -n "${NAMESPACE}" -o jsonpath='{.items[*].spec.template.spec.serviceAccountName}' |
        tr ' ' '\n' | sort -u |
        xargs -I{} kubectl patch serviceaccount {} --namespace="${NAMESPACE}" --type merge --patch "${patch}"
    kubectl patch serviceaccount default --namespace="${NAMESPACE}" --type merge --patch "${patch}"
    log_success "β
ServiceAccounts patched"

    # Delete the first modelservice pod so its replacement picks up the
    # freshly patched pull secret.
    MODELSERVICE_POD=$(kubectl get pods -n "${NAMESPACE}" | grep "modelservice" | awk 'NR==1{print $1}')
    log_info "π Restarting pod ${MODELSERVICE_POD} to pick up new image..."
    kubectl delete pod "${MODELSERVICE_POD}" -n "${NAMESPACE}" || true

    if [[ "${USE_MINIKUBE_STORAGE}" == "true" ]]; then
        log_info "π Creating shared hostpath for Minicube PV and PVC for Redis..."
        # NOTE(review): this deletes "redis-pvc" but the PVC created below is
        # named "redis-data-redis-master" — confirm which name the chart uses.
        kubectl delete pvc redis-pvc -n "${NAMESPACE}" --ignore-not-found
        kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: redis-hostpath-pv
spec:
  storageClassName: manual
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  hostPath:
    path: ${HOSTPATH_DIR}/redis-data
    type: DirectoryOrCreate
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: redis-data-redis-master
  namespace: ${NAMESPACE}
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
  volumeName: redis-hostpath-pv
EOF
        log_success "β
Redis PV and PVC created with Helm annotations."
    fi

    post_install
    log_success "π Installation complete."
}
# Create a hostPath-backed RWX PersistentVolume plus the PVC the model pods
# mount ("llama-3.2-3b-instruct-pvc"), both sized by ${STORAGE_SIZE}.
setup_minikube_storage() {
    log_info "π¦ Setting up Minikube hostPath RWX Shared Storage..."
    log_info "π Creating PV and PVC for llama model..."
    kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: llama-hostpath-pv
spec:
  storageClassName: manual
  capacity:
    storage: ${STORAGE_SIZE}
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  hostPath:
    path: ${HOSTPATH_DIR}
    type: DirectoryOrCreate
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-3.2-3b-instruct-pvc
  namespace: ${NAMESPACE}
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: ${STORAGE_SIZE}
  volumeName: llama-hostpath-pv
EOF
    log_success "β
llama model PV and PVC created."
}
# Clone the GAIE repo into the current directory unless a checkout
# already exists.
clone_gaie_repo() {
    [[ -d gateway-api-inference-extension ]] || git clone https://github.com/neuralmagic/gateway-api-inference-extension.git
}
# function called right before the installer exits
# Remove the download-model pod once its job has finished; leave it in
# place (for debugging) when it is missing or not in the Succeeded phase.
post_install() {
    local pod phase
    pod=$(kubectl get pods -n "${NAMESPACE}" \
        -l job-name=download-model \
        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
    [[ -n "$pod" ]] || return 0
    phase=$(kubectl get pod "$pod" -n "${NAMESPACE}" \
        -o jsonpath='{.status.phase}' 2>/dev/null || true)
    if [[ "$phase" != "Succeeded" ]]; then
        log_info "β Pod ${pod} phase is ${phase}; skipping delete."
        return 0
    fi
    kubectl delete pod "$pod" -n "${NAMESPACE}" --ignore-not-found || true
    log_success "π§Ή download-model pod deleted"
}
# Tear down what install() created: GAIE infrastructure, the Helm release,
# the namespace, and the cluster-scoped hostPath PVs.
uninstall() {
    log_info "ποΈ Tearing down GAIE Kubernetes infrastructureβ¦"
    clone_gaie_repo
    pushd gateway-api-inference-extension >/dev/null
    INFRASTRUCTURE_OVERRIDE=true make clean.environment.dev.kubernetes.infrastructure
    popd >/dev/null
    rm -rf gateway-api-inference-extension
    log_info "ποΈ Uninstalling llm-d chart..."
    helm uninstall llm-d --namespace "${NAMESPACE}" || true
    log_info "ποΈ Deleting namespace ${NAMESPACE}..."
    kubectl delete namespace "${NAMESPACE}" || true
    log_info "ποΈ Deleting PVCs..."
    # PVs are cluster-scoped, so they survive the namespace deletion above.
    kubectl delete pv llama-hostpath-pv --ignore-not-found
    # NOTE(review): the namespace (and any PVCs in it) was already deleted
    # above, and install() creates "redis-data-redis-master", not "redis-pvc" —
    # confirm this name/ordering is intentional.
    kubectl delete pvc redis-pvc -n "${NAMESPACE}" --ignore-not-found
    kubectl delete pv redis-hostpath-pv --ignore-not-found
    log_success "π Uninstallation complete"
}
# Entry point: parse flags, run sanity checks, then dispatch on ACTION.
main() {
    parse_args "$@"
    # --delete-minikube short-circuits everything else.
    if [[ "$DELETE_MINIKUBE" == true ]]; then
        check_cmd minikube
        delete_minikube
        exit 0
    fi
    setup_env
    check_dependencies
    # A reachable cluster is only required when we are not about to create one.
    if [[ "$PROVISION_MINIKUBE" != "true" && "$PROVISION_MINIKUBE_GPU" != "true" ]]; then
        check_cluster_reachability
    fi
    locate_auth_file
    validate_hf_token
    case "$ACTION" in
        install)
            if [[ "$PROVISION_MINIKUBE_GPU" == "true" ]]; then
                provision_minikube_gpu
            elif [[ "$PROVISION_MINIKUBE" == "true" ]]; then
                provision_minikube
            fi
            install
            ;;
        uninstall)
            uninstall
            ;;
        *)
            die "Unknown action: $ACTION"
            ;;
    esac
}

main "$@"
$ kubectl get pods --all-namespaces
NAMESPACE NAME READY STATUS RESTARTS AGE
e2e-helm llama-32-3b-instruct-decode-8655566f7d-dgblq 2/2 Running 0 15m
e2e-helm llama-32-3b-instruct-epp-7fb457b654-4tsnr 1/1 Running 0 15m
e2e-helm llama-32-3b-instruct-prefill-5bcf6d6c8d-cd7wk 2/2 Running 0 15m
e2e-helm llm-d-inference-gateway-5fbd8c566-xptvb 1/1 Running 0 15m
e2e-helm llm-d-modelservice-c957dd4f-x4hlh 1/1 Running 0 15m
e2e-helm llm-d-redis-master-66bd877866-slnj6 0/1 Pending 0 15m
kgateway-system kgateway-5b4f5864f5-r7nzr 1/1 Running 0 18m
kube-system coredns-668d6bf9bc-hjn7b 1/1 Running 0 18m
kube-system etcd-minikube 1/1 Running 0 18m
kube-system kube-apiserver-minikube 1/1 Running 0 18m
kube-system kube-controller-manager-minikube 1/1 Running 0 18m
kube-system kube-proxy-dswn2 1/1 Running 0 18m
kube-system kube-scheduler-minikube 1/1 Running 0 18m
kube-system nvidia-device-plugin-daemonset-np7pt 1/1 Running 0 18m
kube-system storage-provisioner 1/1 Running 0 18m
storage-gluster glusterfile-provisioner-567dcc98cd-wszgk 1/1 Running 0 18m
storage-gluster glusterfs-xmtmp 0/1 Running 0 18m
storage-gluster heketi-54cf645f7f-6h48r 1/1 Running 0 18m
kubectl exec --stdin --tty llama-32-3b-instruct-prefill-5bcf6d6c8d-cd7wk -c vllm -- /bin/bash
curl -X 'POST' 'http://localhost:8000/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
"model": "Llama-32-3B-Instruct",
"messages": [
{
"content": "Who won the World Series in 1986?",
"role": "user"
}
],
"stream": false
}'
{"id":"chatcmpl-9ef446d1609e4997b16bc7d19b331127","object":"chat.completion","created":1746501124,"model":"Llama-32-3B-Instruct","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The New York Mets won the World Series in 1986. They defeated the Boston Red Sox in the \"Boston Massacre Game\" (Game 6), winning the series 4 games to 3.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":45,"total_tokens":88,"completion_tokens":43,"prompt_tokens_details":null},"prompt_logprobs":null}