Skip to content

Instantly share code, notes, and snippets.

@nerdalert
Last active May 6, 2025 06:09
Show Gist options
  • Save nerdalert/106dd039a15af47a52e6aab2ab04d1b9 to your computer and use it in GitHub Desktop.
Save nerdalert/106dd039a15af47a52e6aab2ab04d1b9 to your computer and use it in GitHub Desktop.

Hardware requirements

AWS g6e.12xlarge, or any machine with at least 2x NVIDIA L40S GPUs

Uninstall:

minikube delete
or, to remove just the Kubernetes components:
./llmd-installer-minikube.sh --uninstall  --namespace e2e-helm

There is currently a known bug; work around it with the following patch:

--- a/charts/llm-d/templates/modelservice/deployment.yaml
+++ b/charts/llm-d/templates/modelservice/deployment.yaml
@@ -46,11 +46,11 @@ spec:
         - --epp-cluster-role
         - endpoint-picker-clusterrole
         - --epp-pull-secrets
-        - endpoint-picker-pull-secret
+        - llm-d-pull-secret
         - --pd-cluster-role
-        - prefill-decode-clusterrole
+        - llm-d-pull-secret
         - --pd-pull-secrets
-        - prefill-decode-pull-secret
+        - llm-d-pull-secret
         # MSV2 HACK END
         command:
         - /manager
@@ -95,3 +95,4 @@ spec:
           type: RuntimeDefault
       serviceAccountName: {{ include "modelservice.serviceAccountName" . }}
 {{- end}}

Install the full cluster (assuming no Minikube cluster is already running):

export HF_TOKEN=12345
./llmd-installer-minikube.sh --provision-minikube-gpu --namespace e2e-helm

Updated quickstart/llmd-installer-minikube.sh script:

#!/usr/bin/env bash
# -*- indent-tabs-mode: nil; tab-width: 4; sh-indentation: 4; -*-
#
# Quickstart installer for llm-d on Minikube: optionally provisions a
# Minikube cluster (with or without GPU support), sets up storage and
# secrets, and deploys the llm-d Helm chart. Must be run from the repo's
# quickstart/ directory (enforced by setup_env).

set -euo pipefail

### GLOBALS ###
# Defaults below are overridden by the CLI flags handled in parse_args.
NAMESPACE="llm-d"                 # --namespace: target Kubernetes namespace
PROVISION_MINIKUBE=false          # --provision-minikube (no GPU; p/d pods stay pending)
PROVISION_MINIKUBE_GPU=false      # --provision-minikube-gpu
USE_MINIKUBE_STORAGE=false        # --minikube-storage (also implied by either provision flag)
STORAGE_SIZE="15Gi"               # --storage-size: size of the model storage volume
STORAGE_CLASS="efs-sc"            # --storage-class: used only on the non-Minikube path
DELETE_MINIKUBE=false             # --delete-minikube: tear down the cluster and exit
ACTION="install"                  # "install" or "uninstall" (--uninstall)
HF_TOKEN_CLI=""                   # --hf-token (falls back to HF_TOKEN env var)
AUTH_FILE_CLI=""                  # --auth-file: path to containers auth.json
PULL_SECRET_NAME="llm-d-pull-secret"  # image pull secret created in NAMESPACE
SCRIPT_DIR=""                     # populated by setup_env
REPO_ROOT=""                      # populated by setup_env (git toplevel)
INSTALL_DIR=""                    # populated by setup_env (expected cwd)
CHART_DIR=""                      # populated by setup_env (charts/llm-d)
HF_NAME=""                        # HF secret name read from values.yaml
HF_KEY=""                         # HF secret key read from values.yaml
PROXY_UID=""                      # derived by fetch_proxy_uid (0 off-OpenShift)
AUTH_FILE=""                      # resolved by locate_auth_file
HOSTPATH_DIR=${HOSTPATH_DIR:="/mnt/data/llama-model-storage"}  # hostPath root; overridable via env
VALUES_FILE="values.yaml"         # --values-file: Helm values to deploy with
DEBUG=""                          # --debug: passes --debug through to helm

### HELP & LOGGING ###
# Print CLI usage to stdout.
# Note: the documented --storage-size default previously said "7Gi" while the
# actual default (STORAGE_SIZE) is "15Gi"; the help text is corrected here.
print_help() {
  cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Options:
  --hf-token TOKEN           Hugging Face token (or set HF_TOKEN env var)
  --auth-file PATH           Path to containers auth.json
  --provision-minikube       Provision a local Minikube cluster without GPU support (p/d pods will stay pending)
  --provision-minikube-gpu   Provision a local Minikube cluster with GPU support
  --delete-minikube          Delete local Minikube cluster
  --minikube-storage         Use Minikube-specific PVC manifest for storage
  --storage-size SIZE        Size of storage volume (default: 15Gi)
  --storage-class CLASS      Storage class to use (default: efs-sc)
  --namespace NAME           K8s namespace (default: llm-d)
  --values-file PATH         Path to Helm values.yaml file (default: values.yaml)
  --uninstall                Uninstall the llm-d components from the current cluster
  --debug                    Add debug mode to the helm install
  -h, --help                 Show this help and exit
EOF
}

### Logging helpers: info/success go to stdout, errors to stderr. ###
log_info()    { printf '%b\n' "$*"; }
log_success() { printf '%b\n' "$*"; }
log_error()   { printf '%b\n' "❌ $*" >&2; }
# Log the message as an error, then abort the script.
die()         { log_error "$*"; exit 1; }

### UTILITIES ###
# Abort with a clear message unless the named command is on PATH.
check_cmd() {
  if ! command -v "$1" &>/dev/null; then
    die "Required command not found: $1"
  fi
}

# Verify every external tool the installer shells out to is available.
check_dependencies() {
  local tool
  for tool in git yq jq helm kubectl kustomize make; do
    check_cmd "$tool"
  done
}

# Probe the current kube-context; abort when no cluster answers.
check_cluster_reachability() {
  if ! kubectl cluster-info &> /dev/null; then
    die "kubectl cannot reach any running Kubernetes cluster."
  fi
  log_info "kubectl can reach to a running Kubernetes cluster."
}

# Derive PROXY_UID from the namespace's OpenShift SCC uid-range annotation
# (first UID in the range, plus one); fall back to 0 when the annotation is
# absent (i.e. not running on OpenShift).
fetch_proxy_uid() {
  log_info "Fetching OCP proxy UID..."
  local annotation
  annotation=$(kubectl get namespace "${NAMESPACE}" -o jsonpath='{.metadata.annotations.openshift\.io/sa\.scc\.uid-range}' 2>/dev/null || true)
  if [[ -z "$annotation" ]]; then
    PROXY_UID=0
    log_info "No OpenShift SCC annotation found; defaulting PROXY_UID=${PROXY_UID}"
    return
  fi
  # Annotation looks like "<start>/<len>"; take start + 1.
  PROXY_UID=$(awk -F'/' '{print $1 + 1}' <<<"$annotation")
  log_success "Derived PROXY_UID=${PROXY_UID}"
}

# Translate command-line flags into the script's global settings.
# Flags that take a value consume two positional args; unknown flags abort.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --hf-token)
        HF_TOKEN_CLI="$2"; shift 2
        ;;
      --auth-file)
        AUTH_FILE_CLI="$2"; shift 2
        ;;
      --provision-minikube)
        # Provisioning Minikube implies the Minikube storage layout.
        PROVISION_MINIKUBE=true
        USE_MINIKUBE_STORAGE=true
        shift
        ;;
      --provision-minikube-gpu)
        PROVISION_MINIKUBE_GPU=true
        USE_MINIKUBE_STORAGE=true
        shift
        ;;
      --delete-minikube)
        DELETE_MINIKUBE=true; shift
        ;;
      --minikube-storage)
        USE_MINIKUBE_STORAGE=true; shift
        ;;
      --storage-size)
        STORAGE_SIZE="$2"; shift 2
        ;;
      --storage-class)
        STORAGE_CLASS="$2"; shift 2
        ;;
      --namespace)
        NAMESPACE="$2"; shift 2
        ;;
      --values-file)
        VALUES_FILE="$2"; shift 2
        ;;
      --uninstall)
        ACTION="uninstall"; shift
        ;;
      --debug)
        DEBUG="--debug"; shift
        ;;
      -h|--help)
        print_help
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done
}

### ENV & PATH SETUP ###
# Resolve repository-relative paths into globals and enforce that the
# script is being run from the quickstart/ directory.
setup_env() {
  log_info "πŸ“‚ Setting up script environment..."
  SCRIPT_DIR=$(realpath "$(pwd)")
  REPO_ROOT=$(git rev-parse --show-toplevel)
  INSTALL_DIR=$(realpath "${REPO_ROOT}/quickstart")
  CHART_DIR=$(realpath "${REPO_ROOT}/charts/llm-d")

  [[ "$SCRIPT_DIR" == "$INSTALL_DIR" ]] || die "Script must be run from ${INSTALL_DIR}"
}

# Resolve AUTH_FILE: prefer the --auth-file argument, then the standard
# containers auth.json / config.json locations. Exits with login
# instructions when no credentials file can be found.
locate_auth_file() {
  log_info "πŸ”‘ Locating container auth file..."
  local candidate
  AUTH_FILE=""
  for candidate in "$AUTH_FILE_CLI" \
                   "$HOME/.config/containers/auth.json" \
                   "$HOME/.config/containers/config.json"; do
    if [[ -n "$candidate" && -f "$candidate" ]]; then
      AUTH_FILE="$candidate"
      break
    fi
  done
  if [[ -z "$AUTH_FILE" ]]; then
    cat <<'MSG'
No auth file found in ~/.config/containers/
Please authenticate with either:

# Docker
docker --config ~/.config/containers/ login quay.io
docker --config ~/.config/containers/ login registry.redhat.io

# Podman
podman login quay.io  --authfile ~/.config/containers/auth.json
podman login registry.redhat.io  --authfile ~/.config/containers/auth.json
MSG
    exit 1
  fi
  log_success "βœ… Auth file: ${AUTH_FILE}"
}

# Ensure a Hugging Face token is available (CLI flag wins over the
# HF_TOKEN env var). Only enforced for installs; uninstall skips this.
validate_hf_token() {
  [[ "$ACTION" == "install" ]] || return 0
  log_info "πŸ€– Validating Hugging Face token..."
  HF_TOKEN="${HF_TOKEN_CLI:-${HF_TOKEN:-}}"
  if [[ -z "$HF_TOKEN" ]]; then
    die "HF_TOKEN not set."
  fi
  log_success "βœ… HF_TOKEN validated"
}

### MINIKUBE HANDLERS ###
# Start a default (CPU-only) Minikube cluster.
provision_minikube() {
  log_info "🌱 Provisioning Minikube cluster..."
  minikube start
  log_success "πŸš€ Minikube started."
}

# Start a Minikube cluster on the docker driver with all host GPUs
# passed through to the cluster.
provision_minikube_gpu() {
  log_info "🌱 Provisioning Minikube GPU cluster…"
  minikube start --driver docker --container-runtime docker --gpus all
  log_success "πŸš€ Minikube GPU cluster started."
}

# Tear down the local Minikube cluster entirely.
delete_minikube() {
  log_info "πŸ—‘οΈ Deleting Minikube cluster..."
  minikube delete
  log_success "πŸ™€ Minikube deleted."
}

# End-to-end install: GAIE infrastructure, namespace, secrets, model
# storage, model download job, then the llm-d Helm chart. Reads the
# globals populated by parse_args / setup_env / locate_auth_file /
# validate_hf_token; any failing step aborts the script (set -e).
install() {
  log_info "πŸ—οΈ Installing GAIE Kubernetes infrastructure…"
  clone_gaie_repo
  pushd gateway-api-inference-extension >/dev/null
    INFRASTRUCTURE_OVERRIDE=true make environment.dev.kubernetes.infrastructure
  popd >/dev/null
  # The checkout is only needed for its Makefile target; discard it.
  rm -rf gateway-api-inference-extension
  log_success "βœ… GAIE infra applied"
  log_info "πŸ“¦ Creating namespace ${NAMESPACE}..."
  # dry-run | apply keeps namespace creation idempotent across re-runs.
  kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
  kubectl config set-context --current --namespace="${NAMESPACE}"
  log_success "βœ… Namespace ready"

  log_info "πŸ” Creating pull secret ${PULL_SECRET_NAME}..."
  kubectl create secret generic "${PULL_SECRET_NAME}" \
    -n "${NAMESPACE}" \
    --from-file=.dockerconfigjson="${AUTH_FILE}" \
    --type=kubernetes.io/dockerconfigjson \
    --dry-run=client -o yaml | kubectl apply -f -
  log_success "βœ… Pull secret created"

  log_info "πŸ”§ Patching default ServiceAccount..."
  kubectl patch serviceaccount default \
    -n "${NAMESPACE}" \
    --type merge \
    --patch '{"imagePullSecrets":[{"name":"'"${PULL_SECRET_NAME}"'"}]}'
  log_success "βœ… ServiceAccount patched"

  cd "${CHART_DIR}"
  # Resolve which values.yaml to use:
  #   - If the user passed --values-file (i.e. $VALUES_FILE != "values.yaml"), treat it as
  #     either relative or absolute path and require it to exist.
  #   - Otherwise default to $CHART_DIR/values.yaml.
  if [[ "$VALUES_FILE" != "values.yaml" ]]; then
    if [[ -f "$VALUES_FILE" ]]; then
      VALUES_PATH=$(realpath "$VALUES_FILE")
      log_info "βœ… Using custom values file: ${VALUES_PATH}"
    else
      die "Custom values file not found: $VALUES_FILE"
    fi
  else
    VALUES_PATH="${CHART_DIR}/values.yaml"
  fi

  # Only create the HF token secret when the chart values ask for it.
  if [[ "$(yq -r .auth.hfToken.enabled "${VALUES_PATH}")" == "true" ]]; then
    log_info "πŸ” Creating HF token secret (from ${VALUES_PATH})..."
    HF_NAME=$(yq -r .auth.hfToken.name "${VALUES_PATH}")
    HF_KEY=$(yq -r .auth.hfToken.key  "${VALUES_PATH}")
    kubectl create secret generic "${HF_NAME}" \
      --from-literal="${HF_KEY}=${HF_TOKEN}" \
      --dry-run=client -o yaml | kubectl apply -f -
    log_success "βœ… HF token secret created"
  fi

  fetch_proxy_uid

  log_info "πŸ“œ Applying modelservice CRD..."
  kubectl apply -f crds/modelservice-crd.yaml
  log_success "βœ… ModelService CRD applied"

  log_info "πŸ“ Patching load-model job manifest with HF secret name='${HF_NAME}', key='${HF_KEY}'"
  # try brew’s yq first; if that fails, fall back to linux installed pkg syntax -_-
  # (Go yq takes `-i EXPR`; the Python yq wrapper needs `-i -y EXPR`.)
  if ! yq -i ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.name = \"${HF_NAME}\"" "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"; then
    yq -i -y ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.name = \"${HF_NAME}\"" "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"
  fi
  if ! yq -i ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.key  = \"${HF_KEY}\""  "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"; then
    yq -i -y ".spec.template.spec.containers[0].env[0].valueFrom.secretKeyRef.key  = \"${HF_KEY}\""  "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml"
  fi
  log_success "βœ… Job manifest patched"

  log_info "πŸ’Ύ Provisioning model storage…"
  if [[ "${USE_MINIKUBE_STORAGE}" == "true" ]]; then
    # this creates both the hostPath PV and the matching PVC
    setup_minikube_storage
    log_success "βœ… PVC created from model-storage-rwx-pvc-minikube.yaml"
  else
    # NOTE(review): eval-based templating executes any shell syntax present
    # in the template file — fine for a trusted repo file, but injection-prone
    # if the template ever carries user-supplied content. Confirm it's trusted.
    eval "echo \"$(cat ${REPO_ROOT}/helpers/k8s/model-storage-rwx-pvc-template.yaml)\"" \
        | kubectl apply -n "${NAMESPACE}" -f -
    log_success "βœ… PVC created with storageClassName ${STORAGE_CLASS} and size ${STORAGE_SIZE}"
  fi

  log_info "πŸš€ Launching model download job..."
  kubectl apply -f "${REPO_ROOT}/helpers/k8s/load-model-on-pvc.yaml" -n "${NAMESPACE}"

  log_info "⏳ Waiting up to 3m for model download job to complete; this may take a while depending on connection speed and model size..."
  # On timeout/failure, dump both job-level and pod-level logs before exiting.
  kubectl wait --for=condition=complete --timeout=180s job/download-model -n "${NAMESPACE}" || {
    log_error "πŸ™€ Model download job failed or timed out";
    kubectl logs job/download-model -n "${NAMESPACE}";
    kubectl logs -l job-name=download-model -n "${NAMESPACE}";
    exit 1;
  }
  log_success "βœ… Model downloaded"

  helm repo add bitnami  https://charts.bitnami.com/bitnami
  log_info "πŸ› οΈ Building Helm chart dependencies..."
  helm dependency build .
  log_success "βœ… Dependencies built"

  log_info "🚚 Deploying llm-d chart with ${VALUES_PATH}..."
  # ${DEBUG} is intentionally unquoted: when empty it expands to nothing.
  helm upgrade -i llm-d . \
    ${DEBUG} \
    --namespace "${NAMESPACE}" \
    --values "${VALUES_PATH}" \
    --set gateway.parameters.proxyUID="${PROXY_UID}"
  log_success "βœ… llm-d deployed"

  log_info "πŸ”„ Patching all ServiceAccounts with pull-secret..."
  patch='{"imagePullSecrets":[{"name":"'"${PULL_SECRET_NAME}"'"}]}'
  # Collect the distinct serviceAccountNames referenced by deployments and
  # patch each one, plus the default SA, with the pull secret.
  kubectl get deployments -n "${NAMESPACE}" -o jsonpath='{.items[*].spec.template.spec.serviceAccountName}' |
    tr ' ' '\n' | sort -u |
    xargs -I{} kubectl patch serviceaccount {} --namespace="${NAMESPACE}" --type merge --patch "${patch}"
  kubectl patch serviceaccount default --namespace="${NAMESPACE}" --type merge --patch "${patch}"
  log_success "βœ… ServiceAccounts patched"

  # First pod whose name contains "modelservice" (NR==1 guards against
  # multiple matches); delete it so its controller recreates it.
  MODELSERVICE_POD=$(kubectl get pods -n "${NAMESPACE}" | grep "modelservice" | awk 'NR==1{print $1}')
  log_info "πŸ” Restarting pod ${MODELSERVICE_POD} to pick up new image..."
  kubectl delete pod "${MODELSERVICE_POD}" -n "${NAMESPACE}" || true

  if [[ "${USE_MINIKUBE_STORAGE}" == "true" ]]; then
  log_info "πŸ”„ Creating shared hostpath for Minicube PV and PVC for Redis..."
  kubectl delete pvc redis-pvc -n "${NAMESPACE}" --ignore-not-found
  # Unquoted EOF delimiter: ${HOSTPATH_DIR} and ${NAMESPACE} expand inline.
  kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: redis-hostpath-pv
spec:
  storageClassName: manual
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  hostPath:
    path: ${HOSTPATH_DIR}/redis-data
    type: DirectoryOrCreate
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: redis-data-redis-master
  namespace: ${NAMESPACE}
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
  volumeName: redis-hostpath-pv
EOF
  log_success "βœ… Redis PV and PVC created with Helm annotations."
  fi

  post_install

  log_success "πŸŽ‰ Installation complete."
}

# Create the hostPath-backed RWX PV and the matching PVC used for model
# storage on Minikube. Uses HOSTPATH_DIR / STORAGE_SIZE / NAMESPACE globals;
# the unquoted EOF delimiter lets those variables expand inside the manifest.
setup_minikube_storage() {
  log_info "πŸ“¦ Setting up Minikube hostPath RWX Shared Storage..."
  log_info "πŸ”„ Creating PV and PVC for llama model..."
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: llama-hostpath-pv
spec:
  storageClassName: manual
  capacity:
    storage: ${STORAGE_SIZE}
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  hostPath:
    path: ${HOSTPATH_DIR}
    type: DirectoryOrCreate
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-3.2-3b-instruct-pvc
  namespace: ${NAMESPACE}
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: ${STORAGE_SIZE}
  volumeName: llama-hostpath-pv
EOF
  log_success "βœ… llama model PV and PVC created."
}

# Fetch the GAIE sources into the cwd; skip when a checkout already exists.
clone_gaie_repo() {
  [[ -d gateway-api-inference-extension ]] \
    || git clone https://github.com/neuralmagic/gateway-api-inference-extension.git
}

# function called right before the installer exits
# Clean up the completed download-model pod so it doesn't linger.
post_install() {
  # download-model pod deletion if it exists and in a succeeded phase
  local pod
  # jsonpath indexing may fail when no pod matches; `|| true` keeps $pod empty.
  pod=$(kubectl get pods -n "${NAMESPACE}" \
    -l job-name=download-model \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
  if [[ -z "$pod" ]]; then
    return
  fi
  local phase
  phase=$(kubectl get pod "$pod" -n "${NAMESPACE}" \
    -o jsonpath='{.status.phase}' 2>/dev/null || true)
  # Only delete pods that finished successfully; leave failed/running pods
  # in place for debugging.
  if [[ "$phase" == "Succeeded" ]]; then
    kubectl delete pod "$pod" -n "${NAMESPACE}" --ignore-not-found || true
    log_success "🧹 download-model pod deleted"
  else
    log_info "β†’ Pod ${pod} phase is ${phase}; skipping delete."
  fi
}

# Remove everything install() created: GAIE infra, the llm-d Helm release,
# the namespace, and the cluster-scoped PVs/PVCs. Best-effort — each step
# tolerates resources that are already gone.
uninstall() {
  log_info "πŸ—‘οΈ Tearing down GAIE Kubernetes infrastructure…"
  clone_gaie_repo
  pushd gateway-api-inference-extension >/dev/null
    INFRASTRUCTURE_OVERRIDE=true make clean.environment.dev.kubernetes.infrastructure
  popd >/dev/null
  rm -rf gateway-api-inference-extension
  log_info "πŸ—‘οΈ Uninstalling llm-d chart..."
  helm uninstall llm-d --namespace "${NAMESPACE}" || true
  log_info "πŸ—‘οΈ Deleting namespace ${NAMESPACE}..."
  kubectl delete namespace "${NAMESPACE}" || true
  log_info "πŸ—‘οΈ Deleting PVCs..."
  # PVs are cluster-scoped; the PVC is namespaced (and usually gone with
  # the namespace already — this is a belt-and-braces cleanup).
  kubectl delete pv llama-hostpath-pv --ignore-not-found
  kubectl delete pvc redis-pvc -n "${NAMESPACE}" --ignore-not-found
  kubectl delete pv redis-hostpath-pv --ignore-not-found
  log_success "πŸ’€ Uninstallation complete"
}

# Entry point: parse flags, run pre-flight checks, then dispatch on ACTION.
main() {
  parse_args "$@"

  # --delete-minikube short-circuits everything else.
  if [[ "$DELETE_MINIKUBE" == true ]]; then
    check_cmd minikube
    delete_minikube
    exit 0
  fi

  setup_env
  check_dependencies

  # Skip the reachability probe when we are about to create the cluster
  # ourselves; there is nothing to reach yet.
  if [[ "$PROVISION_MINIKUBE" != "true" && "$PROVISION_MINIKUBE_GPU" != "true" ]]; then
    check_cluster_reachability
  fi

  locate_auth_file
  validate_hf_token

  case "$ACTION" in
    install)
      if [[ "$PROVISION_MINIKUBE_GPU" == "true" ]]; then
        provision_minikube_gpu
      elif [[ "$PROVISION_MINIKUBE" == "true" ]]; then
        provision_minikube
      fi
      install
      ;;
    uninstall)
      uninstall
      ;;
    *)
      die "Unknown action: $ACTION"
      ;;
  esac
}

# Script entry point: forward all CLI arguments to main.
main "$@"


Validate


$ kubectl get pods --all-namespaces
NAMESPACE         NAME                                            READY   STATUS    RESTARTS   AGE
e2e-helm          llama-32-3b-instruct-decode-8655566f7d-dgblq    2/2     Running   0          15m
e2e-helm          llama-32-3b-instruct-epp-7fb457b654-4tsnr       1/1     Running   0          15m
e2e-helm          llama-32-3b-instruct-prefill-5bcf6d6c8d-cd7wk   2/2     Running   0          15m
e2e-helm          llm-d-inference-gateway-5fbd8c566-xptvb         1/1     Running   0          15m
e2e-helm          llm-d-modelservice-c957dd4f-x4hlh               1/1     Running   0          15m
e2e-helm          llm-d-redis-master-66bd877866-slnj6             0/1     Pending   0          15m
kgateway-system   kgateway-5b4f5864f5-r7nzr                       1/1     Running   0          18m
kube-system       coredns-668d6bf9bc-hjn7b                        1/1     Running   0          18m
kube-system       etcd-minikube                                   1/1     Running   0          18m
kube-system       kube-apiserver-minikube                         1/1     Running   0          18m
kube-system       kube-controller-manager-minikube                1/1     Running   0          18m
kube-system       kube-proxy-dswn2                                1/1     Running   0          18m
kube-system       kube-scheduler-minikube                         1/1     Running   0          18m
kube-system       nvidia-device-plugin-daemonset-np7pt            1/1     Running   0          18m
kube-system       storage-provisioner                             1/1     Running   0          18m
storage-gluster   glusterfile-provisioner-567dcc98cd-wszgk        1/1     Running   0          18m
storage-gluster   glusterfs-xmtmp                                 0/1     Running   0          18m
storage-gluster   heketi-54cf645f7f-6h48r                         1/1     Running   0          18m

kubectl exec --stdin --tty  llama-32-3b-instruct-prefill-5bcf6d6c8d-cd7wk -c vllm  -- /bin/bash

curl -X 'POST'   'http://localhost:8000/v1/chat/completions'   -H 'accept: application/json'   -H 'Content-Type: application/json'   -d '{
    "model": "Llama-32-3B-Instruct",
    "messages": [
      {
        "content": "Who won the World Series in 1986?",
        "role": "user"
      }
    ],
    "stream": false
  }'
  
  
{"id":"chatcmpl-9ef446d1609e4997b16bc7d19b331127","object":"chat.completion","created":1746501124,"model":"Llama-32-3B-Instruct","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The New York Mets won the World Series in 1986. They defeated the Boston Red Sox in the \"Boston Massacre Game\" (Game 6), winning the series 4 games to 3.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":45,"total_tokens":88,"completion_tokens":43,"prompt_tokens_details":null},"prompt_logprobs":null}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment