# helm upgrade -i kserve community-charts/kserve --version 1.0.1 --set kserve.controller.deploymentMode=RawDeployment
# kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.0/cert-manager.yaml
# kubectl rollout status -ncert-manager deploy/cert-manager-webhook
# helm upgrade -i kserve-crd oci://ghcr.io/kserve/charts/kserve-crd --version v0.15.0-rc1
# helm upgrade -i kserve oci://ghcr.io/kserve/charts/kserve --version v0.15.0-rc1 --set kserve.controller.deploymentMode=RawDeployment
# kubectl rollout status deploy/kserve-controller-manager
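#
# The manifests below create: a Secret holding the Hugging Face token, an
# InferenceService serving meta-llama/meta-llama-3-8b-instruct through the
# "huggingface" model format, and a vLLM ClusterServingRuntime (only referenced
# by the commented-out `runtime:` field in the InferenceService).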
apiVersion: v1
kind: Secret
metadata:
  name: hf-secret
type: Opaque
stringData:
  HF_TOKEN: hf_*****
---
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-llama3
spec:
  predictor:
    maxReplicas: 100
    minReplicas: 1
    deploymentStrategy:
      type: Recreate
    model:
      args:
      # - --model=/mnt/models
      # - --port=8080
      # - --served-model-name=dsp.llama-3.1-70b-instruct
      # - --load-format=safetensors
      # - --dtype=auto
      # - --kv-cache-dtype=auto
      # - --guided-decoding-backend=outlines
      # - --tensor-parallel-size=1
      # - --gpu-memory-utilization=0.99
      # - --max-num-batched-tokens=4096
      # - --max-model-len=4096
      # - --enable-auto-tool-choice
      # - --tool-call-parser=llama3_json
      # - --chat-template=/mnt/models/tool_chat_template_llama3.1_json.jinja
      - --model_name=llama3
      - --load-format=safetensors
      - --kv-cache-dtype=auto
      - --tensor-parallel-size=1
      - --enable-auto-tool-choice
      - --tool-call-parser=llama3_json
      - --model_id=meta-llama/meta-llama-3-8b-instruct
      - --max-num-batched-tokens=2048
      - --max-model-len=2048
      - --gpu-memory-utilization=0.99
      - --dtype=float16
      env:
      - name: KSERVE_OPENAI_ROUTE_PREFIX
      - name: HF_TOKEN
        valueFrom:
          secretKeyRef:
            name: hf-secret
            key: HF_TOKEN
            optional: false
      modelFormat:
        name: huggingface
      name: ""
      resources:
        limits:
          cpu: "4"
          memory: 16Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "4"
          memory: 8Gi
          nvidia.com/gpu: "1"
      # runtime: vllm-openai
      # runtimeVersion: v0.6.3
---
apiVersion: serving.kserve.io/v1alpha1
kind: ClusterServingRuntime
metadata:
  name: vllm-openai
spec:
  annotations:
    prometheus.kserve.io/path: /metrics
    prometheus.kserve.io/port: "8080"
  containers:
  - args:
    - --model
    - /mnt/models
    - --port
    - "8080"
    - --served-model-name
    - '{{.Name}}'
    image: docker.io/vllm/vllm-openai:v0.6.3
    name: kserve-container
    ports:
    - containerPort: 8080
      protocol: TCP
    resources:
      limits:
        cpu: 1
        memory: 2Gi
      requests:
        cpu: 1
        memory: 2Gi
  supportedModelFormats:
  - autoSelect: false
    name: huggingface
    priority: 2
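# Once applied, the InferenceService should eventually report READY=True:
#   kubectl get inferenceservice huggingface-llama3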
#!/bin/sh
set -e
export CLUSTER="gpu"
export PROJECT="kedify-initial"
#export GPU="nvidia-tesla-t4"
# export MACHINE="n1-highmem-8"
# export DISK="pd-standard"
export GPU="nvidia-l4"
export MACHINE="g2-standard-8"
export DISK="pd-balanced"
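# recreate the cluster from scratch: delete any existing one asynchronously,
# then create a single-node spot cluster with one ${GPU} accelerator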
gcloud -q beta container clusters delete ${CLUSTER} --zone us-east4-a --project ${PROJECT} --async
gcloud beta container clusters create ${CLUSTER} \
--project ${PROJECT} \
--zone us-east4-a \
--release-channel "regular" \
--machine-type "${MACHINE}" \
--accelerator "type=${GPU},count=1,gpu-driver-version=default" \
--image-type "UBUNTU_CONTAINERD" \
--disk-type "${DISK}" \
--disk-size "300" \
--metadata disable-legacy-endpoints=true \
--service-account "${PROJECT}@${PROJECT}.iam.gserviceaccount.com" \
--spot \
--no-enable-intra-node-visibility \
--max-pods-per-node "110" \
--num-nodes "1" \
--logging=SYSTEM,WORKLOAD \
--monitoring=SYSTEM \
--enable-ip-alias \
--security-posture=disabled \
--workload-vulnerability-scanning=disabled \
--no-enable-managed-prometheus \
--default-max-pods-per-node "110" \
--no-enable-master-authorized-networks \
--tags=nvidia-ingress-all
sleep 5
gcloud container clusters update ${CLUSTER} \
--project ${PROJECT} \
--zone us-east4-a \
--enable-autoprovisioning \
--min-cpu=1 --max-cpu=8 --min-memory=1 --max-memory=52 \
--autoprovisioning-scopes=https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring,https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/compute
sleep 5
# login
gcloud container clusters get-credentials ${CLUSTER} --zone us-east4-a --project ${PROJECT}
# install kserve and model
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.0/cert-manager.yaml
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} rollout status -ncert-manager deploy/cert-manager-webhook
helm --kube-context gke_kedify-initial_us-east4-a_${CLUSTER} upgrade -i kserve-crd oci://ghcr.io/kserve/charts/kserve-crd --version v0.15.0-rc1
helm --kube-context gke_kedify-initial_us-east4-a_${CLUSTER} upgrade -i kserve oci://ghcr.io/kserve/charts/kserve --version v0.15.0-rc1 --set kserve.controller.deploymentMode=RawDeployment
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} rollout status deploy/kserve-controller-manager
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} apply -f ./all.yaml
# expose the svc
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} expose deploy/huggingface-llama3-predictor --type=LoadBalancer --name=llama3 --port=8080 --target-port=8080
while [ "x$(kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} get svc llama3 -ojsonpath='{.status.loadBalancer.ingress[].ip}')" = "x" ] ; do sleep 1;printf .; done
IP=$(kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} get svc llama3 -ojsonpath='{.status.loadBalancer.ingress[].ip}')
PROMPT="What tool is the best for templating YAMLs in k8s ecosystem?"
curl -XPOST -H 'Content-Type: application/json' -H "Host: huggingface-llama3-predictor" http://${IP}:8080/v1/chat/completions -d "{ \"model\": \"llama3\", \"messages\": [ { \"role\": \"user\", \"content\": \"${PROMPT}\"} ], \"stream\": false, \"max_tokens\": 300 }" | jq
#hey -c 100 -m POST -T 'application/json' -H "Host: huggingface-llama3-predictor" -d "{ \"model\": \"llama3\", \"messages\": [ { \"role\": \"user\", \"content\": \"${PROMPT}\"} ], \"stream\": false, \"max_tokens\": 300 }" http://${IP}:8080/v1/chat/completions
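# tear the cluster down again when done (same delete as at the top of the script):
#   gcloud -q beta container clusters delete ${CLUSTER} --zone us-east4-a --project ${PROJECT} --async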