# all.yaml: HF token secret, InferenceService, and ClusterServingRuntime for serving Llama 3 via KServe
#
# KServe install, either via the community chart:
# helm upgrade -i kserve community-charts/kserve --version 1.0.1 --set kserve.controller.deploymentMode=RawDeployment
# or via the upstream OCI charts (this is what the setup script below uses):
# kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.0/cert-manager.yaml
# kubectl rollout status -ncert-manager deploy/cert-manager-webhook
# helm upgrade -i kserve-crd oci://ghcr.io/kserve/charts/kserve-crd --version v0.15.0-rc1
# helm upgrade -i kserve oci://ghcr.io/kserve/charts/kserve --version v0.15.0-rc1 --set kserve.controller.deploymentMode=RawDeployment
# kubectl rollout status deploy/kserve-controller-manager
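#
# optional sanity check before applying this file: the KServe CRDs should exist
# kubectl get crd inferenceservices.serving.kserve.io clusterservingruntimes.serving.kserve.io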
apiVersion: v1
kind: Secret
metadata:
  name: hf-secret
type: Opaque
stringData:
  HF_TOKEN: hf_*****   # redacted; substitute a real Hugging Face token
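# equivalent imperative creation (a sketch; substitute the real token for the redacted hf_*****):
#   kubectl create secret generic hf-secret --from-literal=HF_TOKEN=hf_*****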
---
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-llama3
spec:
  predictor:
    maxReplicas: 100
    minReplicas: 1
    deploymentStrategy:
      type: Recreate
    model:
      args:
        # alternative args for the vllm-openai ClusterServingRuntime below, kept for reference:
        # - --model=/mnt/models
        # - --port=8080
        # - --served-model-name=dsp.llama-3.1-70b-instruct
        # - --load-format=safetensors
        # - --dtype=auto
        # - --kv-cache-dtype=auto
        # - --guided-decoding-backend=outlines
        # - --tensor-parallel-size=1
        # - --gpu-memory-utilization=0.99
        # - --max-num-batched-tokens=4096
        # - --max-model-len=4096
        # - --enable-auto-tool-choice
        # - --tool-call-parser=llama3_json
        # - --chat-template=/mnt/models/tool_chat_template_llama3.1_json.jinja
        - --model_name=llama3
        - --load-format=safetensors
        - --kv-cache-dtype=auto
        - --tensor-parallel-size=1
        - --enable-auto-tool-choice
        - --tool-call-parser=llama3_json
        - --model_id=meta-llama/meta-llama-3-8b-instruct
        - --max-num-batched-tokens=2048
        - --max-model-len=2048
        - --gpu-memory-utilization=0.99
        - --dtype=float16
      env:
        # explicitly empty so the OpenAI endpoints are served at /v1/... with no
        # route prefix (the curl smoke test in the setup script relies on this)
        - name: KSERVE_OPENAI_ROUTE_PREFIX
          value: ""
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: HF_TOKEN
              optional: false
      modelFormat:
        name: huggingface
      name: ""
      resources:
        limits:
          cpu: "4"
          memory: 16Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "4"
          memory: 8Gi
          nvidia.com/gpu: "1"
      # uncomment to pin the vllm-openai ClusterServingRuntime defined below:
      # runtime: vllm-openai
      # runtimeVersion: v0.6.3
---
apiVersion: serving.kserve.io/v1alpha1
kind: ClusterServingRuntime
metadata:
  name: vllm-openai
spec:
  annotations:
    prometheus.kserve.io/path: /metrics
    prometheus.kserve.io/port: "8080"
  containers:
    - args:
        - --model
        - /mnt/models
        - --port
        - "8080"
        - --served-model-name
        - '{{.Name}}'   # templated by KServe to the InferenceService name
      image: docker.io/vllm/vllm-openai:v0.6.3
      name: kserve-container
      ports:
        - containerPort: 8080
          protocol: TCP
      resources:
        limits:
          cpu: 1
          memory: 2Gi
        requests:
          cpu: 1
          memory: 2Gi
  supportedModelFormats:
    - autoSelect: false
      name: huggingface
      priority: 2
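# optional sanity checks after applying:
#   kubectl get clusterservingruntime vllm-openai
#   kubectl get inferenceservice huggingface-llama3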
#!/bin/sh
# Create a spot GKE cluster with one GPU node, install cert-manager + KServe,
# deploy the Llama 3 InferenceService from all.yaml, and smoke-test it.
set -e

export CLUSTER="gpu"
export PROJECT="kedify-initial"

#export GPU="nvidia-tesla-t4"
# export MACHINE="n1-highmem-8"
# export DISK="pd-standard"
export GPU="nvidia-l4"
export MACHINE="g2-standard-8"
export DISK="pd-balanced"
# delete any previous cluster with the same name (async; don't abort under `set -e` if it doesn't exist)
gcloud -q beta container clusters delete ${CLUSTER} --zone us-east4-a --project ${PROJECT} --async || true
gcloud beta container clusters create ${CLUSTER} \
  --project ${PROJECT} \
  --zone us-east4-a \
  --release-channel "regular" \
  --machine-type "${MACHINE}" \
  --accelerator "type=${GPU},count=1,gpu-driver-version=default" \
  --image-type "UBUNTU_CONTAINERD" \
  --disk-type "${DISK}" \
  --disk-size "300" \
  --metadata disable-legacy-endpoints=true \
  --service-account "${PROJECT}@${PROJECT}.iam.gserviceaccount.com" \
  --spot \
  --no-enable-intra-node-visibility \
  --max-pods-per-node "110" \
  --num-nodes "1" \
  --logging=SYSTEM,WORKLOAD \
  --monitoring=SYSTEM \
  --enable-ip-alias \
  --security-posture=disabled \
  --workload-vulnerability-scanning=disabled \
  --no-enable-managed-prometheus \
  --default-max-pods-per-node "110" \
  --no-enable-master-authorized-networks \
  --tags=nvidia-ingress-all
sleep 5

# enable node auto-provisioning so additional nodes can be added on demand
gcloud container clusters update ${CLUSTER} \
  --project ${PROJECT} \
  --zone us-east4-a \
  --enable-autoprovisioning \
  --min-cpu=1 --max-cpu=8 --min-memory=1 --max-memory=52 \
  --autoprovisioning-scopes=https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring,https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/compute
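# optional sanity check (the field path is an assumption; adjust if the describe output differs):
# gcloud container clusters describe ${CLUSTER} --zone us-east4-a --project ${PROJECT} --format='value(autoscaling.enableNodeAutoprovisioning)'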
sleep 5

# login
gcloud container clusters get-credentials ${CLUSTER} --zone us-east4-a --project ${PROJECT}
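# optional: confirm the GPU node registered and is Ready (GPU capacity shows up under `kubectl describe node`)
# kubectl get nodes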
# install kserve and the model
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.0/cert-manager.yaml
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} rollout status -ncert-manager deploy/cert-manager-webhook
helm --kube-context gke_kedify-initial_us-east4-a_${CLUSTER} upgrade -i kserve-crd oci://ghcr.io/kserve/charts/kserve-crd --version v0.15.0-rc1
helm --kube-context gke_kedify-initial_us-east4-a_${CLUSTER} upgrade -i kserve oci://ghcr.io/kserve/charts/kserve --version v0.15.0-rc1 --set kserve.controller.deploymentMode=RawDeployment
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} rollout status deploy/kserve-controller-manager
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} apply -f ./all.yaml
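# optional: wait for the InferenceService to report Ready (the first model download can take a while)
# kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} wait --for=condition=Ready isvc/huggingface-llama3 --timeout=15m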
# expose the svc
kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} expose deploy/huggingface-llama3-predictor --type=LoadBalancer --name=llama3 --port=8080 --target-port=8080
# wait until the LoadBalancer gets an external IP
while [ "x$(kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} get svc llama3 -ojsonpath='{.status.loadBalancer.ingress[].ip}')" = "x" ] ; do sleep 1; printf .; done
IP=$(kubectl --context gke_kedify-initial_us-east4-a_${CLUSTER} get svc llama3 -ojsonpath='{.status.loadBalancer.ingress[].ip}')

# smoke test; the JSON body uses double quotes so that ${PROMPT} actually expands
PROMPT="What tool is the best for templating YAMLs in k8s ecosystem?"
curl -XPOST -H 'Content-Type: application/json' -H "Host: huggingface-llama3-predictor" http://${IP}:8080/v1/chat/completions \
  -d "{ \"model\": \"llama3\", \"messages\": [ { \"role\": \"user\", \"content\": \"${PROMPT}\" } ], \"stream\": false, \"max_tokens\": 300 }" | jq
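# to print only the generated text (standard OpenAI chat-completions response shape):
# ... | jq -r '.choices[0].message.content'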
# load test variant (hey takes the target URL directly instead of wrapping curl; its latency report replaces the jq step):
#hey -c 100 -m POST -T 'application/json' -H "Host: huggingface-llama3-predictor" -d "{ \"model\": \"llama3\", \"messages\": [ { \"role\": \"user\", \"content\": \"${PROMPT}\" } ], \"stream\": false, \"max_tokens\": 300 }" http://${IP}:8080/v1/chat/completions
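# cleanup when finished:
# gcloud -q beta container clusters delete ${CLUSTER} --zone us-east4-a --project ${PROJECT} --async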