@nerdalert
Created May 7, 2025 17:22
$ helm template llm-d . --debug --namespace default --values values.yaml
install.go:225: 2025-05-07 17:20:53.000638786 +0000 UTC m=+0.031145623 [debug] Original chart version: ""
install.go:242: 2025-05-07 17:20:53.000679067 +0000 UTC m=+0.031185914 [debug] CHART PATH: /home/ubuntu/tmp/llm-d-deployer/charts/llm-d
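helm template renders the chart entirely client-side, so nothing below has touched the cluster yet. To sanity-check the output against a live API server before installing, one option is a server-side dry run (rendered.yaml is just a scratch file name here):

$ helm template llm-d . --namespace default --values values.yaml > rendered.yaml
$ kubectl apply --dry-run=server -f rendered.yaml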
---
# Source: llm-d/charts/redis/templates/master/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
automountServiceAccountToken: false
metadata:
name: llm-d-redis-master
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
---
# Source: llm-d/templates/modelservice/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: llm-d-llm-d-modelservice
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
imagePullSecrets:
- name: llm-d-pull-secret
---
# Source: llm-d/charts/redis/templates/secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: llm-d-redis
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
type: Opaque
data:
redis-password: "SjZib2ltOWNpaw=="
---
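The Secret above is base64-encoded, not encrypted, so the generated redis password can be recovered locally, or from the cluster after install:

$ echo 'SjZib2ltOWNpaw==' | base64 -d
J6boim9cik
$ kubectl get secret llm-d-redis -n default -o jsonpath='{.data.redis-password}' | base64 -d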
# Source: llm-d/charts/redis/templates/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-redis-configuration
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
data:
redis.conf: |-
# User-supplied common configuration:
# Enable AOF https://redis.io/topics/persistence#append-only-file
appendonly yes
# Disable RDB persistence, AOF persistence already enabled.
save ""
# End of common configuration
master.conf: |-
dir /data
# User-supplied master configuration:
rename-command FLUSHDB ""
rename-command FLUSHALL ""
# End of master configuration
replica.conf: |-
dir /data
# User-supplied replica configuration:
rename-command FLUSHDB ""
rename-command FLUSHALL ""
# End of replica configuration
users.acl: |-
---
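The master.conf and replica.conf blocks rename FLUSHDB and FLUSHALL to the empty string, which removes the commands from the server entirely. Once this config is loaded, a destructive flush fails at command lookup; roughly:

$ redis-cli -h llm-d-redis-master -a "$REDIS_PASSWORD" FLUSHALL
(error) ERR unknown command 'FLUSHALL'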
# Source: llm-d/charts/redis/templates/health-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-redis-health
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
data:
ping_readiness_local.sh: |-
#!/bin/bash
[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD"
response=$(
timeout -s 15 $1 \
redis-cli \
-h localhost \
-p $REDIS_PORT \
ping
)
if [ "$?" -eq "124" ]; then
echo "Timed out"
exit 1
fi
if [ "$response" != "PONG" ]; then
echo "$response"
exit 1
fi
ping_liveness_local.sh: |-
#!/bin/bash
[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD"
response=$(
timeout -s 15 $1 \
redis-cli \
-h localhost \
-p $REDIS_PORT \
ping
)
if [ "$?" -eq "124" ]; then
echo "Timed out"
exit 1
fi
responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}')
if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ] && [ "$responseFirstWord" != "MASTERDOWN" ]; then
echo "$response"
exit 1
fi
ping_readiness_master.sh: |-
#!/bin/bash
[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")"
[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD"
response=$(
timeout -s 15 $1 \
redis-cli \
-h $REDIS_MASTER_HOST \
-p $REDIS_MASTER_PORT_NUMBER \
ping
)
if [ "$?" -eq "124" ]; then
echo "Timed out"
exit 1
fi
if [ "$response" != "PONG" ]; then
echo "$response"
exit 1
fi
ping_liveness_master.sh: |-
#!/bin/bash
[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")"
[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD"
response=$(
timeout -s 15 $1 \
redis-cli \
-h $REDIS_MASTER_HOST \
-p $REDIS_MASTER_PORT_NUMBER \
ping
)
if [ "$?" -eq "124" ]; then
echo "Timed out"
exit 1
fi
responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}')
if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ]; then
echo "$response"
exit 1
fi
ping_readiness_local_and_master.sh: |-
script_dir="$(dirname "$0")"
exit_status=0
"$script_dir/ping_readiness_local.sh" $1 || exit_status=$?
"$script_dir/ping_readiness_master.sh" $1 || exit_status=$?
exit $exit_status
ping_liveness_local_and_master.sh: |-
script_dir="$(dirname "$0")"
exit_status=0
"$script_dir/ping_liveness_local.sh" $1 || exit_status=$?
"$script_dir/ping_liveness_master.sh" $1 || exit_status=$?
exit $exit_status
---
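Each probe script takes a timeout in seconds as $1, prints nothing and exits 0 on success, and maps exit code 124 from timeout to a "Timed out" failure. The redis Deployment further down wires them in as exec probes, but they can also be run by hand against the live pod, e.g. with the 1-second budget the readiness probe uses:

$ kubectl exec deploy/llm-d-redis-master -- /health/ping_readiness_local.sh 1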
# Source: llm-d/charts/redis/templates/scripts-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-redis-scripts
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
data:
start-master.sh: |
#!/bin/bash
[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
if [[ -f /opt/bitnami/redis/mounted-etc/master.conf ]];then
cp /opt/bitnami/redis/mounted-etc/master.conf /opt/bitnami/redis/etc/master.conf
fi
if [[ -f /opt/bitnami/redis/mounted-etc/redis.conf ]];then
cp /opt/bitnami/redis/mounted-etc/redis.conf /opt/bitnami/redis/etc/redis.conf
fi
if [[ -f /opt/bitnami/redis/mounted-etc/users.acl ]];then
cp /opt/bitnami/redis/mounted-etc/users.acl /opt/bitnami/redis/etc/users.acl
fi
ARGS=("--port" "${REDIS_PORT}")
ARGS+=("--requirepass" "${REDIS_PASSWORD}")
ARGS+=("--masterauth" "${REDIS_PASSWORD}")
ARGS+=("--include" "/opt/bitnami/redis/etc/redis.conf")
ARGS+=("--include" "/opt/bitnami/redis/etc/master.conf")
exec redis-server "${ARGS[@]}"
---
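start-master.sh copies the mounted config into place and then execs redis-server with --requirepass/--masterauth plus two --include files, so redis.conf and master.conf are layered on top of the CLI flags. A quick way to confirm the layered config took effect, using the password file the pod mounts (appendonly should report yes per redis.conf above):

$ kubectl exec deploy/llm-d-redis-master -- bash -c \
    'redis-cli -a "$(< /opt/bitnami/redis/secrets/redis-password)" CONFIG GET appendonly'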
# Source: llm-d/templates/modelservice/not-working-presets/basic-gpu-preset.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-basic-gpu-preset
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
data:
configMaps: |
- apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-config-decoder
data:
lmcache-decoder-config.yaml: |
local_cpu: False
max_local_cpu_size: 0
#local_disk:
max_local_disk_size: 0
remote_serde: NULL
enable_nixl: False
- apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-config-prefiller
data:
lmcache-prefiller-config.yaml: |
local_cpu: False
max_local_cpu_size: 0
#local_disk:
max_local_disk_size: 0
remote_serde: NULL
enable_nixl: False
decodeDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
initContainers:
- name: routing-proxy
image: quay.io/llm-d/llm-d-routing-sidecar-dev:0.0.6
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
args:
- "--port=8000"
- "--vllm-port=8001"
ports:
- containerPort: 8000
protocol: TCP
restartPolicy: Always
containers:
- name: vllm
image: quay.io/llm-d/llm-d-dev:0.0.5
securityContext:
allowPrivilegeEscalation: false
args:
- "--port"
- "8001"
- "--kv-transfer-config"
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
env:
# TODO: rm this env for llama
- name: CUDA_VISIBLE_DEVICES
value: "0"
- name: UCX_TLS
value: "cuda_ipc,cuda_copy,tcp"
- name: LMCACHE_CONFIG_FILE
value: /vllm-workspace/lmcache-decoder-config.yaml
- name: LMCACHE_USE_EXPERIMENTAL
value: "True"
- name: VLLM_ENABLE_V1_MULTIPROCESSING
value: "1"
- name: VLLM_WORKER_MULTIPROC_METHOD
value: spawn
- name: HF_HUB_CACHE
value: /vllm-workspace/models
ports:
- containerPort: 55555
protocol: TCP
volumeMounts:
- name: config-decoder
mountPath: /vllm-workspace
- name: model-cache
mountPath: /vllm-workspace/models
volumes:
- name: config-decoder
configMap:
name: llm-d-modelservice-config-decoder
- name: model-cache
emptyDir:
sizeLimit: 1Gi
prefillDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- name: vllm
image: quay.io/llm-d/llm-d-dev:0.0.5
securityContext:
allowPrivilegeEscalation: false
args:
- "--port"
- "8000"
- "--kv-transfer-config"
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
env:
- name: CUDA_VISIBLE_DEVICES
value: "0"
- name: UCX_TLS
value: "cuda_ipc,cuda_copy,tcp"
- name: LMCACHE_CONFIG_FILE
value: /vllm-workspace/lmcache-prefiller-config.yaml
- name: LMCACHE_USE_EXPERIMENTAL
value: "True"
- name: VLLM_ENABLE_V1_MULTIPROCESSING
value: "1"
- name: VLLM_WORKER_MULTIPROC_METHOD
value: spawn
- name: HF_HUB_CACHE
value: /vllm-workspace/models
volumeMounts:
- name: config-prefiller
mountPath: /vllm-workspace
- name: model-cache
mountPath: /vllm-workspace/models
ports:
- containerPort: 8000
protocol: TCP
volumes:
- name: config-prefiller
configMap:
name: llm-d-modelservice-config-prefiller
- name: model-cache
emptyDir:
sizeLimit: 1Gi
decodeService: |
apiVersion: v1
kind: Service
spec:
clusterIP: None
ports:
- name: nixl
port: 55555
protocol: TCP
- name: vllm
port: 8000
protocol: TCP
prefillService: |
apiVersion: v1
kind: Service
spec:
clusterIP: None
ports:
- name: nixl
port: 55555
protocol: TCP
- name: vllm
port: 8000
protocol: TCP
eppService: |
apiVersion: v1
kind: Service
spec:
ports:
- port: 9002 # Needs to match the port of the eppDeployment
protocol: TCP
name: grpc
- port: 9003
protocol: TCP
name: grpc-health
- port: 9090
protocol: TCP
name: metrics
type: NodePort # accepts "LoadBalancer" or "NodePort"
eppDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- args:
- --poolName
- llm-d-modelservice
- --poolNamespace
- default
- -v
- "4"
- --zap-encoder
- "json"
- --grpcPort
- "9002"
- --grpcHealthPort
- "9003"
env:
- name: KVCACHE_INDEXER_REDIS_ADDR
value: llm-d-redis-master.default.svc.cluster.local:6379:8100
image: quay.io/llm-d/llm-d-gateway-api-inference-extension-dev:0.0.5-amd64
imagePullPolicy: Always
livenessProbe:
failureThreshold: 3
grpc:
port: 9003
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: epp
ports:
- name: grpc
containerPort: 9002
protocol: TCP
- name: grpc-health
containerPort: 9003
protocol: TCP
- name: metrics
containerPort: 9090
protocol: TCP
readinessProbe:
failureThreshold: 3
grpc:
port: 9003
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
inferencePool: |
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
spec:
targetPortNumber: 8000
inferenceModel: |
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
---
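Each preset ConfigMap stores whole component manifests (configMaps, decode/prefill Deployments and Services, the EPP, InferencePool and InferenceModel) as string values that the ModelService controller templates and applies per model; note this one still lives under not-working-presets/ in the chart. To list which components a rendered preset defines (assuming jq is installed):

$ kubectl get configmap llm-d-modelservice-basic-gpu-preset -o json | jq '.data | keys'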
# Source: llm-d/templates/modelservice/not-working-presets/basic-sim-preset.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-basic-sim-preset
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
data:
decodeDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- name: vllm
image: quay.io/llm-d/vllm-sim-dev:0.0.4
securityContext:
allowPrivilegeEscalation: false
args:
- "--port"
- "8001"
ports:
- containerPort: 55555
protocol: TCP
decodeService: |
apiVersion: v1
kind: Service
spec:
clusterIP: None
ports:
- name: vllm
port: 8000
protocol: TCP
eppService: |
apiVersion: v1
kind: Service
spec:
ports:
- port: 9002 # Needs to match the port of the eppDeployment
protocol: TCP
name: grpc
- port: 9003
protocol: TCP
name: grpc-health
- port: 9090
protocol: TCP
name: metrics
type: NodePort # accepts "LoadBalancer" or "NodePort"
eppDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- args:
- --poolName
- llm-d-modelservice
- --poolNamespace
- default
- -v
- "4"
- --zap-encoder
- json
- --grpcPort
- "9002"
- --grpcHealthPort
- "9003"
env:
- name: KVCACHE_INDEXER_REDIS_ADDR
value: llm-d-redis-master.default.svc.cluster.local:6379:8100
image: quay.io/llm-d/llm-d-gateway-api-inference-extension-dev:0.0.5-amd64
imagePullPolicy: Always
livenessProbe:
failureThreshold: 3
grpc:
port: 9003
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: epp
ports:
- name: grpc
containerPort: 9002
protocol: TCP
- name: grpc-health
containerPort: 9003
protocol: TCP
- name: metrics
containerPort: 9090
protocol: TCP
readinessProbe:
failureThreshold: 3
grpc:
port: 9003
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
inferencePool: |
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
spec:
targetPortNumber: 8000
inferenceModel: |
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
---
# Source: llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-basic-gpu-with-nixl-preset
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
data:
configMaps: |
- apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-config-decoder
data:
lmcache-decoder-config.yaml: |
local_cpu: False
max_local_cpu_size: 0
max_local_disk_size: 0
remote_serde: NULL
enable_nixl: True
nixl_role: receiver
nixl_peer_host: 0.0.0.0
nixl_peer_port: 55555
nixl_buffer_size: 524288
nixl_buffer_device: "cuda"
nixl_enable_gc: True
- apiVersion: v1
kind: ConfigMap
metadata:
name: llm-d-modelservice-config-prefiller
data:
lmcache-prefiller-config.yaml: |
local_cpu: False
max_local_cpu_size: 0
max_local_disk_size: 0
remote_serde: NULL
enable_nixl: True
nixl_role: "sender"
nixl_peer_host: "{{ .DecodeServiceName }}"
nixl_peer_port: 55555
nixl_buffer_size: 524288
nixl_buffer_device: "cuda"
nixl_enable_gc: True
decodeDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- name: routing-proxy
image: quay.io/llm-d/llm-d-routing-sidecar-dev:0.0.6
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
args:
- "--port=8001"
- "--vllm-port=8000"
ports:
- containerPort: 8000
protocol: TCP
- name: vllm
image: quay.io/llm-d/llm-d-dev:0.0.5
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
args:
- "--port"
- "8000"
- "--kv-transfer-config"
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
env:
- name: XDG_CACHE_HOME
value: /tmp
- name: POD_IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: LMCACHE_DISTRIBUTED_URL
value: ${POD_IP}:80
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: llm-d-hf-token
key: HF_TOKEN
- name: CUDA_VISIBLE_DEVICES
value: "0"
- name: UCX_TLS
value: "cuda_ipc,cuda_copy,tcp"
- name: LMCACHE_USE_EXPERIMENTAL
value: "True"
- name: VLLM_ENABLE_V1_MULTIPROCESSING
value: "1"
- name: VLLM_WORKER_MULTIPROC_METHOD
value: spawn
- name: LMCACHE_CONFIG_FILE
value: /vllm-workspace/lmcache-decoder-config.yaml
- name: LMCACHE_LOOKUP_URL
value: llm-d-redis-master.default.svc.cluster.local:6379
ports:
- containerPort: 8001
protocol: TCP
- containerPort: 55555
protocol: TCP
volumeMounts:
- name: config-decoder
mountPath: /vllm-workspace
- name: model-cache
mountPath: /vllm-workspace/models
- name: model-storage
mountPath: /cache
volumes:
- name: config-decoder
configMap:
name: llm-d-modelservice-config-decoder
- name: model-cache
emptyDir:
sizeLimit: 1Gi
prefillDeployment: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- name: "routing-proxy"
image: quay.io/llm-d/llm-d-routing-sidecar-dev:0.0.6
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
args:
- "--port=8001"
- "--vllm-port=8000"
ports:
- containerPort: 8000
protocol: TCP
- name: vllm
image: quay.io/llm-d/llm-d-dev:0.0.5
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
args:
- "--port"
- "8000"
- "--kv-transfer-config"
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
env:
- name: XDG_CACHE_HOME
value: /tmp
- name: POD_IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: LMCACHE_DISTRIBUTED_URL
value: "${POD_IP}:80"
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: llm-d-hf-token
key: HF_TOKEN
- name: CUDA_VISIBLE_DEVICES
value: "0"
- name: UCX_TLS
value: "cuda_ipc,cuda_copy,tcp"
- name: LMCACHE_USE_EXPERIMENTAL
value: "True"
- name: VLLM_ENABLE_V1_MULTIPROCESSING
value: "1"
- name: VLLM_WORKER_MULTIPROC_METHOD
value: spawn
- name: LMCACHE_CONFIG_FILE
value: /vllm-workspace/lmcache-prefiller-config.yaml
- name: LMCACHE_LOOKUP_URL
value: llm-d-redis-master.default.svc.cluster.local:6379
volumeMounts:
- name: config-prefiller
mountPath: /vllm-workspace
- name: model-cache
mountPath: /vllm-workspace/models
- name: model-storage
mountPath: /cache
ports:
- containerPort: 8001
protocol: TCP
- containerPort: 55555
protocol: TCP
volumes:
- name: config-prefiller
configMap:
name: llm-d-modelservice-config-prefiller
- name: model-cache
emptyDir:
sizeLimit: 1Gi
decodeService: |
apiVersion: v1
kind: Service
metadata:
labels:
llmd.ai/gather-metrics: "true"
spec:
clusterIP: None
ports:
- name: nixl
port: 55555
protocol: TCP
- name: vllm
port: 8000
protocol: TCP
prefillService: |
apiVersion: v1
kind: Service
metadata:
labels:
llmd.ai/gather-metrics: "true"
spec:
clusterIP: None
ports:
- name: nixl
port: 55555
protocol: TCP
- name: vllm
port: 8000
protocol: TCP
eppService: |
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/gateway: llm-d-inference-gateway
llmd.ai/gather-metrics: "true"
spec:
ports:
- port: 9002 # Needs to match the port of the eppDeployment
protocol: TCP
name: grpc
- port: 9003
protocol: TCP
name: grpc-health
- port: 9090
protocol: TCP
name: metrics
type: NodePort # accepts "LoadBalancer" or "NodePort"
selector:
app.kubernetes.io/gateway: llm-d-inference-gateway
eppDeployment: |
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/gateway: llm-d-inference-gateway
spec:
selector:
matchLabels:
app.kubernetes.io/gateway: llm-d-inference-gateway
template:
metadata:
labels:
app.kubernetes.io/gateway: llm-d-inference-gateway
spec:
serviceAccountName: endpoint-picker-sa # manually created in workaround w/ proper RBAC
containers:
- args:
- --poolName
- "{{ .InferencePoolName }}"
- --poolNamespace
- "{{ .ModelServiceNamespace }}"
- -v
- "4"
- --zap-encoder
- json
- --grpcPort
- "9002"
- --grpcHealthPort
- "9003"
env:
- name: KVCACHE_INDEXER_REDIS_ADDR
value: llm-d-redis-master.default.svc.cluster.local:6379:8100
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: llm-d-hf-token
key: HF_TOKEN
image: quay.io/llm-d/llm-d-gateway-api-inference-extension-dev:0.0.5-amd64
imagePullPolicy: IfNotPresent
# livenessProbe:
# failureThreshold: 3
# grpc:
# port: 9003
# service: "{{ .EPPServiceName }}"
# initialDelaySeconds: 5
# periodSeconds: 10
# successThreshold: 1
# timeoutSeconds: 1
# readinessProbe:
# failureThreshold: 3
# grpc:
# port: 9003
# service: "{{ .EPPServiceName }}"
# initialDelaySeconds: 5
# periodSeconds: 10
# successThreshold: 1
# timeoutSeconds: 1
name: epp
ports:
- name: grpc
containerPort: 9002
protocol: TCP
- name: grpc-health
containerPort: 9003
protocol: TCP
- name: metrics
containerPort: 9090
protocol: TCP
inferencePool: |
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
spec:
targetPortNumber: 8000
inferenceModel: |
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
---
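Unlike the Helm values, placeholders such as {{ .DecodeServiceName }}, {{ .InferencePoolName }} and {{ .ModelServiceNamespace }} inside the preset bodies are intentionally left for the ModelService controller to expand at reconcile time, which is why they survive helm template. A quick way to see which controller-side variables remain in the render:

$ helm template llm-d . --namespace default --values values.yaml | grep -n '{{ \.'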
# Source: llm-d/charts/redis/templates/master/pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: redis-data-llm-d-redis-master
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
app.kubernetes.io/component: master
spec:
accessModes:
- "ReadWriteOnce"
resources:
requests:
storage: "5Gi"
---
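The redis PVC asks for 5Gi ReadWriteOnce with no storageClassName, so it falls back to the cluster's default StorageClass; if no default exists the claim stays Pending and the redis pod will not schedule. Worth confirming after install:

$ kubectl get pvc redis-data-llm-d-redis-master -n default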
# Source: llm-d/templates/modelservice/clusterRoleAdmin.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: llm-d-modelservice-admin-role
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- apiGroups:
- llm-d.ai
resources:
- modelservices
verbs:
- '*'
- apiGroups:
- llm-d.ai
resources:
- modelservices/status
verbs:
- get
---
# Source: llm-d/templates/modelservice/clusterRoleEditorRole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: llm-d-modelservice-editor-role
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- apiGroups:
- llm-d.ai
resources:
- modelservices
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- llm-d.ai
resources:
- modelservices/status
verbs:
- get
---
# Source: llm-d/templates/modelservice/clusterRoleManager.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: llm-d-modelservice-manager-role
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- list
- watch
- create
- apiGroups:
- ""
resources:
- services
- serviceaccounts
verbs:
- get
- list
- watch
- create
- update
- apiGroups:
- rbac.authorization.k8s.io
resources:
- roles
- rolebindings
verbs:
- get
- list
- watch
- create
- update
# MSV2 HACK BEGIN ---------------
- apiGroups:
- "v1"
resources:
- "secrets"
verbs:
- "get"
- "list"
- "watch"
- apiGroups:
- "inference.networking.x-k8s.io"
resources:
- "inferencepools"
- "inferencemodels"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- ""
resources:
- "pods"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- "discovery.k8s.io"
resources:
- "endpointslices"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- "authentication.k8s.io"
resources:
- "tokenreviews"
verbs:
- "create"
- apiGroups:
- "authorization.k8s.io"
resources:
- "subjectaccessreviews"
verbs:
- "create"
# MSV2 HACK END ---------------
- apiGroups:
- apps
resources:
- deployments
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- apps
resources:
- deployments/scale
verbs:
- patch
- update
- apiGroups:
- inference.networking.x-k8s.io
resources:
- inferencemodel
- inferencepool
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- llm-d.ai
resources:
- modelservices
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- llm-d.ai
resources:
- modelservices/finalizers
verbs:
- update
- apiGroups:
- llm-d.ai
resources:
- modelservices/status
verbs:
- get
- patch
- update
---
# Source: llm-d/templates/modelservice/clusterRoleMetricsAuth.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: llm-d-modelservice-metrics-auth-role
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
---
# Source: llm-d/templates/modelservice/clusterRoleMetricsReader.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: llm-d-modelservice-metrics-reader
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- nonResourceURLs:
- /metrics
verbs:
- get
---
# Source: llm-d/templates/modelservice/clusterRoleViewer.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: llm-d-modelservice-viewer-role
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- apiGroups:
- llm-d.ai
resources:
- modelservices
verbs:
- get
- list
- watch
- apiGroups:
- llm-d.ai
resources:
- modelservices/status
verbs:
- get
---
# Source: llm-d/templates/modelservice/ms-v2-hack/clusterRole-epp.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: endpoint-picker-clusterrole
rules:
- apiGroups:
- "inference.networking.x-k8s.io"
resources:
- "inferencepools"
- "inferencemodels"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- ""
resources:
- "pods"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- "discovery.k8s.io"
resources:
- "endpointslices"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- "authentication.k8s.io"
resources:
- "tokenreviews"
verbs:
- "create"
- apiGroups:
- "authorization.k8s.io"
resources:
- "subjectaccessreviews"
verbs:
- "create"
---
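The eppDeployment in the nixl preset runs as the manually created endpoint-picker-sa ServiceAccount (see the comment in that preset), and this ClusterRole carries the reads it needs on InferencePools, InferenceModels, pods and EndpointSlices. Assuming the matching ClusterRoleBinding was also created by hand (it is not part of this render), the grants can be spot-checked with:

$ kubectl auth can-i list inferencepools.inference.networking.x-k8s.io \
    --as=system:serviceaccount:default:endpoint-picker-sa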
# Source: llm-d/templates/modelservice/clusterRoleBindingManager.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: llm-d-modelservice-manager-rolebinding
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: llm-d-modelservice-manager-role
subjects:
- kind: ServiceAccount
name: llm-d-llm-d-modelservice
namespace: default
---
# Source: llm-d/templates/modelservice/clusterRoleBindingMetricsAuth.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: llm-d-modelservice-metrics-auth-rolebinding
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: llm-d-modelservice-metrics-auth-role
subjects:
- kind: ServiceAccount
apiGroup: ""
name: llm-d-llm-d-modelservice
namespace: default
---
# Source: llm-d/templates/modelservice/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: llm-d-modelservice
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
rules:
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
---
# Source: llm-d/templates/modelservice/rolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: llm-d-modelservice
namespace: default
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: modelservice
annotations:
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: llm-d-modelservice
subjects:
- kind: ServiceAccount
apiGroup: ""
name: llm-d-llm-d-modelservice
namespace: default
---
# Source: llm-d/charts/redis/templates/headless-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: llm-d-redis-headless
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
spec:
type: ClusterIP
clusterIP: None
ports:
- name: tcp-redis
port: 6379
targetPort: redis
selector:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/name: redis
---
# Source: llm-d/charts/redis/templates/master/service.yaml
apiVersion: v1
kind: Service
metadata:
name: llm-d-redis-master
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
app.kubernetes.io/component: master
spec:
type: ClusterIP
internalTrafficPolicy: Cluster
sessionAffinity: None
ports:
- name: tcp-redis
port: 6379
targetPort: redis
nodePort: null
selector:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/name: redis
app.kubernetes.io/component: master
---
# Source: llm-d/templates/modelservice/service.yaml
apiVersion: v1
kind: Service
metadata:
name: llm-d-modelservice
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
control-plane: controller-manager
app.kubernetes.io/component: modelservice
llmd.ai/gather-metrics: "true"
annotations:
spec:
ports:
- port: 8443
protocol: TCP
targetPort: 8443
selector:
app.kubernetes.io/name: llm-d
app.kubernetes.io/instance: llm-d
control-plane: controller-manager
app.kubernetes.io/component: modelservice
type: ClusterIP
---
# Source: llm-d/charts/redis/templates/master/application.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-d-redis-master
namespace: "default"
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
app.kubernetes.io/component: master
spec:
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/name: redis
app.kubernetes.io/component: master
strategy:
type: RollingUpdate
template:
metadata:
labels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: redis
app.kubernetes.io/version: 7.4.3
helm.sh/chart: redis-20.13.4
app.kubernetes.io/component: master
annotations:
checksum/configmap: 2a9ab4a5432825504d910f022638674ce88eaefe9f9f595ad8bc107377d104fb
checksum/health: aff24913d801436ea469d8d374b2ddb3ec4c43ee7ab24663d5f8ff1a1b6991a9
checksum/scripts: 0717e77fd3bb941f602860e9be4f2ed87b481cddeadf37be463f8512ecde0c3e
checksum/secret: 5448db886ef764f92d30ce06839b56a06ebc552f1facb07eaa4de676f3ec9097
spec:
imagePullSecrets:
- name: llm-d-pull-secret
securityContext:
fsGroup: 1001
fsGroupChangePolicy: Always
supplementalGroups: []
sysctls: []
serviceAccountName: llm-d-redis-master
automountServiceAccountToken: false
affinity:
podAffinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm:
labelSelector:
matchLabels:
app.kubernetes.io/instance: llm-d
app.kubernetes.io/name: redis
app.kubernetes.io/component: master
topologyKey: kubernetes.io/hostname
weight: 1
nodeAffinity:
enableServiceLinks: true
terminationGracePeriodSeconds: 30
containers:
- name: redis
image: registry.redhat.io/rhel9/redis-7:9.5-1744185101
imagePullPolicy: "IfNotPresent"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 1001
runAsNonRoot: true
runAsUser: 1001
seLinuxOptions: {}
seccompProfile:
type: RuntimeDefault
command:
- /bin/bash
args:
- -ec
- /opt/bitnami/scripts/start-scripts/start-master.sh
env:
- name: BITNAMI_DEBUG
value: "false"
- name: REDIS_REPLICATION_MODE
value: master
- name: ALLOW_EMPTY_PASSWORD
value: "no"
- name: REDIS_PASSWORD_FILE
value: "/opt/bitnami/redis/secrets/redis-password"
- name: REDIS_TLS_ENABLED
value: "no"
- name: REDIS_PORT
value: "6379"
ports:
- name: redis
containerPort: 6379
livenessProbe:
initialDelaySeconds: 20
periodSeconds: 5
# One second longer than command timeout should prevent generation of zombie processes.
timeoutSeconds: 6
successThreshold: 1
failureThreshold: 5
exec:
command:
- /bin/bash
- -ec
- /health/ping_liveness_local.sh 5
readinessProbe:
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 2
successThreshold: 1
failureThreshold: 5
exec:
command:
- /bin/bash
- -ec
- /health/ping_readiness_local.sh 1
resources:
limits:
cpu: 250m
memory: 256Mi
requests:
cpu: 100m
memory: 128Mi
volumeMounts:
- name: start-scripts
mountPath: /opt/bitnami/scripts/start-scripts
- name: health
mountPath: /health
- name: redis-password
mountPath: /opt/bitnami/redis/secrets/
- name: redis-data
mountPath: /data
- name: config
mountPath: /opt/bitnami/redis/mounted-etc
- name: empty-dir
mountPath: /opt/bitnami/redis/etc/
subPath: app-conf-dir
- name: empty-dir
mountPath: /tmp
subPath: tmp-dir
volumes:
- name: start-scripts
configMap:
name: llm-d-redis-scripts
defaultMode: 0755
- name: health
configMap:
name: llm-d-redis-health
defaultMode: 0755
- name: redis-password
secret:
secretName: llm-d-redis
items:
- key: redis-password
path: redis-password
- name: config
configMap:
name: llm-d-redis-configuration
- name: empty-dir
emptyDir: {}
- name: redis-data
persistentVolumeClaim:
claimName: redis-data-llm-d-redis-master
---
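A minimal smoke test for the redis master once it is up, reusing REDISCLI_AUTH the same way the health scripts do:

$ kubectl port-forward svc/llm-d-redis-master 6379:6379 &
$ REDISCLI_AUTH="$(kubectl get secret llm-d-redis -o jsonpath='{.data.redis-password}' | base64 -d)" \
    redis-cli -h 127.0.0.1 -p 6379 ping
PONG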
# Source: llm-d/templates/modelservice/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-d-modelservice
namespace: default
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
control-plane: controller-manager
app.kubernetes.io/component: modelservice
annotations:
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llm-d
app.kubernetes.io/instance: llm-d
control-plane: controller-manager
app.kubernetes.io/component: modelservice
template:
metadata:
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
control-plane: controller-manager
app.kubernetes.io/component: modelservice
annotations:
spec:
containers:
- args:
- --metrics-bind-address=:8443
- --leader-elect=false
- --health-probe-bind-address=:8081
# MSV2 HACK BEGIN
- --epp-cluster-role
- endpoint-picker-clusterrole
- --epp-pull-secrets
- "llm-d-pull-secret"
- --pd-pull-secrets
- "llm-d-pull-secret"
# MSV2 HACK END
command:
- /manager
image: quay.io/llm-d/llm-d-model-service:0.0.6
imagePullPolicy: Always
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 8081
scheme: HTTP
initialDelaySeconds: 15
periodSeconds: 20
successThreshold: 1
timeoutSeconds: 1
name: manager
readinessProbe:
failureThreshold: 3
httpGet:
path: /readyz
port: 8081
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: 500m
memory: 128Mi
requests:
cpu: 10m
memory: 64Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
securityContext:
runAsNonRoot: true
seccompProfile:
type: RuntimeDefault
serviceAccountName: llm-d-llm-d-modelservice
---
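The manager container serves its liveness and readiness endpoints on :8081 and metrics on :8443, and the MSV2 hack flags above hand it the EPP ClusterRole and pull-secret names at startup. After install, the usual checks apply:

$ kubectl rollout status deploy/llm-d-modelservice
$ kubectl logs deploy/llm-d-modelservice -c manager --tail=20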
# Source: llm-d/templates/sample-application/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: llm-d-inference-gateway
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/gateway: llm-d-inference-gateway
app.kubernetes.io/component: sample-application
annotations:
spec:
rules:
- http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: llm-d-inference-gateway
port:
number: 80
---
# Source: llm-d/templates/inference-gateway/gateway.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: llm-d-inference-gateway
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/gateway: llm-d-inference-gateway
app.kubernetes.io/component: inference-gateway
annotations:
spec:
gatewayClassName: "kgateway"
listeners:
- name: default
port: 80
protocol: HTTP
---
# Source: llm-d/templates/inference-gateway/gatewayparameters.yaml
apiVersion: gateway.kgateway.dev/v1alpha1
kind: GatewayParameters
metadata:
name: llm-d-inference-gateway
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/gateway: llm-d-inference-gateway
app.kubernetes.io/component: inference-gateway
annotations:
spec:
kube:
envoyContainer:
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
runAsNonRoot: true
seccompProfile:
type: RuntimeDefault
service:
type: "NodePort"
extraLabels:
gateway: custom
podTemplate:
extraLabels:
gateway: custom
sdsContainer:
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
seccompProfile:
type: RuntimeDefault
---
# Source: llm-d/templates/sample-application/httproutes.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: llama-3-2-3b-instruct
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: sample-application
annotations:
spec:
parentRefs:
- name: llm-d-inference-gateway
rules:
- matches:
- path:
type: PathPrefix
value: /
backendRefs:
- group: inference.networking.x-k8s.io
kind: InferencePool
name: "llama-3-2-3b-instruct-inference-pool"
port: 8000
---
# Source: llm-d/templates/sample-application/modelservice.yaml
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
name: llama-3-2-3b-instruct
labels:
app.kubernetes.io/name: llm-d
helm.sh/chart: llm-d-0.1.1
app.kubernetes.io/instance: llm-d
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/version: "0.0.1"
app.kubernetes.io/component: sample-application
annotations:
spec:
decoupleScaling: false
baseConfigMapRef:
name: llm-d-modelservice-basic-gpu-with-nixl-preset
routing:
modelName: Llama-3.2-3B-Instruct
modelArtifacts:
uri: pvc://llama-3.2-3b-instruct-pvc/models/meta-llama/Llama-32-3B-Instruct
decode:
replicas: 1
containers:
- name: "vllm"
command:
- vllm
- serve
args:
- "/cache/models/meta-llama/Llama-3.2-3B-Instruct"
- "--served-model-name"
- Llama-3.2-3B-Instruct
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
prefill:
replicas: 1
containers:
- name: "vllm"
command:
- vllm
- serve
args:
- "/cache/models/meta-llama/Llama-3.2-3B-Instruct"
- "--served-model-name"
- Llama-3.2-3B-Instruct
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
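End to end: the Gateway listens on HTTP/80, the HTTPRoute sends every path to the llama-3-2-3b-instruct InferencePool on port 8000, and vLLM speaks the OpenAI-compatible API. Assuming the kgateway-provisioned Service is named llm-d-inference-gateway (the Ingress above points at it) and is reachable via port-forward, a completion request looks like:

$ kubectl port-forward svc/llm-d-inference-gateway 8080:80 &
$ curl -s http://127.0.0.1:8080/v1/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "Llama-3.2-3B-Instruct", "prompt": "Hello", "max_tokens": 16}'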