Created
May 7, 2025 17:22
-
-
Save nerdalert/53f6e09e042358cdd002041f138b7144 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ helm template llm-d . --debug --namespace default --values values.yaml | |
install.go:225: 2025-05-07 17:20:53.000638786 +0000 UTC m=+0.031145623 [debug] Original chart version: "" | |
install.go:242: 2025-05-07 17:20:53.000679067 +0000 UTC m=+0.031185914 [debug] CHART PATH: /home/ubuntu/tmp/llm-d-deployer/charts/llm-d | |
--- | |
# Source: llm-d/charts/redis/templates/master/serviceaccount.yaml | |
apiVersion: v1 | |
kind: ServiceAccount | |
automountServiceAccountToken: false | |
metadata: | |
name: llm-d-redis-master | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
--- | |
# Source: llm-d/templates/modelservice/serviceaccount.yaml | |
apiVersion: v1 | |
kind: ServiceAccount | |
metadata: | |
name: llm-d-llm-d-modelservice | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
imagePullSecrets: | |
- name: llm-d-pull-secret | |
--- | |
# Source: llm-d/charts/redis/templates/secret.yaml | |
apiVersion: v1 | |
kind: Secret | |
metadata: | |
name: llm-d-redis | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
type: Opaque | |
data: | |
redis-password: "SjZib2ltOWNpaw==" | |
--- | |
# Source: llm-d/charts/redis/templates/configmap.yaml | |
apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-redis-configuration | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
data: | |
redis.conf: |- | |
# User-supplied common configuration: | |
# Enable AOF https://redis.io/topics/persistence#append-only-file | |
appendonly yes | |
# Disable RDB persistence, AOF persistence already enabled. | |
save "" | |
# End of common configuration | |
master.conf: |- | |
dir /data | |
# User-supplied master configuration: | |
rename-command FLUSHDB "" | |
rename-command FLUSHALL "" | |
# End of master configuration | |
replica.conf: |- | |
dir /data | |
# User-supplied replica configuration: | |
rename-command FLUSHDB "" | |
rename-command FLUSHALL "" | |
# End of replica configuration | |
users.acl: |- | |
--- | |
# Source: llm-d/charts/redis/templates/health-configmap.yaml | |
apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-redis-health | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
data: | |
ping_readiness_local.sh: |- | |
#!/bin/bash | |
[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")" | |
[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD" | |
response=$( | |
timeout -s 15 $1 \ | |
redis-cli \ | |
-h localhost \ | |
-p $REDIS_PORT \ | |
ping | |
) | |
if [ "$?" -eq "124" ]; then | |
echo "Timed out" | |
exit 1 | |
fi | |
if [ "$response" != "PONG" ]; then | |
echo "$response" | |
exit 1 | |
fi | |
ping_liveness_local.sh: |- | |
#!/bin/bash | |
[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")" | |
[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD" | |
response=$( | |
timeout -s 15 $1 \ | |
redis-cli \ | |
-h localhost \ | |
-p $REDIS_PORT \ | |
ping | |
) | |
if [ "$?" -eq "124" ]; then | |
echo "Timed out" | |
exit 1 | |
fi | |
responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}') | |
if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ] && [ "$responseFirstWord" != "MASTERDOWN" ]; then | |
echo "$response" | |
exit 1 | |
fi | |
ping_readiness_master.sh: |- | |
#!/bin/bash | |
[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")" | |
[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD" | |
response=$( | |
timeout -s 15 $1 \ | |
redis-cli \ | |
-h $REDIS_MASTER_HOST \ | |
-p $REDIS_MASTER_PORT_NUMBER \ | |
ping | |
) | |
if [ "$?" -eq "124" ]; then | |
echo "Timed out" | |
exit 1 | |
fi | |
if [ "$response" != "PONG" ]; then | |
echo "$response" | |
exit 1 | |
fi | |
ping_liveness_master.sh: |- | |
#!/bin/bash | |
[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")" | |
[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD" | |
response=$( | |
timeout -s 15 $1 \ | |
redis-cli \ | |
-h $REDIS_MASTER_HOST \ | |
-p $REDIS_MASTER_PORT_NUMBER \ | |
ping | |
) | |
if [ "$?" -eq "124" ]; then | |
echo "Timed out" | |
exit 1 | |
fi | |
responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}') | |
if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ]; then | |
echo "$response" | |
exit 1 | |
fi | |
ping_readiness_local_and_master.sh: |- | |
script_dir="$(dirname "$0")" | |
exit_status=0 | |
"$script_dir/ping_readiness_local.sh" $1 || exit_status=$? | |
"$script_dir/ping_readiness_master.sh" $1 || exit_status=$? | |
exit $exit_status | |
ping_liveness_local_and_master.sh: |- | |
script_dir="$(dirname "$0")" | |
exit_status=0 | |
"$script_dir/ping_liveness_local.sh" $1 || exit_status=$? | |
"$script_dir/ping_liveness_master.sh" $1 || exit_status=$? | |
exit $exit_status | |
--- | |
# Source: llm-d/charts/redis/templates/scripts-configmap.yaml | |
apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-redis-scripts | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
data: | |
start-master.sh: | | |
#!/bin/bash | |
[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")" | |
if [[ -f /opt/bitnami/redis/mounted-etc/master.conf ]];then | |
cp /opt/bitnami/redis/mounted-etc/master.conf /opt/bitnami/redis/etc/master.conf | |
fi | |
if [[ -f /opt/bitnami/redis/mounted-etc/redis.conf ]];then | |
cp /opt/bitnami/redis/mounted-etc/redis.conf /opt/bitnami/redis/etc/redis.conf | |
fi | |
if [[ -f /opt/bitnami/redis/mounted-etc/users.acl ]];then | |
cp /opt/bitnami/redis/mounted-etc/users.acl /opt/bitnami/redis/etc/users.acl | |
fi | |
ARGS=("--port" "${REDIS_PORT}") | |
ARGS+=("--requirepass" "${REDIS_PASSWORD}") | |
ARGS+=("--masterauth" "${REDIS_PASSWORD}") | |
ARGS+=("--include" "/opt/bitnami/redis/etc/redis.conf") | |
ARGS+=("--include" "/opt/bitnami/redis/etc/master.conf") | |
exec redis-server "${ARGS[@]}" | |
--- | |
# Source: llm-d/templates/modelservice/not-working-presets/basic-gpu-preset.yaml | |
apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-basic-gpu-preset | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
data: | |
configMaps: | | |
- apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-config-decoder | |
data: | |
lmcache-decoder-config.yaml: | | |
local_cpu: False | |
max_local_cpu_size: 0 | |
#local_disk: | |
max_local_disk_size: 0 | |
remote_serde: NULL | |
enable_nixl: False | |
- apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-config-prefiller | |
data: | |
lmcache-prefiller-config.yaml: | | |
local_cpu: False | |
max_local_cpu_size: 0 | |
#local_disk: | |
max_local_disk_size: 0 | |
remote_serde: NULL | |
enable_nixl: False | |
decodeDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
initContainers: | |
- name: routing-proxy | |
image: quay.io/llm-d/llm-d-routing-sidecar-dev:0.0.6 | |
securityContext: | |
allowPrivilegeEscalation: false | |
runAsNonRoot: true | |
args: | |
- "--port=8000" | |
- "--vllm-port=8001" | |
ports: | |
- containerPort: 8000 | |
protocol: TCP | |
restartPolicy: Always | |
containers: | |
- name: vllm | |
image: quay.io/llm-d/llm-d-dev:0.0.5 | |
securityContext: | |
allowPrivilegeEscalation: false | |
args: | |
- "--port" | |
- "8001" | |
- "--kv-transfer-config" | |
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}' | |
env: | |
# TODO: rm this env for llama | |
- name: CUDA_VISIBLE_DEVICES | |
value: "0" | |
- name: UCX_TLS | |
value: "cuda_ipc,cuda_copy,tcp" | |
- name: LMCACHE_CONFIG_FILE | |
value: /vllm-workspace/lmcache-decoder-config.yaml | |
- name: LMCACHE_USE_EXPERIMENTAL | |
value: "True" | |
- name: VLLM_ENABLE_V1_MULTIPROCESSING | |
value: "1" | |
- name: VLLM_WORKER_MULTIPROC_METHOD | |
value: spawn | |
- name: HF_HUB_CACHE | |
value: /vllm-workspace/models | |
ports: | |
- containerPort: 55555 | |
protocol: TCP | |
volumeMounts: | |
- name: config-decoder | |
mountPath: /vllm-workspace | |
- name: model-cache | |
mountPath: /vllm-workspace/models | |
volumes: | |
- name: config-decoder | |
configMap: | |
name: llm-d-modelservice-config-decoder | |
- name: model-cache | |
emptyDir: | |
sizeLimit: 1Gi | |
prefillDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
containers: | |
- name: vllm | |
image: quay.io/llm-d/llm-d-dev:0.0.5 | |
securityContext: | |
allowPrivilegeEscalation: false | |
args: | |
- "--port" | |
- "8000" | |
- "--kv-transfer-config" | |
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}' | |
env: | |
- name: CUDA_VISIBLE_DEVICES | |
value: "0" | |
- name: UCX_TLS | |
value: "cuda_ipc,cuda_copy,tcp" | |
- name: LMCACHE_CONFIG_FILE | |
value: /vllm-workspace/lmcache-prefiller-config.yaml | |
- name: LMCACHE_USE_EXPERIMENTAL | |
value: "True" | |
- name: VLLM_ENABLE_V1_MULTIPROCESSING | |
value: "1" | |
- name: VLLM_WORKER_MULTIPROC_METHOD | |
value: spawn | |
- name: HF_HUB_CACHE | |
value: /vllm-workspace/models | |
volumeMounts: | |
- name: config-prefiller | |
mountPath: /vllm-workspace | |
- name: model-cache | |
mountPath: /vllm-workspace/models | |
ports: | |
- containerPort: 8000 | |
protocol: TCP | |
volumes: | |
- name: config-prefiller | |
configMap: | |
name: llm-d-modelservice-config-prefiller | |
- name: model-cache | |
emptyDir: | |
sizeLimit: 1Gi | |
decodeService: | | |
apiVersion: v1 | |
kind: Service | |
spec: | |
clusterIP: None | |
ports: | |
- name: nixl | |
port: 55555 | |
protocol: TCP | |
- name: vllm | |
port: 8000 | |
protocol: TCP | |
prefillService: | | |
apiVersion: v1 | |
kind: Service | |
spec: | |
clusterIP: None | |
ports: | |
- name: nixl | |
port: 55555 | |
protocol: TCP | |
- name: vllm | |
port: 8000 | |
protocol: TCP | |
eppService: | | |
apiVersion: v1 | |
kind: Service | |
spec: | |
ports: | |
- port: 9002 # Needs to match the port of the eppDeployment | |
protocol: TCP | |
name: grpc | |
- port: 9003 | |
protocol: TCP | |
name: grpc-health | |
- port: 9090 | |
protocol: TCP | |
name: metrics | |
type: NodePort # accepts "LoadBalancer" or "NodePort" | |
eppDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
containers: | |
- args: | |
- --poolName | |
- llm-d-modelservice | |
- --poolNamespace | |
- default | |
- -v | |
- "4" | |
- --zap-encoder | |
- "json" | |
- --grpcPort | |
- -"9002" | |
- --grpcHealthPort | |
- -"9003" | |
env: | |
- name: KVCACHE_INDEXER_REDIS_ADDR | |
value: llm-d-redis-master.default.svc.cluster.local:6379:8100 | |
image: quay.io/llm-d/llm-d-gateway-api-inference-extension-dev:0.0.5-amd64 | |
imagePullPolicy: Always | |
livenessProbe: | |
failureThreshold: 3 | |
grpc: | |
port: 9003 | |
initialDelaySeconds: 5 | |
periodSeconds: 10 | |
successThreshold: 1 | |
timeoutSeconds: 1 | |
name: epp | |
ports: | |
- name: grpc | |
containerPort: 9002 | |
protocol: TCP | |
- name: grpc-health | |
containerPort: 9003 | |
protocol: TCP | |
- name: metrics | |
containerPort: 9090 | |
protocol: TCP | |
readinessProbe: | |
failureThreshold: 3 | |
grpc: | |
port: 9003 | |
initialDelaySeconds: 5 | |
periodSeconds: 10 | |
successThreshold: 1 | |
timeoutSeconds: 1 | |
inferencePool: | | |
apiVersion: inference.networking.x-k8s.io/v1alpha2 | |
kind: InferencePool | |
spec: | |
targetPortNumber: 8000 | |
inferenceModel: | | |
apiVersion: inference.networking.x-k8s.io/v1alpha2 | |
kind: InferenceModel | |
--- | |
# Source: llm-d/templates/modelservice/not-working-presets/basic-sim-preset.yaml | |
apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-basic-sim-preset | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
data: | |
decodeDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
containers: | |
- name: vllm | |
image: quay.io/llm-d/vllm-sim-dev:0.0.4 | |
securityContext: | |
allowPrivilegeEscalation: false | |
args: | |
- "--port" | |
- "8001" | |
ports: | |
- containerPort: 55555 | |
protocol: TCP | |
decodeService: | | |
apiVersion: v1 | |
kind: Service | |
spec: | |
clusterIP: None | |
ports: | |
- name: vllm | |
port: 8000 | |
protocol: TCP | |
eppService: | | |
apiVersion: v1 | |
kind: Service | |
spec: | |
ports: | |
- port: 9002 # Needs to match the port of the eppDeployment | |
protocol: TCP | |
name: grpc | |
- port: 9003 | |
protocol: TCP | |
name: grpc-health | |
- port: 9090 | |
protocol: TCP | |
name: metrics | |
type: NodePort # accepts "LoadBalancer" or "NodePort" | |
eppDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
containers: | |
- args: | |
- --poolName | |
- llm-d-modelservice | |
- --poolNamespace | |
- default | |
- -v | |
- "4" | |
- --zap-encoder | |
- json | |
- --grpcPort | |
- "9002" | |
- --grpcHealthPort | |
- "9003" | |
env: | |
- name: KVCACHE_INDEXER_REDIS_ADDR | |
value: llm-d-redis-master.default.svc.cluster.local:6379:8100 | |
image: quay.io/llm-d/llm-d-gateway-api-inference-extension-dev:0.0.5-amd64 | |
imagePullPolicy: Always | |
livenessProbe: | |
failureThreshold: 3 | |
grpc: | |
port: 9003 | |
initialDelaySeconds: 5 | |
periodSeconds: 10 | |
successThreshold: 1 | |
timeoutSeconds: 1 | |
name: epp | |
ports: | |
- name: grpc | |
containerPort: 9002 | |
protocol: TCP | |
- name: grpc-health | |
containerPort: 9003 | |
protocol: TCP | |
- name: metrics | |
containerPort: 9090 | |
protocol: TCP | |
readinessProbe: | |
failureThreshold: 3 | |
grpc: | |
port: 9003 | |
initialDelaySeconds: 5 | |
periodSeconds: 10 | |
successThreshold: 1 | |
timeoutSeconds: 1 | |
inferencePool: | | |
apiVersion: inference.networking.x-k8s.io/v1alpha2 | |
kind: InferencePool | |
spec: | |
targetPortNumber: 8000 | |
inferenceModel: | | |
apiVersion: inference.networking.x-k8s.io/v1alpha2 | |
kind: InferenceModel | |
--- | |
# Source: llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml | |
apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-basic-gpu-with-nixl-preset | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
data: | |
configMaps: | | |
- apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-config-decoder | |
data: | |
lmcache-decoder-config.yaml: | | |
local_cpu: False | |
max_local_cpu_size: 0 | |
max_local_disk_size: 0 | |
remote_serde: NULL | |
enable_nixl: True | |
nixl_role: receiver | |
nixl_peer_host: 0.0.0.0 | |
nixl_peer_port: 55555 | |
nixl_buffer_size: 524288 | |
nixl_buffer_device: "cuda" | |
nixl_enable_gc: True | |
- apiVersion: v1 | |
kind: ConfigMap | |
metadata: | |
name: llm-d-modelservice-config-prefiller | |
data: | |
lmcache-prefiller-config.yaml: | | |
local_cpu: False | |
max_local_cpu_size: 0 | |
max_local_disk_size: 0 | |
remote_serde: NULL | |
enable_nixl: True | |
nixl_role: "sender" | |
nixl_peer_host: "{{ .DecodeServiceName }}" | |
nixl_peer_port: 55555 | |
nixl_buffer_size: 524288 | |
nixl_buffer_device: "cuda" | |
nixl_enable_gc: True | |
decodeDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
containers: | |
- name: routing-proxy | |
image: quay.io/llm-d/llm-d-routing-sidecar-dev:0.0.6 | |
securityContext: | |
allowPrivilegeEscalation: false | |
runAsNonRoot: true | |
args: | |
- "--port=8001" | |
- "--vllm-port=8000" | |
ports: | |
- containerPort: 8000 | |
protocol: TCP | |
- name: vllm | |
image: quay.io/llm-d/llm-d-dev:0.0.5 | |
imagePullPolicy: IfNotPresent | |
securityContext: | |
allowPrivilegeEscalation: false | |
args: | |
- "--port" | |
- "8000" | |
- "--kv-transfer-config" | |
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}' | |
env: | |
- name: XDG_CACHE_HOME | |
value: /tmp | |
- name: POD_IP | |
valueFrom: | |
fieldRef: | |
apiVersion: v1 | |
fieldPath: status.podIP | |
- name: LMCACHE_DISTRIBUTED_URL | |
value: ${POD_IP}:80 | |
- name: HF_TOKEN | |
valueFrom: | |
secretKeyRef: | |
name: llm-d-hf-token | |
key: HF_TOKEN | |
- name: CUDA_VISIBLE_DEVICES | |
value: "0" | |
- name: UCX_TLS | |
value: "cuda_ipc,cuda_copy,tcp" | |
- name: LMCACHE_USE_EXPERIMENTAL | |
value: "True" | |
- name: VLLM_ENABLE_V1_MULTIPROCESSING | |
value: "1" | |
- name: VLLM_WORKER_MULTIPROC_METHOD | |
value: spawn | |
- name: LMCACHE_CONFIG_FILE | |
value: /vllm-workspace/lmcache-decoder-config.yaml | |
- name: LMCACHE_LOOKUP_URL | |
value: llm-d-redis-master.default.svc.cluster.local:6379 | |
ports: | |
- containerPort: 8001 | |
protocol: TCP | |
- containerPort: 55555 | |
protocol: TCP | |
volumeMounts: | |
- name: config-decoder | |
mountPath: /vllm-workspace | |
- name: model-cache | |
mountPath: /vllm-workspace/models | |
- name: model-storage | |
mountPath: /cache | |
volumes: | |
- name: config-decoder | |
configMap: | |
name: llm-d-modelservice-config-decoder | |
- name: model-cache | |
emptyDir: | |
sizeLimit: 1Gi | |
prefillDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
spec: | |
template: | |
spec: | |
containers: | |
- name: "routing-proxy" | |
image: quay.io/llm-d/llm-d-routing-sidecar-dev:0.0.6 | |
securityContext: | |
allowPrivilegeEscalation: false | |
runAsNonRoot: true | |
args: | |
- "--port=8001" | |
- "--vllm-port=8000" | |
ports: | |
- containerPort: 8000 | |
protocol: TCP | |
- name: vllm | |
image: quay.io/llm-d/llm-d-dev:0.0.5 | |
imagePullPolicy: IfNotPresent | |
securityContext: | |
allowPrivilegeEscalation: false | |
args: | |
- "--port" | |
- "8000" | |
- "--kv-transfer-config" | |
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}' | |
env: | |
- name: XDG_CACHE_HOME | |
value: /tmp | |
- name: POD_IP | |
valueFrom: | |
fieldRef: | |
apiVersion: v1 | |
fieldPath: status.podIP | |
- name: LMCACHE_DISTRIBUTED_URL | |
value: "${POD_IP}:80" | |
- name: HF_TOKEN | |
valueFrom: | |
secretKeyRef: | |
name: llm-d-hf-token | |
key: HF_TOKEN | |
- name: CUDA_VISIBLE_DEVICES | |
value: "0" | |
- name: UCX_TLS | |
value: "cuda_ipc,cuda_copy,tcp" | |
- name: LMCACHE_USE_EXPERIMENTAL | |
value: "True" | |
- name: VLLM_ENABLE_V1_MULTIPROCESSING | |
value: "1" | |
- name: VLLM_WORKER_MULTIPROC_METHOD | |
value: spawn | |
- name: LMCACHE_CONFIG_FILE | |
value: /vllm-workspace/lmcache-prefiller-config.yaml | |
- name: LMCACHE_LOOKUP_URL | |
value: llm-d-redis-master.default.svc.cluster.local:6379 | |
volumeMounts: | |
- name: config-prefiller | |
mountPath: /vllm-workspace | |
- name: model-cache | |
mountPath: /vllm-workspace/models | |
- name: model-storage | |
mountPath: /cache | |
ports: | |
- containerPort: 8001 | |
protocol: TCP | |
- containerPort: 55555 | |
protocol: TCP | |
volumes: | |
- name: config-prefiller | |
configMap: | |
name: llm-d-modelservice-config-prefiller | |
- name: model-cache | |
emptyDir: | |
sizeLimit: 1Gi | |
decodeService: | | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
labels: | |
llmd.ai/gather-metrics: "true" | |
spec: | |
clusterIP: None | |
ports: | |
- name: nixl | |
port: 55555 | |
protocol: TCP | |
- name: vllm | |
port: 8000 | |
protocol: TCP | |
prefillService: | | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
labels: | |
llmd.ai/gather-metrics: "true" | |
spec: | |
clusterIP: None | |
ports: | |
- name: nixl | |
port: 55555 | |
protocol: TCP | |
- name: vllm | |
port: 8000 | |
protocol: TCP | |
eppService: | | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
labels: | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
llmd.ai/gather-metrics: "true" | |
spec: | |
ports: | |
- port: 9002 # Needs to match the port of the eppDeployment | |
protocol: TCP | |
name: grpc | |
- port: 9003 | |
protocol: TCP | |
name: grpc-health | |
- port: 9090 | |
protocol: TCP | |
name: metrics | |
type: NodePort # accepts "LoadBalancer" or "NodePort" | |
selector: | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
eppDeployment: | | |
apiVersion: apps/v1 | |
kind: Deployment | |
metadata: | |
labels: | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
spec: | |
selector: | |
matchLabels: | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
template: | |
metadata: | |
labels: | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
spec: | |
serviceAccountName: endpoint-picker-sa # manually created in workaround w/ proper RBAC | |
containers: | |
- args: | |
- --poolName | |
- "{{ .InferencePoolName }}" | |
- --poolNamespace | |
- "{{ .ModelServiceNamespace }}" | |
- -v | |
- "4" | |
- --zap-encoder | |
- json | |
- --grpcPort | |
- "9002" | |
- --grpcHealthPort | |
- "9003" | |
env: | |
- name: KVCACHE_INDEXER_REDIS_ADDR | |
value: llm-d-redis-master.default.svc.cluster.local:6379:8100 | |
- name: HF_TOKEN | |
valueFrom: | |
secretKeyRef: | |
name: llm-d-hf-token | |
key: HF_TOKEN | |
image: quay.io/llm-d/llm-d-gateway-api-inference-extension-dev:0.0.5-amd64 | |
imagePullPolicy: IfNotPresent | |
# livenessProbe: | |
# failureThreshold: 3 | |
# grpc: | |
# port: 9003 | |
# service: "{{ .EPPServiceName }}" | |
# initialDelaySeconds: 5 | |
# periodSeconds: 10 | |
# successThreshold: 1 | |
# timeoutSeconds: 1 | |
# readinessProbe: | |
# failureThreshold: 3 | |
# grpc: | |
# port: 9003 | |
# service: "{{ .EPPServiceName }}" | |
# initialDelaySeconds: 5 | |
# periodSeconds: 10 | |
# successThreshold: 1 | |
# timeoutSeconds: 1 | |
name: epp | |
ports: | |
- name: grpc | |
containerPort: 9002 | |
protocol: TCP | |
- name: grpc-health | |
containerPort: 9003 | |
protocol: TCP | |
- name: metrics | |
containerPort: 9090 | |
protocol: TCP | |
inferencePool: | | |
apiVersion: inference.networking.x-k8s.io/v1alpha2 | |
kind: InferencePool | |
spec: | |
targetPortNumber: 8000 | |
inferenceModel: | | |
apiVersion: inference.networking.x-k8s.io/v1alpha2 | |
kind: InferenceModel | |
--- | |
# Source: llm-d/charts/redis/templates/master/pvc.yaml | |
kind: PersistentVolumeClaim | |
apiVersion: v1 | |
metadata: | |
name: redis-data-llm-d-redis-master | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
app.kubernetes.io/component: master | |
spec: | |
accessModes: | |
- "ReadWriteOnce" | |
resources: | |
requests: | |
storage: "5Gi" | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleAdmin.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRole | |
metadata: | |
name: llm-d-modelservice-admin-role | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices | |
verbs: | |
- '*' | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices/status | |
verbs: | |
- get | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleEditorRole.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRole | |
metadata: | |
name: llm-d-modelservice-editor-role | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices | |
verbs: | |
- create | |
- delete | |
- get | |
- list | |
- patch | |
- update | |
- watch | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices/status | |
verbs: | |
- get | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleManager.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRole | |
metadata: | |
name: llm-d-modelservice-manager-role | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- apiGroups: | |
- "" | |
resources: | |
- configmaps | |
verbs: | |
- get | |
- list | |
- watch | |
- create | |
- apiGroups: | |
- "" | |
resources: | |
- services | |
- serviceaccounts | |
verbs: | |
- get | |
- list | |
- watch | |
- create | |
- update | |
- apiGroups: | |
- rbac.authorization.k8s.io | |
resources: | |
- roles | |
- rolebindings | |
verbs: | |
- get | |
- list | |
- watch | |
- create | |
- update | |
# MSV2 HACK BEGIN --------------- | |
- apiGroups: | |
- "v1" | |
resources: | |
- "secrets" | |
verbs: | |
- "get" | |
- "list" | |
- "watch" | |
- apiGroups: | |
- "inference.networking.x-k8s.io" | |
resources: | |
- "inferencepools" | |
- "inferencemodels" | |
verbs: | |
- "get" | |
- "watch" | |
- "list" | |
- apiGroups: | |
- "" | |
resources: | |
- "pods" | |
verbs: | |
- "get" | |
- "watch" | |
- "list" | |
- apiGroups: | |
- "discovery.k8s.io" | |
resources: | |
- "endpointslices" | |
verbs: | |
- "get" | |
- "watch" | |
- "list" | |
- apiGroups: | |
- "authentication.k8s.io" | |
resources: | |
- "tokenreviews" | |
verbs: | |
- "create" | |
- apiGroups: | |
- "authorization.k8s.io" | |
resources: | |
- "subjectaccessreviews" | |
verbs: | |
- "create" | |
# MSV2 HACK END --------------- | |
- apiGroups: | |
- apps | |
resources: | |
- deployments | |
verbs: | |
- create | |
- delete | |
- get | |
- list | |
- patch | |
- update | |
- watch | |
- apiGroups: | |
- apps | |
resources: | |
- deployments/scale | |
verbs: | |
- patch | |
- update | |
- apiGroups: | |
- inference.networking.x-k8s.io | |
resources: | |
- inferencemodel | |
- inferencepool | |
verbs: | |
- create | |
- delete | |
- get | |
- list | |
- patch | |
- update | |
- watch | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices | |
verbs: | |
- create | |
- delete | |
- get | |
- list | |
- patch | |
- update | |
- watch | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices/finalizers | |
verbs: | |
- update | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices/status | |
verbs: | |
- get | |
- patch | |
- update | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleMetricsAuth.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRole | |
metadata: | |
name: llm-d-modelservice-metrics-auth-role | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- apiGroups: | |
- authentication.k8s.io | |
resources: | |
- tokenreviews | |
verbs: | |
- create | |
- apiGroups: | |
- authorization.k8s.io | |
resources: | |
- subjectaccessreviews | |
verbs: | |
- create | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleMetricsReader.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRole | |
metadata: | |
name: llm-d-modelservice-metrics-reader | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- nonResourceURLs: | |
- /metrics | |
verbs: | |
- get | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleViewer.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRole | |
metadata: | |
name: llm-d-modelservice-viewer-role | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices | |
verbs: | |
- get | |
- list | |
- watch | |
- apiGroups: | |
- llm-d.ai | |
resources: | |
- modelservices/status | |
verbs: | |
- get | |
--- | |
# Source: llm-d/templates/modelservice/ms-v2-hack/clusterRole-epp.yaml | |
kind: ClusterRole | |
apiVersion: rbac.authorization.k8s.io/v1 | |
metadata: | |
name: endpoint-picker-clusterrole | |
rules: | |
- apiGroups: | |
- "inference.networking.x-k8s.io" | |
resources: | |
- "inferencepools" | |
- "inferencemodels" | |
verbs: | |
- "get" | |
- "watch" | |
- "list" | |
- apiGroups: | |
- "" | |
resources: | |
- "pods" | |
verbs: | |
- "get" | |
- "watch" | |
- "list" | |
- apiGroups: | |
- "discovery.k8s.io" | |
resources: | |
- "endpointslices" | |
verbs: | |
- "get" | |
- "watch" | |
- "list" | |
- apiGroups: | |
- "authentication.k8s.io" | |
resources: | |
- "tokenreviews" | |
verbs: | |
- "create" | |
- apiGroups: | |
- "authorization.k8s.io" | |
resources: | |
- "subjectaccessreviews" | |
verbs: | |
- "create" | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleBindingManager.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRoleBinding | |
metadata: | |
name: llm-d-modelservice-manager-rolebinding | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
roleRef: | |
apiGroup: rbac.authorization.k8s.io | |
kind: ClusterRole | |
name: llm-d-modelservice-manager-role | |
subjects: | |
- kind: ServiceAccount | |
name: llm-d-llm-d-modelservice | |
namespace: default | |
--- | |
# Source: llm-d/templates/modelservice/clusterRoleBindingMetricsAuth.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRoleBinding | |
metadata: | |
name: llm-d-modelservice-metrics-auth-rolebinding | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
roleRef: | |
apiGroup: rbac.authorization.k8s.io | |
kind: ClusterRole | |
name: llm-d-modelservice-metrics-auth-role | |
subjects: | |
- kind: ServiceAccount | |
apiGroup: "" | |
name: llm-d-llm-d-modelservice | |
namespace: default | |
--- | |
# Source: llm-d/templates/modelservice/role.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: Role | |
metadata: | |
name: llm-d-modelservice | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
rules: | |
- apiGroups: | |
- "" | |
resources: | |
- services | |
verbs: | |
- get | |
- list | |
- watch | |
- create | |
- update | |
- patch | |
- delete | |
- apiGroups: | |
- "" | |
resources: | |
- events | |
verbs: | |
- create | |
- patch | |
- apiGroups: | |
- coordination.k8s.io | |
resources: | |
- leases | |
verbs: | |
- get | |
- list | |
- watch | |
- create | |
- update | |
- patch | |
- delete | |
- apiGroups: | |
- "" | |
resources: | |
- events | |
verbs: | |
- create | |
- patch | |
--- | |
# Source: llm-d/templates/modelservice/rolebinding.yaml | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: RoleBinding | |
metadata: | |
name: llm-d-modelservice | |
namespace: default | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: modelservice | |
annotations: | |
roleRef: | |
apiGroup: rbac.authorization.k8s.io | |
kind: Role | |
name: llm-d-modelservice | |
subjects: | |
- kind: ServiceAccount | |
apiGroup: "" | |
name: llm-d-llm-d-modelservice | |
namespace: default | |
--- | |
# Source: llm-d/charts/redis/templates/headless-svc.yaml | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
name: llm-d-redis-headless | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
spec: | |
type: ClusterIP | |
clusterIP: None | |
ports: | |
- name: tcp-redis | |
port: 6379 | |
targetPort: redis | |
selector: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/name: redis | |
--- | |
# Source: llm-d/charts/redis/templates/master/service.yaml | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
name: llm-d-redis-master | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
app.kubernetes.io/component: master | |
spec: | |
type: ClusterIP | |
internalTrafficPolicy: Cluster | |
sessionAffinity: None | |
ports: | |
- name: tcp-redis | |
port: 6379 | |
targetPort: redis | |
nodePort: null | |
selector: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/component: master | |
--- | |
# Source: llm-d/templates/modelservice/service.yaml | |
apiVersion: v1 | |
kind: Service | |
metadata: | |
name: llm-d-modelservice | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
control-plane: controller-manager | |
app.kubernetes.io/component: modelservice | |
llmd.ai/gather-metrics: "true" | |
annotations: | |
spec: | |
ports: | |
- port: 8443 | |
protocol: TCP | |
targetPort: 8443 | |
selector: | |
app.kubernetes.io/name: llm-d | |
app.kubernetes.io/instance: llm-d | |
control-plane: controller-manager | |
app.kubernetes.io/component: modelservice | |
type: ClusterIP | |
--- | |
# Source: llm-d/charts/redis/templates/master/application.yaml | |
apiVersion: apps/v1 | |
kind: Deployment | |
metadata: | |
name: llm-d-redis-master | |
namespace: "default" | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
app.kubernetes.io/component: master | |
spec: | |
replicas: 1 | |
revisionHistoryLimit: 10 | |
selector: | |
matchLabels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/component: master | |
strategy: | |
type: RollingUpdate | |
template: | |
metadata: | |
labels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/version: 7.4.3 | |
helm.sh/chart: redis-20.13.4 | |
app.kubernetes.io/component: master | |
annotations: | |
checksum/configmap: 2a9ab4a5432825504d910f022638674ce88eaefe9f9f595ad8bc107377d104fb | |
checksum/health: aff24913d801436ea469d8d374b2ddb3ec4c43ee7ab24663d5f8ff1a1b6991a9 | |
checksum/scripts: 0717e77fd3bb941f602860e9be4f2ed87b481cddeadf37be463f8512ecde0c3e | |
checksum/secret: 5448db886ef764f92d30ce06839b56a06ebc552f1facb07eaa4de676f3ec9097 | |
spec: | |
imagePullSecrets: | |
- name: llm-d-pull-secret | |
securityContext: | |
fsGroup: 1001 | |
fsGroupChangePolicy: Always | |
supplementalGroups: [] | |
sysctls: [] | |
serviceAccountName: llm-d-redis-master | |
automountServiceAccountToken: false | |
affinity: | |
podAffinity: | |
podAntiAffinity: | |
preferredDuringSchedulingIgnoredDuringExecution: | |
- podAffinityTerm: | |
labelSelector: | |
matchLabels: | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/name: redis | |
app.kubernetes.io/component: master | |
topologyKey: kubernetes.io/hostname | |
weight: 1 | |
nodeAffinity: | |
enableServiceLinks: true | |
terminationGracePeriodSeconds: 30 | |
containers: | |
- name: redis | |
image: registry.redhat.io/rhel9/redis-7:9.5-1744185101 | |
imagePullPolicy: "IfNotPresent" | |
securityContext: | |
allowPrivilegeEscalation: false | |
capabilities: | |
drop: | |
- ALL | |
readOnlyRootFilesystem: true | |
runAsGroup: 1001 | |
runAsNonRoot: true | |
runAsUser: 1001 | |
seLinuxOptions: {} | |
seccompProfile: | |
type: RuntimeDefault | |
command: | |
- /bin/bash | |
args: | |
- -ec | |
- /opt/bitnami/scripts/start-scripts/start-master.sh | |
env: | |
- name: BITNAMI_DEBUG | |
value: "false" | |
- name: REDIS_REPLICATION_MODE | |
value: master | |
- name: ALLOW_EMPTY_PASSWORD | |
value: "no" | |
- name: REDIS_PASSWORD_FILE | |
value: "/opt/bitnami/redis/secrets/redis-password" | |
- name: REDIS_TLS_ENABLED | |
value: "no" | |
- name: REDIS_PORT | |
value: "6379" | |
ports: | |
- name: redis | |
containerPort: 6379 | |
livenessProbe: | |
initialDelaySeconds: 20 | |
periodSeconds: 5 | |
# One second longer than command timeout should prevent generation of zombie processes. | |
timeoutSeconds: 6 | |
successThreshold: 1 | |
failureThreshold: 5 | |
exec: | |
command: | |
- /bin/bash | |
- -ec | |
- /health/ping_liveness_local.sh 5 | |
readinessProbe: | |
initialDelaySeconds: 20 | |
periodSeconds: 5 | |
timeoutSeconds: 2 | |
successThreshold: 1 | |
failureThreshold: 5 | |
exec: | |
command: | |
- /bin/bash | |
- -ec | |
- /health/ping_readiness_local.sh 1 | |
resources: | |
limits: | |
cpu: 250m | |
memory: 256Mi | |
requests: | |
cpu: 100m | |
memory: 128Mi | |
volumeMounts: | |
- name: start-scripts | |
mountPath: /opt/bitnami/scripts/start-scripts | |
- name: health | |
mountPath: /health | |
- name: redis-password | |
mountPath: /opt/bitnami/redis/secrets/ | |
- name: redis-data | |
mountPath: /data | |
- name: config | |
mountPath: /opt/bitnami/redis/mounted-etc | |
- name: empty-dir | |
mountPath: /opt/bitnami/redis/etc/ | |
subPath: app-conf-dir | |
- name: empty-dir | |
mountPath: /tmp | |
subPath: tmp-dir | |
volumes: | |
- name: start-scripts | |
configMap: | |
name: llm-d-redis-scripts | |
defaultMode: 0755 | |
- name: health | |
configMap: | |
name: llm-d-redis-health | |
defaultMode: 0755 | |
- name: redis-password | |
secret: | |
secretName: llm-d-redis | |
items: | |
- key: redis-password | |
path: redis-password | |
- name: config | |
configMap: | |
name: llm-d-redis-configuration | |
- name: empty-dir | |
emptyDir: {} | |
- name: redis-data | |
persistentVolumeClaim: | |
claimName: redis-data-llm-d-redis-master | |
--- | |
# Source: llm-d/templates/modelservice/deployment.yaml | |
apiVersion: apps/v1 | |
kind: Deployment | |
metadata: | |
name: llm-d-modelservice | |
namespace: default | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
control-plane: controller-manager | |
app.kubernetes.io/component: modelservice | |
annotations: | |
spec: | |
replicas: 1 | |
selector: | |
matchLabels: | |
app.kubernetes.io/name: llm-d | |
app.kubernetes.io/instance: llm-d | |
control-plane: controller-manager | |
app.kubernetes.io/component: modelservice | |
template: | |
metadata: | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
control-plane: controller-manager | |
app.kubernetes.io/component: modelservice | |
annotations: | |
spec: | |
containers: | |
- args: | |
- --metrics-bind-address=:8443 | |
- --leader-elect=false | |
- --health-probe-bind-address=:8081 | |
# MSV2 HACK BEGIN | |
- --epp-cluster-role | |
- endpoint-picker-clusterrole | |
- --epp-pull-secrets | |
- "llm-d-pull-secret" | |
- --pd-pull-secrets | |
- "llm-d-pull-secret" | |
# MSV2 HACK END | |
command: | |
- /manager | |
image: quay.io/llm-d/llm-d-model-service:0.0.6 | |
imagePullPolicy: Always | |
livenessProbe: | |
failureThreshold: 3 | |
httpGet: | |
path: /healthz | |
port: 8081 | |
scheme: HTTP | |
initialDelaySeconds: 15 | |
periodSeconds: 20 | |
successThreshold: 1 | |
timeoutSeconds: 1 | |
name: manager | |
readinessProbe: | |
failureThreshold: 3 | |
httpGet: | |
path: /readyz | |
port: 8081 | |
scheme: HTTP | |
initialDelaySeconds: 5 | |
periodSeconds: 10 | |
successThreshold: 1 | |
timeoutSeconds: 1 | |
resources: | |
limits: | |
cpu: 500m | |
memory: 128Mi | |
requests: | |
cpu: 10m | |
memory: 64Mi | |
securityContext: | |
allowPrivilegeEscalation: false | |
capabilities: | |
drop: | |
- ALL | |
securityContext: | |
runAsNonRoot: true | |
seccompProfile: | |
type: RuntimeDefault | |
serviceAccountName: llm-d-llm-d-modelservice | |
--- | |
# Source: llm-d/templates/sample-application/ingress.yaml | |
apiVersion: networking.k8s.io/v1 | |
kind: Ingress | |
metadata: | |
name: llm-d-inference-gateway | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
app.kubernetes.io/component: sample-application | |
annotations: | |
spec: | |
rules: | |
- http: | |
paths: | |
- path: / | |
pathType: Prefix | |
backend: | |
service: | |
name: llm-d-inference-gateway | |
port: | |
number: 80 | |
--- | |
# Source: llm-d/templates/inference-gateway/gateway.yaml | |
apiVersion: gateway.networking.k8s.io/v1 | |
kind: Gateway | |
metadata: | |
name: llm-d-inference-gateway | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
app.kubernetes.io/component: inference-gateway | |
annotations: | |
spec: | |
gatewayClassName: "kgateway" | |
listeners: | |
- name: default | |
port: 80 | |
protocol: HTTP | |
--- | |
# Source: llm-d/templates/inference-gateway/gatewayparameters.yaml | |
apiVersion: gateway.kgateway.dev/v1alpha1 | |
kind: GatewayParameters | |
metadata: | |
name: llm-d-inference-gateway | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/gateway: llm-d-inference-gateway | |
app.kubernetes.io/component: inference-gateway | |
annotations: | |
spec: | |
kube: | |
envoyContainer: | |
securityContext: | |
allowPrivilegeEscalation: false | |
readOnlyRootFilesystem: true | |
runAsNonRoot: true | |
seccompProfile: | |
type: RuntimeDefault | |
service: | |
type: "NodePort" | |
extraLabels: | |
gateway: custom | |
podTemplate: | |
extraLabels: | |
gateway: custom | |
sdsContainer: | |
securityContext: | |
allowPrivilegeEscalation: false | |
readOnlyRootFilesystem: true | |
seccompProfile: | |
type: RuntimeDefault | |
--- | |
# Source: llm-d/templates/sample-application/httproutes.yaml | |
apiVersion: gateway.networking.k8s.io/v1 | |
kind: HTTPRoute | |
metadata: | |
name: llama-3-2-3b-instruct | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: sample-application | |
annotations: | |
spec: | |
parentRefs: | |
- name: llm-d-inference-gateway | |
rules: | |
- matches: | |
- path: | |
type: PathPrefix | |
value: / | |
backendRefs: | |
- group: inference.networking.x-k8s.io | |
kind: InferencePool | |
name: "llama-3-2-3b-instruct-inference-pool" | |
port: 8000 | |
--- | |
# Source: llm-d/templates/sample-application/modelservice.yaml | |
apiVersion: llm-d.ai/v1alpha1 | |
kind: ModelService | |
metadata: | |
name: llama-3-2-3b-instruct | |
labels: | |
app.kubernetes.io/name: llm-d | |
helm.sh/chart: llm-d-0.1.1 | |
app.kubernetes.io/instance: llm-d | |
app.kubernetes.io/managed-by: Helm | |
app.kubernetes.io/version: "0.0.1" | |
app.kubernetes.io/component: sample-application | |
annotations: | |
spec: | |
decoupleScaling: false | |
baseConfigMapRef: | |
name: llm-d-modelservice-basic-gpu-with-nixl-preset | |
routing: | |
modelName: Llama-3.2-3B-Instruct | |
modelArtifacts: | |
uri: pvc://llama-3.2-3b-instruct-pvc/models/meta-llama/Llama-32-3B-Instruct | |
decode: | |
replicas: 1 | |
containers: | |
- name: "vllm" | |
command: | |
- vllm | |
- serve | |
args: | |
- "/cache/models/meta-llama/Llama-3.2-3B-Instruct" | |
- "--served-model-name" | |
- Llama-3.2-3B-Instruct | |
resources: | |
limits: | |
nvidia.com/gpu: 1 | |
requests: | |
nvidia.com/gpu: 1 | |
prefill: | |
replicas: 1 | |
containers: | |
- name: "vllm" | |
command: | |
- vllm | |
- serve | |
args: | |
- "/cache/models/meta-llama/Llama-3.2-3B-Instruct" | |
- "--served-model-name" | |
- Llama-3.2-3B-Instruct | |
resources: | |
limits: | |
nvidia.com/gpu: 1 | |
requests: | |
nvidia.com/gpu: 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment