DCGM + OTel scaler setup
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: model-dcgm
  annotations:
    autoscaling.keda.sh/paused: "false"
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llama
  triggers:
    - type: external
      metadata:
        scalerAddress: "keda-otel-scaler.keda.svc:4318"
        # the 'svc' dimension is added by the OTel transform processor (see the collector config below)
        metricQuery: "sum(DCGM_FI_DEV_MEM_COPY_UTIL{svc=llama})"
        operationOverTime: "avg"
        targetValue: "40"
  minReplicaCount: 1
  maxReplicaCount: 4
  fallback:
    failureThreshold: 10
    replicas: 1
  advanced:
    horizontalPodAutoscalerConfig:
      behavior:
        scaleDown:
          # this should be higher in prod
          stabilizationWindowSeconds: 1200
        scaleUp:
          # this should be much higher in prod
          stabilizationWindowSeconds: 1
---
# Uses the OTel operator's target allocator, which dynamically adds scrape targets to the prometheus receiver
# in much the same way the Prometheus operator does (via ServiceMonitors and PodMonitors)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: collector-with-ta-prometheus-cr
  namespace: gpu-operator
spec:
  resources:
    limits:
      cpu: "500m"
      memory: 300Mi
    requests:
      cpu: 100m
      memory: 200Mi
  mode: statefulset
  # mode: sidecar
  targetAllocator:
    enabled: true
    # k create sa otel-prom-reader -n gpu-operator
    # k create clusterrolebinding otel-prom-reader-1 --clusterrole prom-kube-prometheus-stack-operator --serviceaccount gpu-operator:otel-prom-reader
    # k create clusterrolebinding otel-prom-reader-2 --clusterrole prom-kube-prometheus-stack-prometheus --serviceaccount gpu-operator:otel-prom-reader
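    # The commands above could also be expressed declaratively; a rough sketch only — the
    # 'prom-kube-prometheus-stack-*' ClusterRole names are taken from the commands above and
    # depend on the kube-prometheus-stack release name, so treat them as assumptions:
    #
    # apiVersion: v1
    # kind: ServiceAccount
    # metadata:
    #   name: otel-prom-reader
    #   namespace: gpu-operator
    # ---
    # apiVersion: rbac.authorization.k8s.io/v1
    # kind: ClusterRoleBinding
    # metadata:
    #   name: otel-prom-reader-1
    # roleRef:
    #   apiGroup: rbac.authorization.k8s.io
    #   kind: ClusterRole
    #   name: prom-kube-prometheus-stack-operator
    # subjects:
    #   - kind: ServiceAccount
    #     name: otel-prom-reader
    #     namespace: gpu-operator
    # ---
    # apiVersion: rbac.authorization.k8s.io/v1
    # kind: ClusterRoleBinding
    # metadata:
    #   name: otel-prom-reader-2
    # roleRef:
    #   apiGroup: rbac.authorization.k8s.io
    #   kind: ClusterRole
    #   name: prom-kube-prometheus-stack-prometheus
    # subjects:
    #   - kind: ServiceAccount
    #     name: otel-prom-reader
    #     namespace: gpu-operator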
    serviceAccount: otel-prom-reader
    prometheusCR:
      enabled: true
      serviceMonitorSelector:
        matchLabels:
          smName: dcgm
      podMonitorSelector: {}
  config:
    receivers:
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 10s
              static_configs:
                - targets: [ '0.0.0.0:8888' ]
              metric_relabel_configs:
                - action: labeldrop
                  regex: (id|name)
                - action: labelmap
                  regex: label_(.+)
                  replacement: $$1
    processors:
      # we are scraping the dcgm pod/svc directly, so this is not needed
      # resourcedetection/env:
      #   detectors: [env]
      #   timeout: 2s
      #   override: false
      transform:
        metric_statements:
          - context: datapoint
            statements:
              - set(datapoint.attributes["svc"], datapoint.attributes["exported_pod"])
              # - replace_pattern(datapoint.attributes["svc"], "(.*)-[^-]+", "$1")
              - replace_pattern(datapoint.attributes["svc"], "(.*)-[^-]+-[^-]+", "$1")
      filter/metrics:
        metrics:
          include:
            match_type: strict
            metric_names:
              - DCGM_FI_DEV_MEM_COPY_UTIL
              - DCGM_FI_DEV_GPU_UTIL
    exporters:
      otlp/keda:
        endpoint: keda-otel-scaler.keda.svc:4317
        compression: "none"
        tls:
          insecure: true
      debug:
        verbosity: detailed
    service:
      pipelines:
        metrics:
          receivers: [prometheus]
          processors: [filter/metrics, transform]
          exporters: [debug, otlp/keda]
# The dcgm exporter was deployed together with the gpu-operator using the following Helm chart values
# (the resulting ServiceMonitor is sketched below):
# dcgmExporter:
#   enabled: true
#   serviceMonitor:
#     enabled: true
#     interval: 5s
#     additionalLabels:
#       smName: dcgm
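#
# For reference, roughly the ServiceMonitor those values make the chart generate. This is a sketch
# only: the object name, the service selector labels, and the endpoint port name are assumptions,
# the real object is created by the gpu-operator chart.
#
# apiVersion: monitoring.coreos.com/v1
# kind: ServiceMonitor
# metadata:
#   name: nvidia-dcgm-exporter
#   namespace: gpu-operator
#   labels:
#     smName: dcgm              # matched by the target allocator's serviceMonitorSelector above
# spec:
#   selector:
#     matchLabels:
#       app: nvidia-dcgm-exporter
#   endpoints:
#     - port: gpu-metrics       # assumed port name of the dcgm-exporter service
#       interval: 5s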