DCGM + OTel scaler setup
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: model-dcgm
  annotations:
    autoscaling.keda.sh/paused: "false"
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llama
  triggers:
    - type: external
      metadata:
        scalerAddress: "keda-otel-scaler.keda.svc:4318"
        # 'svc' dimension is added by OTel processor (transform)
        metricQuery: "sum(DCGM_FI_DEV_MEM_COPY_UTIL{svc=llama})"
        operationOverTime: "avg"
        targetValue: "40"
  minReplicaCount: 1
  maxReplicaCount: 4
  fallback:
    failureThreshold: 10
    replicas: 1
  advanced:
    horizontalPodAutoscalerConfig:
      behavior:
        scaleDown:
          # this should be higher in prod
          stabilizationWindowSeconds: 1200
        scaleUp:
          # this should be much higher in prod
          stabilizationWindowSeconds: 1
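# Optional sanity check once this ScaledObject is applied (assuming the default namespace
# and KEDA's default "keda-hpa-<scaledobject-name>" naming for the backing HPA):
# k get scaledobject model-dcgm
# k get hpa keda-hpa-model-dcgm
# k describe scaledobject model-dcgm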
---
# Uses the target allocator feature of the OTel operator, which can dynamically add scrape
# targets for the prometheus receiver, similarly to how the Prometheus operator does it
# (via ServiceMonitors and PodMonitors).
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: collector-with-ta-prometheus-cr
  namespace: gpu-operator
spec:
  resources:
    limits:
      cpu: "500m"
      memory: 300Mi
    requests:
      cpu: 100m
      memory: 200Mi
  mode: statefulset
  # mode: sidecar
  targetAllocator:
    enabled: true
    # k create sa otel-prom-reader -ngpu-operator
    # k create clusterrolebinding otel-prom-reader-1 --clusterrole prom-kube-prometheus-stack-operator --serviceaccount gpu-operator:otel-prom-reader
    # k create clusterrolebinding otel-prom-reader-2 --clusterrole prom-kube-prometheus-stack-prometheus --serviceaccount gpu-operator:otel-prom-reader
    serviceAccount: otel-prom-reader
    prometheusCR:
      enabled: true
      serviceMonitorSelector:
        matchLabels:
          smName: dcgm
      podMonitorSelector: {}
  config:
    receivers:
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 10s
              static_configs:
                - targets: [ '0.0.0.0:8888' ]
              metric_relabel_configs:
                - action: labeldrop
                  regex: (id|name)
                - action: labelmap
                  regex: label_(.+)
                  replacement: $$1
    processors:
      # we are scraping the dcgm pod/svc directly, so this is not needed
      # resourcedetection/env:
      #   detectors: [env]
      #   timeout: 2s
      #   override: false
      transform:
        metric_statements:
          - context: datapoint
            statements:
              - set(datapoint.attributes["svc"], datapoint.attributes["exported_pod"])
              # - replace_pattern(datapoint.attributes["svc"], "(.*)-[^-]+", "$1")
              - replace_pattern(datapoint.attributes["svc"], "(.*)-[^-]+-[^-]+", "$1")
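      # Illustrative example of the two statements above (the pod name is made up):
      # a datapoint arrives with exported_pod="llama-7f9d8c6b54-x2kqj"; set() copies it into
      # "svc" and replace_pattern() strips the trailing "-<replicaset hash>-<pod hash>",
      # leaving svc="llama", which is the label the ScaledObject's metricQuery filters on.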
      filter/metrics:
        metrics:
          include:
            match_type: strict
            metric_names:
              - DCGM_FI_DEV_MEM_COPY_UTIL
              - DCGM_FI_DEV_GPU_UTIL
    exporters:
      otlp/keda:
        endpoint: keda-otel-scaler.keda.svc:4317
        compression: "none"
        tls:
          insecure: true
      debug:
        verbosity: detailed
    service:
      pipelines:
        metrics:
          receivers: [prometheus]
          processors: [filter/metrics, transform]
          exporters: [debug, otlp/keda]
# the dcgm exporter was deployed together with the gpu-operator using the following Helm chart values:
# dcgmExporter:
#   enabled: true
#   serviceMonitor:
#     enabled: true
#     interval: 5s
#     additionalLabels:
#       smName: dcgm
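#
# For reference, roughly the ServiceMonitor that those values generate and that the
# targetAllocator.prometheusCR.serviceMonitorSelector above would pick up. This is only an
# illustrative sketch (kept commented out because the chart already creates the real one);
# the Service selector labels and port name are assumptions and must match your
# dcgm-exporter Service:
# apiVersion: monitoring.coreos.com/v1
# kind: ServiceMonitor
# metadata:
#   name: dcgm-exporter
#   namespace: gpu-operator
#   labels:
#     smName: dcgm                      # matched by serviceMonitorSelector above
# spec:
#   selector:
#     matchLabels:
#       app: nvidia-dcgm-exporter       # assumption: label on the dcgm-exporter Service
#   endpoints:
#     - port: metrics                   # assumption: name of the metrics port on that Service
#       interval: 5s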