GKE 1.33 managed DCGM exporter for NVIDIA A100 (a2-highgpu-1g)
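This is the DaemonSet that GKE deploys into the gke-managed-system namespace when managed DCGM metrics are enabled; it was dumped from a running cluster (note the status, resourceVersion, and uid fields) rather than applied by hand. As a rough sketch, assuming the gcloud flags described in the linked doc (CLUSTER_NAME and LOCATION are placeholders), enabling the component and dumping the DaemonSet looks like:

gcloud container clusters update CLUSTER_NAME --location=LOCATION --monitoring=SYSTEM,DCGM
kubectl -n gke-managed-system get daemonset dcgm-exporter -o yaml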
# From https://cloud.google.com/kubernetes-engine/docs/how-to/dcgm-metrics#configure-dcgm-collection
# Managed DCGM metrics, including its underlying manifests and container images, is intended to work only on GKE clusters.
# Don't use managed DCGM metrics standalone or run it outside of GKE.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  annotations:
    components.gke.io/layer: addon
    deprecated.daemonset.template.generation: "1"
  creationTimestamp: "2025-08-12T22:55:26Z"
  generation: 1
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    app.kubernetes.io/name: gke-managed-dcgm-exporter
  name: dcgm-exporter
  namespace: gke-managed-system
  resourceVersion: "1755039331114351000"
  uid: b4ea4936-3162-4798-82ee-7dff58af4283
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/name: gke-managed-dcgm-exporter
  template:
    metadata:
      annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: "true"
        components.gke.io/component-name: gpu-dcgm
        components.gke.io/component-version: 1.33.0-gke.1
      creationTimestamp: null
      labels:
        app.kubernetes.io/name: gke-managed-dcgm-exporter
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              - key: cloud.google.com/gke-gpu-driver-version
                operator: In
                values:
                - default
                - latest
      automountServiceAccountToken: false
      containers:
      - args:
        - --enable-dcgm-log
        - --dcgm-log-level
        - ERROR
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        - name: DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE
          value: device-name
        - name: DCGM_EXPORTER_KUBERNETES
          value: "true"
        - name: DCGM_EXPORTER_LISTEN
          value: :9400
        image: us-central1-artifactregistry.gcr.io/gke-release/gke-release/nvidia/gke-dcgm-exporter:3.3.9-3.6.1-gke.11@sha256:80dae82c957920007dc29b8478355217b3c8c22d54fb8297e8000ed64af74c3b
        imagePullPolicy: IfNotPresent
        name: dcgm-exporter
        ports:
        - containerPort: 9400
          name: metrics
          protocol: TCP
        resources:
          limits:
            memory: 350Mi
          requests:
            cpu: 100m
            memory: 350Mi
        securityContext:
          privileged: true
          readOnlyRootFilesystem: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/dcgm-exporter
          name: dcgm-exporter-metrics
          readOnly: true
        - mountPath: /usr/local/nvidia
          name: nvidia-install-dir-host
          readOnly: true
        - mountPath: /var/lib/kubelet/pod-resources
          name: pod-resources
          readOnly: true
      dnsPolicy: ClusterFirst
      initContainers:
      - command:
        - /bin/bash
        - -c
        - |
          cleanup() {
            echo "caught stop signal, exiting..."
            exit 0
          }
          trap cleanup SIGINT SIGTERM
          echo "checking for drivers"
          while true; do
            if ls "/usr/local/nvidia/lib64/libnvidia-ml.so.1" &>/dev/null; then
              echo "nvml drivers found!"
              break
            fi
            echo "waiting for nvml drivers"
            sleep 1
          done
          echo "checking for MIG partitions"
          if [[ -f "/etc/nvidia/gpu_config.json" ]]; then
            content=$(</etc/nvidia/gpu_config.json)
            if [[ -z "${content##*GPUPartitionSize*}" ]]; then
              while true; do
                output=$(LD_LIBRARY_PATH=/usr/local/nvidia/lib64 /usr/local/nvidia/bin/nvidia-smi -L)
                echo $output
                if [[ -z "${output##*MIG*MIG-*}" ]]; then
                  echo "MIG partitions found, exiting"
                  break
                fi
                echo "waiting for MIG partitions"
                sleep 2
              done
            else
              echo "GPUPartitionSize is empty"
            fi
          else
            echo "no gpu config"
          fi
          echo "checking for gpu-device-plugin socket"
          while true; do
            if [[ $(ls /var/lib/kubelet/device-plugins/nvidiaGPU*.sock) ]]; then
              ls -la /var/lib/kubelet/device-plugins/nvidiaGPU*.sock
              echo "gpu-device-plugin socket found, exiting"
              exit 0
            fi
            echo "waiting for gpu-device-plugin socket"
            sleep 1
          done
        image: us-central1-artifactregistry.gcr.io/gke-release/gke-release/gke-distroless/bash:gke_distroless_20250321.00_p0@sha256:7f776c36ecf7426b9d461b2b9690ff6b6c7fc1d00c78eb050da3e039d431b760
        imagePullPolicy: IfNotPresent
        name: nvml-wait
        resources: {}
        securityContext:
          privileged: true
          readOnlyRootFilesystem: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /usr/local/nvidia
          name: nvidia-install-dir-host
          readOnly: true
        - mountPath: /etc/nvidia
          name: nvidia-config
          readOnly: true
        - mountPath: /var/lib/kubelet/device-plugins
          name: device-plugins
          readOnly: true
      priorityClassName: system-node-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      tolerations:
      - operator: Exists
      - key: components.gke.io/gke-managed-components
        operator: Exists
      volumes:
      - hostPath:
          path: /home/kubernetes/bin/nvidia
          type: Directory
        name: nvidia-install-dir-host
      - configMap:
          defaultMode: 420
          name: dcgm-exporter-metrics
        name: dcgm-exporter-metrics
      - hostPath:
          path: /var/lib/kubelet/pod-resources
          type: Directory
        name: pod-resources
      - hostPath:
          path: /var/lib/kubelet/device-plugins
          type: Directory
        name: device-plugins
      - hostPath:
          path: /etc/nvidia
          type: DirectoryOrCreate
        name: nvidia-config
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
status:
  currentNumberScheduled: 1
  desiredNumberScheduled: 1
  numberAvailable: 1
  numberMisscheduled: 0
  numberReady: 1
  observedGeneration: 1
  updatedNumberScheduled: 1
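To verify the exporter is up on the A100 node, a quick sketch (POD_NAME is a placeholder; which DCGM_FI_* series appear depends on the dcgm-exporter-metrics ConfigMap mounted at /etc/dcgm-exporter):

kubectl -n gke-managed-system rollout status daemonset/dcgm-exporter
kubectl -n gke-managed-system get pods -o wide -l app.kubernetes.io/name=gke-managed-dcgm-exporter
kubectl -n gke-managed-system port-forward pod/POD_NAME 9400
curl -s localhost:9400/metrics | grep ^DCGM_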