@gbrayut · Created August 12, 2025 23:08
GKE 1.33 managed DCGM exporter for NVIDIA A100 (a2-highgpu-1g)
# From https://cloud.google.com/kubernetes-engine/docs/how-to/dcgm-metrics#configure-dcgm-collection
# Managed DCGM metrics, including the underlying manifests and container images, are intended to work only on GKE clusters.
# Don't use managed DCGM metrics standalone or run them outside of GKE.
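# Managed DCGM collection is enabled per cluster through the monitoring config
# (for example `gcloud container clusters update CLUSTER_NAME --monitoring=SYSTEM,DCGM`,
# per the doc above); GKE then deploys the DaemonSet below. This copy was
# captured from a live cluster for reference.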
apiVersion: apps/v1
kind: DaemonSet
metadata:
  annotations:
    components.gke.io/layer: addon
    deprecated.daemonset.template.generation: "1"
  creationTimestamp: "2025-08-12T22:55:26Z"
  generation: 1
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    app.kubernetes.io/name: gke-managed-dcgm-exporter
  name: dcgm-exporter
  namespace: gke-managed-system
  resourceVersion: "1755039331114351000"
  uid: b4ea4936-3162-4798-82ee-7dff58af4283
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/name: gke-managed-dcgm-exporter
  template:
    metadata:
      annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: "true"
        components.gke.io/component-name: gpu-dcgm
        components.gke.io/component-version: 1.33.0-gke.1
      creationTimestamp: null
      labels:
        app.kubernetes.io/name: gke-managed-dcgm-exporter
    spec:
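      # Schedule only on GPU nodes whose driver is installed by GKE
      # (gke-gpu-driver-version node label set to "default" or "latest").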
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              - key: cloud.google.com/gke-gpu-driver-version
                operator: In
                values:
                - default
                - latest
      automountServiceAccountToken: false
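      # The exporter serves Prometheus metrics on :9400 and identifies GPUs by
      # device name, mapping them to pods via the kubelet pod-resources socket
      # mounted below.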
      containers:
      - args:
        - --enable-dcgm-log
        - --dcgm-log-level
        - ERROR
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        - name: DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE
          value: device-name
        - name: DCGM_EXPORTER_KUBERNETES
          value: "true"
        - name: DCGM_EXPORTER_LISTEN
          value: :9400
        image: us-central1-artifactregistry.gcr.io/gke-release/gke-release/nvidia/gke-dcgm-exporter:3.3.9-3.6.1-gke.11@sha256:80dae82c957920007dc29b8478355217b3c8c22d54fb8297e8000ed64af74c3b
        imagePullPolicy: IfNotPresent
        name: dcgm-exporter
        ports:
        - containerPort: 9400
          name: metrics
          protocol: TCP
        resources:
          limits:
            memory: 350Mi
          requests:
            cpu: 100m
            memory: 350Mi
        securityContext:
          privileged: true
          readOnlyRootFilesystem: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/dcgm-exporter
          name: dcgm-exporter-metrics
          readOnly: true
        - mountPath: /usr/local/nvidia
          name: nvidia-install-dir-host
          readOnly: true
        - mountPath: /var/lib/kubelet/pod-resources
          name: pod-resources
          readOnly: true
      dnsPolicy: ClusterFirst
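      # Init container gates exporter startup: it waits for the NVIDIA driver
      # libraries, then for MIG partitions (when the node's GPU config requests
      # them), and finally for the GPU device plugin socket.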
      initContainers:
      - command:
        - /bin/bash
        - -c
        - |
          cleanup() {
            echo "caught stop signal, exiting..."
            exit 0
          }
          trap cleanup SIGINT SIGTERM
          # Block until the driver installer has published libnvidia-ml.so.1.
          echo "checking for drivers"
          while true; do
            if ls "/usr/local/nvidia/lib64/libnvidia-ml.so.1" &>/dev/null; then
              echo "nvml drivers found!"
              break
            fi
            echo "waiting for nvml drivers"
            sleep 1
          done
          # If the GPU config requests MIG, wait until nvidia-smi lists MIG devices.
          echo "checking for MIG partitions"
          if [[ -f "/etc/nvidia/gpu_config.json" ]]; then
            content=$(</etc/nvidia/gpu_config.json)
            if [[ -z "${content##*GPUPartitionSize*}" ]]; then
              while true; do
                output=$(LD_LIBRARY_PATH=/usr/local/nvidia/lib64 /usr/local/nvidia/bin/nvidia-smi -L)
                echo "$output"
                if [[ -z "${output##*MIG*MIG-*}" ]]; then
                  echo "MIG partitions found, exiting"
                  break
                fi
                echo "waiting for MIG partitions"
                sleep 2
              done
            else
              echo "GPUPartitionSize is empty"
            fi
          else
            echo "no gpu config"
          fi
          # Wait for the GPU device plugin to register its socket with the kubelet.
          echo "checking for gpu-device-plugin socket"
          while true; do
            if [[ $(ls /var/lib/kubelet/device-plugins/nvidiaGPU*.sock 2>/dev/null) ]]; then
              ls -la /var/lib/kubelet/device-plugins/nvidiaGPU*.sock
              echo "gpu-device-plugin socket found, exiting"
              exit 0
            fi
            echo "waiting for gpu-device-plugin socket"
            sleep 1
          done
        image: us-central1-artifactregistry.gcr.io/gke-release/gke-release/gke-distroless/bash:gke_distroless_20250321.00_p0@sha256:7f776c36ecf7426b9d461b2b9690ff6b6c7fc1d00c78eb050da3e039d431b760
        imagePullPolicy: IfNotPresent
        name: nvml-wait
        resources: {}
        securityContext:
          privileged: true
          readOnlyRootFilesystem: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /usr/local/nvidia
          name: nvidia-install-dir-host
          readOnly: true
        - mountPath: /etc/nvidia
          name: nvidia-config
          readOnly: true
        - mountPath: /var/lib/kubelet/device-plugins
          name: device-plugins
          readOnly: true
      priorityClassName: system-node-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      tolerations:
      - operator: Exists
      - key: components.gke.io/gke-managed-components
        operator: Exists
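      # All volumes except the metrics ConfigMap are hostPath mounts into the
      # node's driver install, GPU config, and kubelet directories.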
      volumes:
      - hostPath:
          path: /home/kubernetes/bin/nvidia
          type: Directory
        name: nvidia-install-dir-host
      - configMap:
          defaultMode: 420
          name: dcgm-exporter-metrics
        name: dcgm-exporter-metrics
      - hostPath:
          path: /var/lib/kubelet/pod-resources
          type: Directory
        name: pod-resources
      - hostPath:
          path: /var/lib/kubelet/device-plugins
          type: Directory
        name: device-plugins
      - hostPath:
          path: /etc/nvidia
          type: DirectoryOrCreate
        name: nvidia-config
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
status:
  currentNumberScheduled: 1
  desiredNumberScheduled: 1
  numberAvailable: 1
  numberMisscheduled: 0
  numberReady: 1
  observedGeneration: 1
  updatedNumberScheduled: 1
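# A minimal sketch for verifying the exporter on a live cluster (assumes
# kubectl access; names and ports are taken from the manifest above):
#   kubectl -n gke-managed-system get daemonset dcgm-exporter
#   kubectl -n gke-managed-system port-forward daemonset/dcgm-exporter 9400:9400
#   curl -s localhost:9400/metrics | grep '^DCGM_FI'
# Per the doc linked at the top, these metrics should also appear in Cloud
# Monitoring under the prometheus.googleapis.com/DCGM_FI_* prefix.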