DWS hack for deployment using H100
# Moved these examples to https://github.com/gbrayut/cloud-examples/tree/main/gke-dws
# so it is better to use that repo instead of the files below
# https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#create-node-pool
# https://cloud.google.com/compute/docs/gpus#h100-gpus
gcloud container node-pools create dws-h100 \
--cluster=gke-iowa \
--location=us-central1 \
--enable-queued-provisioning \
--accelerator="type=nvidia-h100-80gb,count=8,gpu-driver-version=latest" \
--machine-type=a3-highgpu-8g \
--enable-autoscaling \
--num-nodes=0 \
--total-max-nodes 10 \
--location-policy=ANY \
--reservation-affinity=none \
--no-enable-autorepair
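# Optional sanity check before applying the manifests below (same cluster/location
# flags as above; output format may differ by gcloud version):
gcloud container node-pools list --cluster=gke-iowa --location=us-central1
gcloud container node-pools describe dws-h100 --cluster=gke-iowa --location=us-central1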
apiVersion: v1
kind: PodTemplate
metadata:
  name: dws-pr-template
  namespace: default
  labels:
    cloud.google.com/apply-warden-policies: "true"
template:
  metadata:
    labels:
      app: dwspod
  spec:
    nodeSelector:
      cloud.google.com/gke-nodepool: dws-h100 #dws-standardcluster-1
    tolerations:
    - key: "nvidia.com/gpu"
      operator: "Exists"
      effect: "NoSchedule"
    containers:
    - name: pi
      image: perl
      command: ["/bin/sh"]
      resources:
        limits:
          cpu: "700m"
          nvidia.com/gpu: "8"
        requests:
          cpu: "700m"
          nvidia.com/gpu: "8"
    restartPolicy: Never
---
apiVersion: autoscaling.x-k8s.io/v1
kind: ProvisioningRequest
metadata:
  name: dws-1hr-pr
  namespace: default
spec:
  provisioningClassName: queued-provisioning.gke.io
  parameters:
    maxRunDurationSeconds: "3600"
  podSets:
  - count: 2
    podTemplateRef:
      name: dws-pr-template
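# One way to create the capacity request: save the PodTemplate and ProvisioningRequest
# above into a single manifest and apply it (the file name here is just an example):
# kubectl apply -f dws-provisioningrequest.yaml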
# kubectl describe provisioningrequest dws-1hr-pr
# provisioning should take ~5 minutes for two a3-highgpu-8g nodes
# NOTE: if the nodes don't have a GPU workload applied (see below) shortly after they are ready,
# the cluster autoscaler could flag them for downscaling. DWS will also expire the ProvisioningRequest
# capacity early if the original workload is removed:
#
# Message: Capacity booking for the Provisioning Request has expired and the nodes are now candidates for scale down when underutilized.
# Reason: BookingExpired
#
# Warning IgnoredInScaleUp 2m11s cluster-autoscaler Unschedulable pod ignored in scale-up loop,
# because it's consuming ProvisioningRequest default/dws-1hr-pr that is in BookingExpired state.
# The pod most likely arrived too late and will never schedule as the VM was already scaled-down.
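# Illustrative commands for watching the request and the nodes it provisions
# (uses the node pool label from above; exact condition names and timing will vary):
# kubectl get provisioningrequest dws-1hr-pr -n default -o yaml
# kubectl get nodes -l cloud.google.com/gke-nodepool=dws-h100 --watch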
apiVersion: apps/v1
kind: Deployment
metadata:
  name: whereami
  labels:
    app: whereami
spec:
  replicas: 2
  # when testing, it is faster to immediately delete existing pods first vs waiting for new/pending/ready transitions
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: whereami
  template:
    metadata:
      labels:
        app: whereami
      annotations:
        # specify that the pod should use DWS provisioned nodes
        cluster-autoscaler.kubernetes.io/consume-provisioning-request: dws-1hr-pr
        cluster-autoscaler.kubernetes.io/provisioning-class-name: "queued-provisioning.gke.io"
    spec:
      terminationGracePeriodSeconds: 0 # faster deletion of test pods
      nodeSelector:
        cloud.google.com/gke-nodepool: dws-h100 #dws-standardcluster-1
      tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
      - key: "cloud.google.com/gke-queued"
        operator: "Exists"
      containers:
      - name: frontend
        image: us-docker.pkg.dev/google-samples/containers/gke/whereami:v1.2.8
        ports:
        - containerPort: 8080
        resources:
          requests:
            cpu: "500m"
            nvidia.com/gpu: "8"
          limits:
            nvidia.com/gpu: "8"
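# Rough usage sketch (file name is just an example): apply the Deployment once the
# ProvisioningRequest reports its capacity as provisioned, then confirm the pods
# landed on the DWS node pool:
# kubectl apply -f dws-deployment.yaml
# kubectl get pods -l app=whereami -o wide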