openshift notepad

node debug pod with custom image (tcpdump etc)

oc debug node/ip-1-2-3-4.internal --image=registry.tld/namespace/image:tag
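
a minimal capture sketch from inside such a debug pod (it runs in the host network namespace, and the host filesystem is mounted at /host); interface, port and output path are placeholders:

tcpdump -i any -nn -s0 port 6443 -w /host/var/tmp/capture.pcap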
echo '{ "propagationPolicy": "Background" }' | curl -k -X DELETE -d @-  -H "Authorization: Bearer $TOKEN" -H 'Accept: application/json' -H 'Content-Type: application/json'  https://openshift-master.$DOMAIN/api/v1/namespaces/$NAMESPACE/pods/$PODNAME

kill haproxy processes older than 12h

ps -eo pid,etimes,comm,user | awk '$3 ~ /haproxy/ && $2>43200 {print $1}' | xargs -r kill -15

add infra nodeselector to namespace with oc patch

oc patch namespace $NAMESPACE -p '{"metadata":{"annotations":{"openshift.io/node-selector":"region=infra"}}}'
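
to verify, read the annotation back (dots in the key escaped for jsonpath):

oc get namespace $NAMESPACE -o jsonpath='{.metadata.annotations.openshift\.io/node-selector}{"\n"}'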

get a table of canonical nodenames together with their name label:

oc get nodes --output=jsonpath='{range .items[*].metadata}{.name}{" "}{.labels.name}{"\n"}{end}'

translate canonical nodename to inventory name

nn() { oc get node "$1" --output=jsonpath='{.metadata.labels.name}{"\n"}'; }
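
usage, with the canonical node name as argument:

nn ip-1-2-3-4.internal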

add etcd endpoints to cluster-monitoring-config cm

oc -n openshift-monitoring get cm cluster-monitoring-config -oyaml|grep -q " etcd:" || (oc -n openshift-monitoring get cm cluster-monitoring-config -ojson | jq -r '.data["config.yaml"]' | sed -e "/^$/d" ; echo -e "etcd:\n  targets:\n    selector:\n      openshift.io/component: etcd\n      openshift.io/control-plane: \"true\"" ) | oc create configmap cluster-monitoring-config --from-literal=config.yaml="$(cat /dev/stdin)" --dry-run -oyaml | oc -n openshift-monitoring apply -f -

same, but without "jq"

oc -n openshift-monitoring get cm cluster-monitoring-config -oyaml|grep -q " etcd:" || (oc -n openshift-monitoring get cm cluster-monitoring-config -o go-template='{{ range $key, $value := .data}}{{if eq $key "config.yaml"}}{{$value}}{{"\n"}}{{end}}{{end}}' | sed -e "/^$/d" ; echo -e "etcd:\n  targets:\n    selector:\n      openshift.io/component: etcd\n      openshift.io/control-plane: \"true\"" ) | oc create configmap cluster-monitoring-config --from-literal=config.yaml="$(cat /dev/stdin)" --dry-run -oyaml | oc -n openshift-monitoring apply -f -

set master nodeSelector for Prometheus in cluster-monitoring-config cm

oc -n openshift-monitoring get cm cluster-monitoring-config -ojson | jq -r '.data["config.yaml"]' | sed -e '1h;2,$H;$!d;g' -e 's;\(prometheusK8s:\n  baseImage: registry.redhat.io/openshift3/prometheus\n  nodeSelector:\n    node-role.kubernetes.io/\)infra;\1master;'  | oc create configmap cluster-monitoring-config --from-literal=config.yaml="$(cat /dev/stdin)" --dry-run -oyaml | oc -n openshift-monitoring apply  -f -

reload prometheus and alertmanager

oc -n openshift-monitoring exec prometheus-k8s-0 -c prometheus -- curl -X POST http://localhost:9090/-/reload
oc -n openshift-monitoring exec prometheus-k8s-1 -c prometheus -- curl -X POST http://localhost:9090/-/reload
for i in 0 1 2; do oc -n openshift-monitoring exec alertmanager-main-$i -c alertmanager -- curl -X POST http://localhost:9093/-/reload ; done
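
a quick health check after reloading (Prometheus and Alertmanager both expose /-/healthy):

oc -n openshift-monitoring exec prometheus-k8s-0 -c prometheus -- curl -s http://localhost:9090/-/healthy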

prometheus queries for node memory & cpu usage over time, in percent:

100 * (1 - ((avg_over_time(node_memory_MemFree_bytes[24h]) + avg_over_time(node_memory_Cached_bytes[24h]) + avg_over_time(node_memory_Buffers_bytes[24h])) / avg_over_time(node_memory_MemTotal_bytes[24h])))
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
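
to run such a query ad hoc, the Prometheus HTTP API works from inside the pod (a sketch re-using the CPU query above):

oc -n openshift-monitoring exec prometheus-k8s-0 -c prometheus -- curl -sG --data-urlencode 'query=100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)' http://localhost:9090/api/v1/query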

other useful prometheus queries:

(sum by (namespace) (kube_pod_container_resource_limits_cpu_cores))/namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
sum(kube_pod_container_resource_requests_memory_bytes{container!="POD"})
sum(container_memory_rss{container!="POD"})
sort_desc(sum by (namespace) (kube_pod_container_resource_requests_memory_bytes{container!="POD"}/1024/1024))

get specific annotation from object(s) with -o go-template

oc get namespace test-namespace -o go-template='{{ range $key, $value := .metadata.annotations}}{{if eq $key "openshift.io/requester"}}{{$value}}{{"\n"}}{{end}}{{end}}'
oc get pv -o go-template='{{range .items}}{{$pv := .metadata.name}}{{ range $key, $value := .metadata.annotations}}{{if eq $key "foo/pv-release-date"}}{{"PV: "}}{{$pv}}{{", release-date: "}}{{$value}}{{"\n"}}{{end}}{{end}}{{end}}'
oc get sa -A -o go-template='{{range .items}}{{$sa := .metadata.name}}{{$ns := .metadata.namespace}}{{ range $key, $value := .metadata.annotations}}{{if eq $key "eks.amazonaws.com/role-arn"}}{{"SA: "}}{{ $sa }}{{", NS: "}}{{ $ns }}{{", role-arn: "}}{{$value}}{{"\n"}}{{end}}{{end}}{{end}}'
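
for a single known annotation on a single object, plain jsonpath (dots in the key escaped) is shorter:

oc get namespace test-namespace -o jsonpath='{.metadata.annotations.openshift\.io/requester}{"\n"}'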

get all (human) users assigned to roles in namespaces

oc get rolebindings --all-namespaces -o jsonpath='{range .items[?(@.subjects[].kind=="User")]}{.metadata.namespace}{" "}{.roleRef.name}{" "}{.subjects[].name}{"\n"}{end}'

get all roles that are assigned to (human) users

oc get rolebindings --all-namespaces -o jsonpath='{range .items[?(@.subjects[].kind=="User")]}{.roleRef.name}{"\n"}{end}'|sort|uniq

get all clusterrolebindings with subject[kind:User] and subject[kind:Group]

oc get clusterrolebinding.rbac -o jsonpath='{range .items[?(@.subjects[].kind=="User")]}{.roleRef.name}{" "}{.subjects[].name}{"\n"}{end}'
oc get clusterrolebinding.rbac -o jsonpath='{range .items[?(@.subjects[].kind=="Group")]}{.roleRef.name}{" "}{.subjects[].name}{"\n"}{end}'

get all clusterrolebindings that bind the cluster-admin role, print out metrics-style subject, name and binding name

oc get clusterrolebinding -o jsonpath='{range .items[?(@.roleRef.name == "cluster-admin")]}{"\"1|subject="}{.subjects[].kind}{",name="}{.subjects[].name}{",binding="}{.metadata.name}{"\"\n"}{end}'

get PV(C) data (in this case, AWS EFS)

oc get pv -ojsonpath='{range .items[*]}{.spec.claimRef.name}{","}{.metadata.name}{","}{.spec.claimRef.namespace}{","}{.spec.storageClassName}{","}{.spec.nfs.path}{","}{.spec.capacity.storage}{"\n"}{end}' | while IFS="," read -r volumename volumeid namespace storageclass mountpath size [...]

make complete backup of a namespace $NAMESPACE (c0a18b11cadf3c1a9ad201ee9b3729fb76ef8cab is the sha1sum of the empty yaml list)

for r in $(oc api-resources --no-headers --namespaced=true | awk '{print $1}' | xargs); do oc get --export $r -n $NAMESPACE -oyaml > $NAMESPACE.$r.yaml; echo "c0a18b11cadf3c1a9ad201ee9b3729fb76ef8cab  $NAMESPACE.$r.yaml" | sha1sum -c >/dev/null 2>&1  && rm -f $NAMESPACE.$r.yaml ; done
oc get --export -oyaml namespace $NAMESPACE > $NAMESPACE.namespace.yaml
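
a rough restore sketch from such a backup (exported objects may still need manual cleanup of status fields and ownerReferences before they apply):

oc new-project $NAMESPACE
for f in $NAMESPACE.*.yaml; do oc -n $NAMESPACE apply -f "$f"; done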

quickly find on which node there's no openshift-logging pod at all

diff -y <(oc get nodes --output=jsonpath='{range .items[*].metadata}{.name}{"\n"}{end}' | sort) <(oc -n openshift-logging get pod --output=jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | sort | uniq)

print all namespaces which do not have quotas

comm -23 <(oc get namespaces -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | sort) <(oc get quota --all-namespaces -ojsonpath='{range .items[*]}{.metadata.namespace}{"\n"}{end}' | sort -u)
comm -23 <(oc get namespace --no-headers -o custom-columns=NAME:.metadata.name | sort) <(oc get quota --no-headers --all-namespaces -o custom-columns=quota:.metadata.namespace | sort -u)

create some API usage stats based on JSON audit log

sed "s;.*\"requestURI\":\"\([a-zA-Z0-9_/-]\+\).*\"verb\":\"\([a-zA-Z0-9_/-]\+\)\".*;\2: \1;g" tmplog | sort | uniq -c | sort -n | tail -n1000

check cluster state / diagnose a cluster split-brain:

for i in 3 4 6; do NO_PROXY="${NO_PROXY},10.68.36.1${i}" no_proxy="${NO_PROXY}"; echo 10.68.36.1${i}; curl -s -X GET -H "Authorization: Bearer $(oc whoami --show-token)" https://10.68.36.1${i}/api/v1/namespaces/default/endpoints/kubernetes | jq ".subsets[].addresses[].ip"; done

get some info about (in this case: compute) nodes

oc get nodes -l node-role.kubernetes.io/${OSE_ROLE}="true" -o jsonpath='{range .items[*]}{@.metadata.name}{" "}{.metadata.creationTimestamp}{" "}{range @.status.conditions[?(@.type == "Ready")]}{@.status}{"\n"}{end}{end}' | while read -r name creationtimestamp readiness; do [...]

get a list of all container images in use on the cluster, including corresponding namespace, pod and container name

oc get pods --all-namespaces -o go-template='{{range .items}}{{if or (eq .status.phase "Running") (eq .status.phase "Completed")}}{{$status := .status.phase}}{{$namespace := .metadata.namespace}}{{$podname := .metadata.name}}{{range .spec.containers}}{{$namespace}}{{","}}{{$podname}}{{","}}{{.name}}{{","}}{{.image}}{{","}}{{$status}}{{"\n"}}{{end}}{{end}}{{end}}'

get the pod with the IP 5.5.5.5

oc get pods --all-namespaces -o go-template='{{range .items}}{{if and (eq .status.phase "Running") (eq .status.podIP "5.5.5.5")}}{{$namespace := .metadata.namespace}}{{$podname := .metadata.name}}{{$namespace}}{{","}}{{$podname}}{{"\n"}}{{end}}{{end}}'
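
where the API server supports the status.podIP field selector (recent clusters), this is simpler:

oc get pods -A --field-selector=status.podIP=5.5.5.5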

"soft-drain" a node

oc adm manage-node --list-pods $NODENAME --no-headers -ojsonpath='{range .items[?(@.status.phase=="Running")]}{.metadata.name}{" "}{.metadata.namespace}{"\n"}{end}' | egrep -v "openshift-" | while read -r name namespace; do oc -n $namespace delete pod $name --grace-period=300 --wait=false; echo "sleep 20"; sleep 20; done

get node CSR and the corresponding node, sorted by date

oc get csr -ojsonpath='{range .items[*]}{.spec.username}{"\t"}{.metadata.name}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' | sort -k3

same, but print only the last CSR for each node

oc get csr -ojsonpath='{range .items[*]}{.spec.username}{"\t"}{.metadata.name}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' | sort -r -k3 -b | sort -k1,1 -u -b | awk '{print $2}' | xargs
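
the resulting CSR names can be fed straight into approval:

oc get csr -ojsonpath='{range .items[*]}{.spec.username}{"\t"}{.metadata.name}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' | sort -r -k3 -b | sort -k1,1 -u -b | awk '{print $2}' | xargs oc adm certificate approve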

get all used Service nodePorts on the cluster

oc get -A svc --sort-by=.spec.ports[0].nodePort -o go-template='{{range .items}}{{$name := .metadata.name}}{{$namespace := .metadata.namespace}}{{range.spec.ports}}{{if .nodePort}}{{$namespace}}{{" "}}{{$name}}{{" "}}{{.nodePort}}{{"\n"}}{{end}}{{end}}{{end}}'

OCP4

Machine Config Operator (MCO)

force rewrite of configs on node (when stalled or conflicts)

touch /run/machine-config-daemon-force
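
the same non-interactively via a debug pod (node name is a placeholder):

oc debug node/ip-1-2-3-4.internal -- chroot /host touch /run/machine-config-daemon-force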

networkPolicy example with port

apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-specific-port
spec:
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: foo
      ports:
        - port: 12345
          protocol: TCP
  podSelector:
    matchLabels:
      app: foo
  policyTypes:
    - Ingress
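
a quick test sketch, assuming the policy above is saved as allow-specific-port.yaml: apply it, then curl the target port from a pod in a namespace labeled name=foo ($TARGET_NAMESPACE, $CLIENT_NAMESPACE, $POD_IP and the image are placeholders):

oc -n $TARGET_NAMESPACE apply -f allow-specific-port.yaml
oc -n $CLIENT_NAMESPACE run np-test --rm -it --image=registry.access.redhat.com/ubi9/ubi -- curl -m5 http://$POD_IP:12345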

the following policy means: only pods labeled 'name=test-pods' in namespaces labeled 'team=operations' can access pods labeled 'name=test-pods' in this namespace

kind: NetworkPolicy
apiVersion: networking.k8s.io/v1
metadata:
  name: allow-ns-and-pod
spec:
  podSelector:
    matchLabels:
      name: test-pods
  ingress:
    - from:
      - namespaceSelector:
          matchLabels:
            team: operations
        podSelector:
          matchLabels:
            name: test-pods
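
for the policy to match, the source namespace has to carry the label (namespace name is a placeholder):

oc label namespace $SOURCE_NAMESPACE team=operations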