Created
April 4, 2025 07:45
-
-
Save fabriziosalmi/34b86ac785e44c0af12ca1593fe902a1 to your computer and use it in GitHub Desktop.
monitor_k8s.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# --- Configuration ---
# Namespaces listed here are left out of the Application Namespace Overview.
# Besides the Kubernetes system namespaces, common CNI/operator namespaces are included.
EXCLUDE_NAMESPACES=(
  "kube-system"
  "kube-public"
  "kube-node-lease"
  "local-path-storage"
  "kube-flannel"
  "calico-system"
  "tigera-operator"
)

# --- Terminal Colors ---
# Stored as literal backslash escapes; they are interpreted later by
# `echo -e` or by a printf format string.
RESET='\033[0m'
BOLD='\033[1m'
DIM='\033[2m'
RED='\033[0;31m'
LRED='\033[1;31m'
GREEN='\033[0;32m'
LGREEN='\033[1;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
LGRAY='\033[0;37m'
WHITE='\033[1;37m'

# --- State ---
# Set to true by check_metrics_server when `kubectl top nodes` returns data.
METRICS_AVAILABLE=false
# --- Helper Functions --- | |
# Prints a section banner ("=== title ===") preceded by a blank line.
# Arguments: $1 - section title
print_header() {
  local banner="\n${BLUE}${BOLD}=== $1 ===${RESET}"
  printf '%b\n' "$banner"
}
# Prints a sub-section banner ("--- title ---").
# Arguments: $1 - sub-section title
print_subheader() {
  local banner="${MAGENTA}--- $1 ---${RESET}"
  printf '%b\n' "$banner"
}
# Prints a yellow warning line to stdout.
# Arguments: $1 - message (backslash escapes such as color codes are interpreted)
log_warn() {
  local prefix="${YELLOW}${BOLD}⚠️ WARNING:${RESET}"
  printf '%b\n' "${prefix}${YELLOW} $1${RESET}"
}
# Prints a red error line to stdout.
# Arguments: $1 - message (backslash escapes such as color codes are interpreted)
log_error() {
  local prefix="${RED}${BOLD}❌ ERROR:${RESET}"
  printf '%b\n' "${prefix}${RED} $1${RESET}"
}
# Prints an informational line to stdout; only the prefix is colored.
# Arguments: $1 - message (backslash escapes such as color codes are interpreted)
log_info() {
  local prefix="${CYAN}ℹ️ INFO:${RESET}"
  printf '%b\n' "${prefix} $1"
}
# Checks that a command is resolvable by the shell and warns when it is not.
# Arguments: $1 - command name
# Returns:   0 when the command exists, 1 otherwise
check_command() {
  command -v "$1" > /dev/null 2>&1 && return 0
  log_warn "Command '${BOLD}$1${RESET}${YELLOW}' not found. Some features might be unavailable."
  return 1
}
# Verifies that kubectl can reach the cluster; terminates the script with
# exit status 1 (after logging an error) when it cannot.
check_kubectl() {
  kubectl cluster-info > /dev/null 2>&1 && return
  log_error "Cannot connect to Kubernetes cluster via kubectl."
  exit 1
}
# Prints a status word in light green when it equals the expected status and
# in light red otherwise.
# Arguments: $1 - observed status; "True"/"False" (as produced by the node
#                 Ready condition in custom-columns) are shown as
#                 "Ready"/"NotReady"
#            $2 - expected status (default: Ready)
print_status() {
  local shown="$1"
  local wanted="${2:-Ready}"
  local paint="${LRED}"

  case "$shown" in
    True) shown="Ready" ;;
    False) shown="NotReady" ;;
  esac

  if [[ "$shown" == "$wanted" ]]; then
    paint="${LGREEN}"
  fi
  echo -e "${paint}${shown}${RESET}"
}
# Detects whether the Metrics Server is installed and actually reporting,
# and records the result in the global METRICS_AVAILABLE (true/false).
check_metrics_server() {
  METRICS_AVAILABLE=false

  # Without the APIService there is nothing to query.
  if ! kubectl get apiservice v1beta1.metrics.k8s.io -o name > /dev/null 2>&1; then
    log_warn "Metrics Server API service (v1beta1.metrics.k8s.io) not found. Resource usage stats skipped."
    log_warn "Install: kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml"
    return
  fi

  # The API can be registered before data is served (e.g. right after install),
  # so require that `kubectl top nodes` yields at least one line with a digit.
  if kubectl top nodes --no-headers 2>/dev/null | head -n 1 | grep -q '[0-9]'; then
    METRICS_AVAILABLE=true
    log_info "Metrics Server detected and reporting."
  else
    log_warn "Metrics Server API found, but 'kubectl top nodes' failed or returned no data. Metrics may be starting or unhealthy."
  fi
}
# --- Initial Checks ---
# Warn about every missing tool; a missing tool only produces a warning here,
# the hard stop happens in check_kubectl when the cluster is unreachable.
for required_tool in kubectl awk sed grep sort head tail wc cut printf date; do
  check_command "$required_tool"
done
# Optional: check_command jq
check_kubectl
check_metrics_server
# --- Cluster Overview ---
# Reports the API endpoint and the server's Kubernetes version.

#######################################
# Extracts serverVersion.gitVersion from `kubectl version -o json` output
# without needing jq. Only a gitVersion that follows the "serverVersion" key is
# accepted, so the client's version is never reported as the server's.
# Arguments: none (JSON text is read from stdin)
# Outputs:   the server gitVersion on stdout; nothing when it is absent
#######################################
server_git_version_from_json() {
  awk '
    /"serverVersion"[ \t]*:/ { in_server = 1 }
    in_server && /"gitVersion"[ \t]*:/ {
      # Greedy match: on single-line JSON this lands on the last gitVersion,
      # which is the one inside serverVersion.
      sub(/.*"gitVersion"[ \t]*:[ \t]*"/, "")
      sub(/".*/, "")
      print
      exit
    }'
}

print_header "Cluster Overview"
K8S_SERVER_VERSION=""; K8S_VERSION_JSON=""
if K8S_VERSION_JSON=$(kubectl version -o json 2>/dev/null); then
  if check_command jq; then
    # "// empty" keeps a missing serverVersion from turning into the string "null".
    K8S_SERVER_VERSION=$(printf '%s\n' "$K8S_VERSION_JSON" | jq -r '.serverVersion.gitVersion // empty' 2>/dev/null)
  fi
  # Fallback parsing if jq failed or is not present
  if [ -z "$K8S_SERVER_VERSION" ]; then
    K8S_SERVER_VERSION=$(printf '%s\n' "$K8S_VERSION_JSON" | server_git_version_from_json)
  fi
fi
# Final fallback if the JSON output could not be obtained or parsed
if [ -z "$K8S_SERVER_VERSION" ]; then
  K8S_SERVER_VERSION=$(kubectl version 2>/dev/null | awk '/Server Version:/ { print $3; exit }')
fi
K8S_API_ENDPOINT=$(kubectl cluster-info 2>/dev/null | awk '/Kubernetes control plane/ && /is running at/ { print $NF }')
echo -e " ${BOLD}API Endpoint:${RESET}\t${CYAN}${K8S_API_ENDPOINT:-N/A}${RESET}"
echo -e " ${BOLD}Server Version:${RESET}\t${CYAN}${K8S_SERVER_VERSION:-N/A}${RESET}"
# --- Node Status ---
# Lists every node with readiness, roles, version, internal IP and OS image,
# plus CPU/memory usage columns when the Metrics Server is reporting.

# Lookup tables keyed by node name. Declared up front so that subscripts such
# as "node-1" are always treated as associative keys, never as arithmetic.
declare -A NODE_ROLES NODE_CPU_USAGE NODE_MEM_USAGE CPU_PERCENT MEM_PERCENT

#######################################
# Prints one formatted row of the node table.
# Globals:   METRICS_AVAILABLE, NODE_ROLES, NODE_CPU_USAGE, CPU_PERCENT,
#            NODE_MEM_USAGE, MEM_PERCENT (read)
# Arguments: $1 - one line of the custom-columns node listing
#                 (NAME STATUS ROLES VERSION INTERNAL-IP OS-IMAGE...)
# Outputs:   the formatted row on stdout
#######################################
print_node_row() {
  local name status_val roles version internal_ip os_image status_colored
  local cpu_val cpu_p_val mem_val mem_p_val cpu_p_disp mem_p_disp

  # os_image is the last variable, so it receives the rest of the line
  # (the OS image usually contains spaces).
  read -r name status_val roles version internal_ip os_image <<< "$1"
  [[ -n "$name" ]] || return 0

  # Prefer the roles kubectl itself computes (node-role.kubernetes.io/* labels);
  # fall back to the label-based custom column.
  roles="${NODE_ROLES[$name]:-$roles}"
  if [[ "$roles" == "<none>" || -z "$roles" ]]; then roles="<worker>"; fi

  status_colored=$(print_status "$status_val") # True/False -> Ready/NotReady + color

  # Note: color codes affect alignment; widths below are approximate visual guides.
  printf " %-25s %-18b %-20s %-15s %-15s %-30.30s" \
    "$name" "$status_colored" "$roles" "$version" "$internal_ip" "$os_image"

  if $METRICS_AVAILABLE; then
    cpu_val="${NODE_CPU_USAGE[$name]:-N/A}"; cpu_p_val="${CPU_PERCENT[$name]:-N/A}"
    mem_val="${NODE_MEM_USAGE[$name]:-N/A}"; mem_p_val="${MEM_PERCENT[$name]:-N/A}"
    # Add the '%' suffix only to real values
    if [[ "$cpu_p_val" != "N/A" ]]; then cpu_p_disp="${cpu_p_val}%"; else cpu_p_disp="N/A"; fi
    if [[ "$mem_p_val" != "N/A" ]]; then mem_p_disp="${mem_p_val}%"; else mem_p_disp="N/A"; fi
    printf " %-10s %-5s %-8s %-5s" "$cpu_val" "$cpu_p_disp" "$mem_val" "$mem_p_disp"
  fi
  printf "\n"
}

print_header "Node Status"
# Use custom-columns; note the escaped dot in the label selector
NODE_OUTPUT_LINES=$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,STATUS:.status.conditions[?(@.type=="Ready")].status,ROLES:.metadata.labels."kubernetes\.io/role",VERSION:.status.nodeInfo.kubeletVersion,INTERNAL-IP:.status.addresses[?(@.type=="InternalIP")].address,OS-IMAGE:.status.nodeInfo.osImage' --no-headers 2>/dev/null)
if [ -z "$NODE_OUTPUT_LINES" ]; then
  log_warn "No nodes found or failed to get node information."
else
  NUM_NODES=$(grep -c '' <<< "$NODE_OUTPUT_LINES")
  # The STATUS column holds the Ready condition value: "True" or "False"
  NUM_READY=$(awk '$2 == "True" { c++ } END { print c + 0 }' <<< "$NODE_OUTPUT_LINES")
  NUM_NOT_READY=$((NUM_NODES - NUM_READY))
  STATUS_COLOR="${LGREEN}"; [[ "$NUM_NOT_READY" -gt 0 ]] && STATUS_COLOR="${LRED}"
  echo -e " ${BOLD}Total Nodes:${RESET}\t${NUM_NODES} (${LGREEN}${NUM_READY} Ready${RESET}, ${STATUS_COLOR}${NUM_NOT_READY} NotReady${RESET})"

  # Roles as shown by the default `kubectl get nodes` table (column 3).
  while read -r node_name _ node_roles _; do
    if [[ -n "$node_name" && -n "$node_roles" ]]; then NODE_ROLES["$node_name"]="$node_roles"; fi
  done < <(kubectl get nodes --no-headers 2>/dev/null)

  # Pre-fetch metrics if available
  if $METRICS_AVAILABLE; then
    while read -r node_name cpu cpu_p mem mem_p _; do
      # Skip lines with fewer than the five expected fields (e.g. error messages)
      [[ -n "$mem_p" ]] || continue
      NODE_CPU_USAGE["$node_name"]="${cpu:-N/A}"; CPU_PERCENT["$node_name"]="${cpu_p%\%}"
      NODE_MEM_USAGE["$node_name"]="${mem:-N/A}"; MEM_PERCENT["$node_name"]="${mem_p%\%}"
    done < <(kubectl top nodes --no-headers 2>/dev/null)
  fi

  print_subheader "Node Details"
  # Adjust padding slightly for potentially longer colored status
  printf " ${BOLD}%-25s %-18s %-20s %-15s %-15s %-30s${RESET}" "NAME" "STATUS" "ROLES" "VERSION" "INTERNAL-IP" "OS-IMAGE"
  if $METRICS_AVAILABLE; then printf " ${BOLD}%-10s %-5s %-8s %-5s${RESET}" "CPU(c)" "CPU%" "MEM(Mi)" "MEM%"; fi
  printf "\n"
  while IFS= read -r line; do
    print_node_row "$line"
  done <<< "$NODE_OUTPUT_LINES"
fi
# --- Control Plane Health ---
# Tries /readyz?verbose first, then /healthz, then the (deprecated)
# componentstatuses API. The verdict is stored in HEALTH_STATUS
# (Healthy / Unhealthy / Unknown).

#######################################
# Filters `kubectl get componentstatuses --no-headers` output down to the
# components whose STATUS column is not exactly "Healthy". An exact column
# comparison is required because "Unhealthy" contains the substring "Healthy".
# Arguments: none (listing is read from stdin)
# Outputs:   the unhealthy lines on stdout
#######################################
unhealthy_component_lines() {
  awk 'NF && $2 != "Healthy"'
}

print_header "Control Plane Health"
# Use verbose readyz endpoint first
HEALTH_OUTPUT=$(kubectl get --raw='/readyz?verbose' 2>/dev/null); HEALTH_STATUS="Unknown"
if [[ -n "$HEALTH_OUTPUT" ]]; then
  # Exclude specific known "informational" non-ready items if needed, e.g., 'informer-sync'
  # UNHEALTHY_LINES=$(echo "$HEALTH_OUTPUT" | grep '\[-\]' | grep -v 'informer-sync') # Example exclusion
  UNHEALTHY_LINES=$(grep '\[-\]' <<< "$HEALTH_OUTPUT")
  if [[ -n "$UNHEALTHY_LINES" ]]; then
    HEALTH_STATUS="Unhealthy"
    echo -e "${LRED}${BOLD}❌ /readyz reports unhealthy components:${RESET}"
    sed 's/^/ /' <<< "$UNHEALTHY_LINES"
  elif grep -q '\[+\]' <<< "$HEALTH_OUTPUT"; then
    HEALTH_STATUS="Healthy"
    echo -e "${LGREEN}✅ /readyz reports healthy.${RESET}"
  else
    # Output exists but has neither [+] nor [-] markers (unlikely but possible)
    log_warn "Could not determine health from /readyz output. Output present but format unexpected."
    echo "$HEALTH_OUTPUT"
  fi
else # Fallback checks if /readyz failed
  HEALTH_OUTPUT=$(kubectl get --raw='/healthz' 2>/dev/null)
  if [[ "$HEALTH_OUTPUT" == "ok" ]]; then
    HEALTH_STATUS="Healthy"
    echo -e "${LGREEN}✅ /healthz reports healthy.${RESET}"
    log_info "(Use '/readyz?verbose' for component details if available)"
  else
    # healthz failed or returned non-"ok", try componentstatuses (often deprecated/removed, but worth a try)
    log_warn "Could not fetch health via /readyz or /healthz. Checking componentstatuses (may be deprecated)..."
    if CS_OUTPUT=$(kubectl get componentstatuses --no-headers 2>/dev/null); then
      CS_UNHEALTHY=$(printf '%s\n' "$CS_OUTPUT" | unhealthy_component_lines)
      if [[ -n "$CS_UNHEALTHY" ]]; then
        HEALTH_STATUS="Unhealthy"
        echo -e "${LRED}${BOLD}❌ Componentstatuses report unhealthy:${RESET}"
        sed 's/^/ /' <<< "$CS_UNHEALTHY"
      elif printf '%s\n' "$CS_OUTPUT" | awk '$2 == "Healthy" { found = 1 } END { exit !found }'; then
        # At least one component was listed and none was unhealthy
        HEALTH_STATUS="Healthy"
        echo -e "${LGREEN}✅ Componentstatuses report healthy.${RESET}"
      else
        # Command succeeded but returned no lines (e.g., API disabled)
        log_warn "Got empty response from componentstatuses. Health status remains Unknown."
      fi
    else
      log_warn "Cannot get componentstatuses API. Health check inconclusive."
    fi
  fi
fi
# --- Core Addon Status ---
print_header "Core Addon Status"

#######################################
# Prints the "ready/total" ratio of a workload. Kubernetes omits count fields
# that are zero (e.g. readyReplicas when no pod is ready), which makes the raw
# jsonpath output look like "/2"; missing sides are reported as 0.
# Arguments: $1 - resource kind, $2 - resource name, $3 - namespace,
#            $4 - jsonpath template yielding "<ready>/<total>"
# Outputs:   normalized ratio (or the raw text when it is not ratio-shaped)
# Returns:   non-zero when kubectl cannot fetch the resource
#######################################
workload_ready_ratio() {
  local kind="$1" name="$2" ns="$3" template="$4" raw
  raw=$(kubectl get "$kind" "$name" -n "$ns" -o jsonpath="$template" 2>/dev/null) || return 1
  if [[ "$raw" =~ ^([0-9]*)/([0-9]*)$ ]]; then
    printf '%s/%s' "${BASH_REMATCH[1]:-0}" "${BASH_REMATCH[2]:-0}"
  else
    printf '%s' "$raw"
  fi
}

# CoreDNS Check (kube-system namespace assumed)
print_subheader "CoreDNS"
# Try Deployment first (common) then DaemonSet (less common)
DNS_READY=0; DNS_TOTAL=0; DNS_TYPE="Deployment"; DNS_STATUS="N/A"
if DNS_STATUS=$(workload_ready_ratio deployment coredns kube-system '{.status.readyReplicas}/{.status.replicas}'); then
  DNS_TYPE="Deployment"
elif DNS_STATUS=$(workload_ready_ratio daemonset coredns kube-system '{.status.numberReady}/{.status.desiredNumberScheduled}'); then
  DNS_TYPE="DaemonSet"
else
  DNS_STATUS="N/A (Not Found)"
fi
if [[ "$DNS_STATUS" =~ ^[0-9]+/[0-9]+$ ]]; then # Check format is "X/Y"
  DNS_READY="${DNS_STATUS%%/*}"
  DNS_TOTAL="${DNS_STATUS##*/}"
  if [[ "$DNS_READY" -gt 0 ]] && [[ "$DNS_READY" -eq "$DNS_TOTAL" ]]; then
    echo -e " ${LGREEN}✅ CoreDNS ${DNS_TYPE} ready (${DNS_STATUS} replicas/pods).${RESET}"
  else
    echo -e " ${LRED}❌ CoreDNS ${DNS_TYPE} status: ${DNS_STATUS} ready${RESET}"
    kubectl get pods -n kube-system -l k8s-app=kube-dns --no-headers 2>/dev/null |
      awk '{printf " Pod: %-40s Status: %-10s Restarts: %s\n", $1, $3, $4}'
  fi
elif [[ "$DNS_STATUS" == "N/A (Not Found)" ]]; then
  log_warn "CoreDNS Deployment/DaemonSet not found in kube-system namespace."
else
  log_warn "Could not parse CoreDNS status: ${DNS_STATUS}"
fi
# CNI Check (Example: Calico in kube-system or calico-system)
# Adapt the namespaces and resource names if using a different CNI (Flannel, Cilium, etc.)

#######################################
# Finds the namespace that actually hosts Calico by probing for its workloads.
# Testing only whether a namespace exists is not enough: kube-system always
# exists, so operator-based installs in calico-system would never be found.
# Outputs:   the first namespace containing the calico-node DaemonSet or the
#            calico-kube-controllers Deployment
# Returns:   1 when no candidate namespace contains either workload
#######################################
detect_calico_namespace() {
  local ns
  for ns in kube-system calico-system tigera-operator; do
    if kubectl get daemonset calico-node -n "$ns" -o name > /dev/null 2>&1 ||
      kubectl get deployment calico-kube-controllers -n "$ns" -o name > /dev/null 2>&1; then
      printf '%s' "$ns"
      return 0
    fi
  done
  return 1
}

#######################################
# Prints the "ready/total" ratio of a Calico workload in $CALICO_NS.
# Count fields that are zero are omitted by the API (output like "/3"), so
# missing sides are reported as 0.
# Globals:   CALICO_NS (read)
# Arguments: $1 - resource kind, $2 - resource name,
#            $3 - jsonpath template yielding "<ready>/<total>"
# Outputs:   normalized ratio, or "N/A" when the resource cannot be fetched
#######################################
calico_ready_ratio() {
  local raw
  raw=$(kubectl get "$1" "$2" -n "$CALICO_NS" -o jsonpath="$3" 2>/dev/null) || { printf 'N/A'; return 0; }
  if [[ "$raw" =~ ^([0-9]*)/([0-9]*)$ ]]; then
    printf '%s/%s' "${BASH_REMATCH[1]:-0}" "${BASH_REMATCH[2]:-0}"
  else
    printf '%s' "$raw"
  fi
}

print_subheader "CNI (Calico Example)"
if CALICO_NS=$(detect_calico_namespace); then
  if [[ "$CALICO_NS" != "kube-system" ]]; then
    log_info "Detected Calico components in '${CALICO_NS}' namespace."
  fi
else
  CALICO_NS="" # Ensure skip below
  log_warn "No Calico components found in 'kube-system', 'calico-system' or 'tigera-operator'. Skipping Calico check or adapt script for your CNI setup."
fi

if [[ -n "$CALICO_NS" ]]; then
  # Check calico-node DaemonSet
  CALICO_NODE_STATUS=$(calico_ready_ratio daemonset calico-node '{.status.numberReady}/{.status.desiredNumberScheduled}')
  CALICO_NODE_READY=0; CALICO_NODE_DESIRED=0
  if [[ "$CALICO_NODE_STATUS" =~ ^[0-9]+/[0-9]+$ ]]; then
    CALICO_NODE_READY="${CALICO_NODE_STATUS%%/*}"
    CALICO_NODE_DESIRED="${CALICO_NODE_STATUS##*/}"
  elif [[ "$CALICO_NODE_STATUS" == "N/A" ]]; then
    log_warn "Calico DaemonSet 'calico-node' not found in namespace '${CALICO_NS}'."
  fi

  # Check calico-kube-controllers Deployment
  CALICO_CTRL_STATUS=$(calico_ready_ratio deployment calico-kube-controllers '{.status.readyReplicas}/{.status.replicas}')
  CALICO_CTRL_READY=0; CALICO_CTRL_TOTAL=0
  if [[ "$CALICO_CTRL_STATUS" =~ ^[0-9]+/[0-9]+$ ]]; then
    CALICO_CTRL_READY="${CALICO_CTRL_STATUS%%/*}"
    CALICO_CTRL_TOTAL="${CALICO_CTRL_STATUS##*/}"
  elif [[ "$CALICO_CTRL_STATUS" == "N/A" ]]; then
    log_warn "Calico Deployment 'calico-kube-controllers' not found in namespace '${CALICO_NS}'."
  fi

  # A component is healthy when it wants at least one pod and all of them are ready
  NODE_HEALTHY=false; CTRL_HEALTHY=false
  [[ "$CALICO_NODE_DESIRED" -gt 0 && "$CALICO_NODE_READY" -eq "$CALICO_NODE_DESIRED" ]] && NODE_HEALTHY=true
  [[ "$CALICO_CTRL_TOTAL" -gt 0 && "$CALICO_CTRL_READY" -eq "$CALICO_CTRL_TOTAL" ]] && CTRL_HEALTHY=true

  # Report only if something was found; a missing component does not count against health
  if [[ "$CALICO_NODE_STATUS" != "N/A" || "$CALICO_CTRL_STATUS" != "N/A" ]]; then
    if { $NODE_HEALTHY || [[ "$CALICO_NODE_STATUS" == "N/A" ]]; } &&
      { $CTRL_HEALTHY || [[ "$CALICO_CTRL_STATUS" == "N/A" ]]; }; then
      echo -e " ${LGREEN}✅ Calico components appear healthy (Nodes: ${CALICO_NODE_STATUS:-Not Found}, Controllers: ${CALICO_CTRL_STATUS:-Not Found}).${RESET}"
    else
      echo -e " ${LRED}❌ Calico status issues detected in namespace '${CALICO_NS}':${RESET}"
      if [[ "$CALICO_NODE_STATUS" != "N/A" ]]; then
        echo -e " DaemonSet 'calico-node': ${CALICO_NODE_STATUS} Ready/Desired"
      fi
      if [[ "$CALICO_CTRL_STATUS" != "N/A" ]]; then
        echo -e " Deployment 'calico-kube-controllers': ${CALICO_CTRL_STATUS} Ready/Total"
      fi
      # Show problem pods only if issues detected
      kubectl get pods -n "$CALICO_NS" -l 'k8s-app in (calico-node, calico-kube-controllers)' --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null |
        awk '{printf " Problem Pod: %-40s Status: %-10s\n", $1, $3}'
    fi
  fi # End if something was found
fi # End if Calico components were located
# --- Application Namespace Overview ---
# One row per non-excluded namespace: pod counts by phase and the summed
# ready/desired replicas of its Deployments.

#######################################
# Counts the lines of a phase listing that contain the given phase as a word.
# Arguments: $1 - newline-separated pod phases, $2 - phase to count
# Outputs:   the count (0 when there is no match)
#######################################
count_phase() {
  grep -cw -- "$2" <<< "$1"
}

#######################################
# Sums the READY column ("x/y") of `kubectl get deployments --no-headers`.
# Arguments: none (listing is read from stdin)
# Outputs:   "<ready>/<total>" without a trailing newline; "0/0" for empty input
#######################################
sum_deployment_readiness() {
  awk '
    {
      split($2, rep, "/") # $2 is the READY column like "1/1"
      # Only add when both parts look like numbers
      if (rep[1] ~ /^[0-9]+$/ && rep[2] ~ /^[0-9]+$/) {
        ready += rep[1]
        total += rep[2]
      }
    }
    END { printf "%d/%d", ready, total }'
}

print_header "Application Namespace Overview"
ALL_NAMESPACES=$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)
APPS_FOUND_COUNT=0
if [ -z "$ALL_NAMESPACES" ]; then
  log_warn "Failed to retrieve namespaces."
else
  printf " ${BOLD}%-30s %-10s %-10s %-10s %-10s %-15s${RESET}\n" "NAMESPACE" "RUNNING" "PENDING" "FAILED" "SUCCEEDED" "DEPLOYMENTS"
  # The loop reads from a here-string (not a pipe) so APPS_FOUND_COUNT survives it
  while IFS= read -r ns; do
    exclude=false
    for excluded_ns in "${EXCLUDE_NAMESPACES[@]}"; do
      if [[ "$ns" == "$excluded_ns" ]]; then exclude=true; break; fi
    done
    if $exclude; then continue; fi
    APPS_FOUND_COUNT=$((APPS_FOUND_COUNT + 1))

    # Pod phases for the namespace, one per line
    pod_statuses=$(kubectl get pods -n "$ns" --no-headers -o custom-columns=STATUS:.status.phase 2>/dev/null)
    running_pods=$(count_phase "$pod_statuses" "Running")
    pending_pods=$(count_phase "$pod_statuses" "Pending")
    failed_pods=$(count_phase "$pod_statuses" "Failed")
    succeeded_pods=$(count_phase "$pod_statuses" "Succeeded")

    # Deployment status (Ready/Total); "0/0" when there are none or kubectl fails
    deploy_status=$(kubectl get deployments -n "$ns" --no-headers 2>/dev/null | sum_deployment_readiness)
    ready_deps="${deploy_status%%/*}"
    total_deps="${deploy_status##*/}"
    # Ensure values are numeric before arithmetic; default to 0 if not
    [[ "$ready_deps" =~ ^[0-9]+$ ]] || ready_deps=0
    [[ "$total_deps" =~ ^[0-9]+$ ]] || total_deps=0

    # Namespace color: red beats yellow beats green
    ns_color="${GREEN}"
    if [[ "$pending_pods" -gt 0 ]]; then ns_color="${YELLOW}"; fi
    if [[ "$failed_pods" -gt 0 ]]; then ns_color="${RED}"; fi

    # Deployment color: gray for none, yellow when not all replicas are ready
    dep_status_color="${GREEN}"
    if [[ "$total_deps" -eq 0 ]]; then
      dep_status_color="${LGRAY}"
    elif [[ "$ready_deps" -lt "$total_deps" ]]; then
      dep_status_color="${YELLOW}"
    fi

    printf " ${ns_color}%-30s${RESET} %-10s %-10s %-10s %-10s ${dep_status_color}%-15s${RESET}\n" \
      "$ns" "$running_pods" "$pending_pods" "$failed_pods" "$succeeded_pods" "$deploy_status"
  done <<< "$ALL_NAMESPACES"

  # Reported once, and only when namespaces were actually retrieved
  if [[ $APPS_FOUND_COUNT -eq 0 ]]; then
    log_info "No application namespaces found (excluding system namespaces like ${EXCLUDE_NAMESPACES[*]})."
  fi
fi
# --- Resource Usage Summary (Cluster Wide - Nodes) ---

#######################################
# Summarizes `kubectl top nodes --no-headers` output into total and per-node
# average CPU (millicores) and memory.
# Globals:   BOLD, RESET (read; passed into awk because shell variables are
#            not expanded inside the single-quoted awk program)
# Arguments: none (listing is read from stdin)
# Outputs:   a small table on stdout, or a notice when no row was usable
#######################################
summarize_top_nodes() {
  local bold reset
  # Turn the stored "\033[..." escapes into real bytes before handing them to awk.
  bold=$(printf '%b' "${BOLD}")
  reset=$(printf '%b' "${RESET}")
  awk -v bold="$bold" -v reset="$reset" '
    # Rows without numeric usage (e.g. "<unknown>") carry no data to aggregate.
    $2 !~ /^[0-9]/ { next }
    {
      # CPU: strip the millicore suffix
      cpu = $2; sub(/m$/, "", cpu); t_cpu += cpu

      # Memory: normalize Gi/Ki to Mi
      mem = $4
      if (mem ~ /Gi$/)      { sub(/Gi$/, "", mem); mem = mem * 1024 }
      else if (mem ~ /Ki$/) { sub(/Ki$/, "", mem); mem = mem / 1024 }
      else                  { sub(/Mi$/, "", mem) } # Assume Mi otherwise
      t_mem += mem
      n++ # Count nodes processed
    }
    END {
      if (n > 0) {
        printf " %s%-20s %-15s %-15s%s\n", bold, "RESOURCE", "TOTAL USAGE", "AVG PER NODE", reset
        printf " %-20s %-15s %-15s\n", "CPU (Cores)", t_cpu "m", int(t_cpu / n) "m"
        # Show the total in GiB once it is large enough; the average stays in MiB
        if (t_mem > 2048) {
          t_mem_disp = sprintf("%.1f GiB", t_mem / 1024)
          avg_mem_disp = sprintf("%.1f MiB", t_mem / n)
        } else {
          t_mem_disp = int(t_mem) " MiB"
          avg_mem_disp = int(t_mem / n) " MiB"
        }
        printf " %-20s %-15s %-15s\n", "Memory", t_mem_disp, avg_mem_disp
      } else {
        print " No node metrics available (kubectl top nodes returned no processable data)."
      }
    }'
}

if $METRICS_AVAILABLE; then
  print_header "Cluster Resource Usage (Nodes)"
  TOP_NODES_OUTPUT=$(kubectl top nodes --no-headers 2>/dev/null)
  if [[ -n "$TOP_NODES_OUTPUT" ]]; then
    printf '%s\n' "$TOP_NODES_OUTPUT" | summarize_top_nodes
  else
    log_warn "kubectl top nodes returned no data, cannot calculate cluster resource usage."
  fi
else
  print_header "Cluster Resource Usage (Nodes)"; log_info "Skipped. Install metrics-server for node resource usage."
fi
# --- Recent Events ---
# Shows the last 10 non-Normal events across all namespaces.

#######################################
# Prints one formatted, colored event row.
# Globals:   YELLOW, LRED, RESET (read)
# Arguments: $1 - one event line with the whitespace-separated fields
#                 LAST_SEEN TYPE REASON NAMESPACE KIND NAME MESSAGE...
# Outputs:   the formatted row on stdout
#######################################
print_event_row() {
  local last_seen type reason namespace kind name message object color
  # message is the last variable, so it receives the rest of the line
  read -r last_seen type reason namespace kind name message <<< "$1"
  [[ -n "$last_seen" ]] || return 0
  object="${kind}/${name}"

  # Warning color by default; error color for the more severe reasons
  color="${YELLOW}"
  if [[ "$type" == *"Error"* || "$reason" == *"Failed"* || "$reason" == *"Error"* || "$reason" == *"Unhealthy"* ]]; then
    color="${LRED}"
  fi

  # Truncate the message so rows stay readable
  if [ ${#message} -gt 80 ]; then message="${message:0:77}..."; fi

  # %-30.30s also truncates over-long object names
  printf " ${color}%-26s %-10s %-18s %-20s %-30.30s %s${RESET}\n" \
    "$last_seen" "$type" "$reason" "$namespace" "$object" "$message"
}

print_header "Recent Warning/Error Events (Last 10)"
# Each custom column takes a single JSONPath expression, so the involved
# object's kind and name are fetched separately and joined as "kind/name" when printing.
EVENT_COLS="LAST_SEEN:.lastTimestamp,TYPE:.type,REASON:.reason,NAMESPACE:.metadata.namespace,KIND:.involvedObject.kind,NAME:.involvedObject.name,MESSAGE:.message"
# Get last 10 non-Normal events across all namespaces, sorted by time
# Using sort-by='.metadata.creationTimestamp' might be slightly more reliable than lastTimestamp sometimes
EVENTS=$(kubectl get events --sort-by='.lastTimestamp' --field-selector type!=Normal -A -o custom-columns="${EVENT_COLS}" --no-headers 2>/dev/null | tail -n 10)
if [ -n "$EVENTS" ]; then
  printf " ${BOLD}%-26s %-10s %-18s %-20s %-30s %s${RESET}\n" "LAST_SEEN" "TYPE" "REASON" "NAMESPACE" "OBJECT" "MESSAGE"
  while IFS= read -r line; do
    print_event_row "$line"
  done <<< "$EVENTS"
else
  log_info "No recent Warning or Error events found."
fi
# Closing banner stamped with the completion time; the report always ends
# with a success exit status.
echo -e "\n${BLUE}${BOLD}=== Monitoring Complete ($(date)) ===${RESET}"
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment