-
Upgrade & reboot
# Refresh the package index and, if that succeeds, apply all pending upgrades.
apt update && apt full-upgrade
# Reboot as a separate command; written on the same line, "reboot" would be
# handed to apt as an argument instead of being executed.
reboot
Why: pull in latest PVE fixes and kernel.
-
Raise pveproxy FD limit
# Create a systemd drop-in directory for pveproxy.service and write a drop-in
# that sets the service's open-file limit (LimitNOFILE) to 262144.
mkdir -p /etc/systemd/system/pveproxy.service.d/
# Quoted delimiter: the unit text is written literally, with no shell expansion.
cat > /etc/systemd/system/pveproxy.service.d/limits.conf <<'EOF'
[Service]
LimitNOFILE=262144
EOF
# Reload unit definitions so systemd sees the new drop-in, then restart the
# service so the running process is started with the new limit.
systemctl daemon-reload
systemctl restart pveproxy.service
Why: prevents “Too many open files” under heavy API load.
-
Install stack-dump tool
# Install elfutils, which provides the eu-stack backtrace tool.
apt update
apt install elfutils
# Verify the tool is on PATH (command -v is the shell builtin equivalent of
# `which`).
command -v eu-stack # should show /usr/bin/eu-stack
Why: lightweight backtrace (vs full gdb).
-
Deploy watchdog script
# Write the watchdog script to disk. The heredoc delimiter is quoted so that
# nothing inside the script body is expanded while the file is being written.
cat > /usr/local/sbin/pveproxy-watchdog.sh <<'EOF'
#!/bin/bash
# pveproxy watchdog.
#
# Exits 0 when pveproxy is running and its local API endpoint answers.
# Otherwise it records diagnostics under a timestamped directory in
# /var/log/pveproxy-watchdog and restarts pveproxy.service.
set -euo pipefail

LOGDIR=/var/log/pveproxy-watchdog
mkdir -p "$LOGDIR"

# Print a sortable timestamp used in the per-incident directory name.
timestamp() { date '+%Y%m%d-%H%M%S'; }

# Healthy: a pveproxy process exists AND curl gets a response within 5 s.
# curl runs without -f, so any HTTP response (even an error status) counts as
# alive; only connection failures and timeouts are treated as unhealthy.
if pidof -s pveproxy >/dev/null &&
  curl -sk --max-time 5 https://127.0.0.1:8006/api2/json/nodes >/dev/null; then
  exit 0
fi

# Process present but API unresponsive => "hung"; no process => "dead".
STATE=$(pidof -s pveproxy >/dev/null && echo hung || echo dead)
TS=$(timestamp)
BASEDIR="$LOGDIR/$TS-$STATE"
mkdir -p "$BASEDIR"

echo "[$(date)] Detected pveproxy is $STATE" >>"$BASEDIR/summary.log"

# Everything collected below is best-effort: under `set -e` a failing
# diagnostic (for example the process exiting while /proc is being read)
# would otherwise abort the script before the restart at the bottom.
journalctl -u pveproxy --no-pager -n200 >"$BASEDIR/journal.log" || true

if PID=$(pidof -s pveproxy); then
  # Number of open file descriptors of the selected pveproxy process.
  { ls "/proc/$PID/fd" | wc -l; } >"$BASEDIR/fd_count.txt" || true
  eu-stack --pid="$PID" >"$BASEDIR/stacktrace.log" || true
  cat "/proc/$PID/status" >"$BASEDIR/status.txt" || true
  cat "/proc/$PID/maps" >"$BASEDIR/maps.txt" || true
fi

echo "[$(date)] Restarting pveproxy.service" >>"$BASEDIR/summary.log"
systemctl restart pveproxy.service
EOF
chmod +x /usr/local/sbin/pveproxy-watchdog.sh
Why: collects logs, FD count, stack & maps before auto-restart.
-
Install & start timer
# Timer unit: triggers pveproxy-watchdog.service 1 min after boot and then
# 1 min after each time that service last became inactive.
# NOTE(review): per systemd.timer(5), Persistent= only affects OnCalendar=
# timers; it is kept as in the original but should have no effect here.
cat > /etc/systemd/system/pveproxy-watchdog.timer <<'EOF'
[Unit]
Description=Check pveproxy every minute

[Timer]
OnBootSec=1min
OnUnitInactiveSec=1min
Persistent=true
Unit=pveproxy-watchdog.service

[Install]
WantedBy=timers.target
EOF

# Service unit: a oneshot that runs the watchdog script once per activation.
cat > /etc/systemd/system/pveproxy-watchdog.service <<'EOF'
[Unit]
Description=Hard watchdog for pveproxy
After=pveproxy.service

[Service]
Type=oneshot
ExecStart=/usr/local/sbin/pveproxy-watchdog.sh

[Install]
WantedBy=timers.target
EOF

# Make systemd read the new unit files, enable both units, and start the
# timer immediately.
systemctl daemon-reload
systemctl enable pveproxy-watchdog.service
systemctl enable --now pveproxy-watchdog.timer
Why: ensures the script runs ~1 min after boot and then again ~1 min after each run finishes.
-
Timer active & enabled
# Each check is its own command; the trailing comment is the expected output.
systemctl is-active pveproxy-watchdog.timer  # active
systemctl is-enabled pveproxy-watchdog.timer # enabled
-
Watchdog service enabled
# Confirm the watchdog service unit is enabled; the trailing comment shows the
# expected output.
systemctl is-enabled pveproxy-watchdog.service # enabled
-
Current pveproxy FD limit
# Limit as configured in the unit (including drop-ins).
systemctl show pveproxy.service -p LimitNOFILE
# or, at runtime: the limit actually applied to a running pveproxy process.
grep "Max open files" "/proc/$(pidof -s pveproxy)/limits"
-
Check timer schedule & last run
# Show the timer's next and last trigger times.
systemctl list-timers --all pveproxy-watchdog.timer
# Show recent journal entries for the timer unit. Output of the watchdog
# script itself is logged under pveproxy-watchdog.service instead.
journalctl -u pveproxy-watchdog.timer --since "10 minutes ago"