wallter · April 15, 2026 03:28
diff --git a/gpu-fan-control.py b/gpu-fan-control.py
 #!/usr/bin/env python3
 # SPDX-License-Identifier: MIT
 # Copyright (c) 2026 Tyler Wall
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 """
 gpu-fan-control.py — Userspace GPU fan controller for Blackwell thermal management

 Overrides the GPU's firmware fan controller with a userspace fan curve via NVML.
 Optionally controls motherboard fan headers (hwmon sysfs) for supplemental
 GPU cooling fans (e.g. case fans blowing directly onto the GPU).

 === Why this exists ===

  Blackwell GPU firmware (RTX PRO 6000, RTX 50-series) has a fan controller
  bug: fans stay at ~53% while the GPU reaches 92C (1C from the 93C thermal
  limit), causing GSP firmware crashes (Xid 79/154). Telemetry at 100ms shows
  the firmware's fan target is 30% while actual is 53% — it overrides itself
  but not nearly enough. The GPU crashes mid-fan-ramp because the fan controller
  reacts too late.

  This script takes fan control away from the firmware and applies a
  temperature-proportional curve that keeps temps below the danger zone.

  See: https://github.com/NVIDIA/open-gpu-kernel-modules/issues/1045

  Companion telemetry script: https://gist.github.com/wallter/4c60d1a6607defde7a0fd87709ae9ee0

 === What this replaces ===

  | Temp | Firmware fans | This script (GPU) | This script (external) |
  |------|---------------|-------------------|------------------------|
  | 70C  | ~40%          | 70%               | 180/255 (71%)          |
  | 80C  | ~50%          | 90%               | 240/255 (94%)          |
  | 85C  | ~53%          | 95%               | 255/255 (100%)         |
  | 88C  | ~55% (crash)  | 100% (emergency)  | 255/255 (100%)         |

 Platform: Linux only (NVML for GPU fans, sysfs hwmon for motherboard fans).

 === Install ===

  # 1. Install pynvml (system-wide for systemd)
  sudo /usr/bin/python3 -m pip install --break-system-packages nvidia-ml-py

  # 2. Copy script
  sudo cp gpu-fan-control.py /usr/local/bin/

  # 3. Create systemd service (edit --hwmon-chip and --hwmon-pwm for your board)
  sudo tee /etc/systemd/system/gpu-fan-control.service <<'UNIT'
  [Unit]
  Description=GPU Fan Controller (Blackwell thermal mitigation)
  After=nvidia-persistenced.service
  Wants=nvidia-persistenced.service

  [Service]
  Type=simple
  # Adjust --hwmon-chip for your motherboard's Super I/O chip.
  # Find yours: cat /sys/class/hwmon/hwmon*/name
  # Omit --hwmon-chip entirely if you only want GPU fan control.
  ExecStart=/usr/bin/python3 /usr/local/bin/gpu-fan-control.py --hwmon-chip nct6798 --hwmon-pwm pwm1
  Restart=on-failure
  RestartSec=5

  [Install]
  WantedBy=multi-user.target
  UNIT

  # 4. Enable and start
  sudo systemctl daemon-reload
  sudo systemctl enable --now gpu-fan-control

  # 5. Verify
  sudo systemctl status gpu-fan-control
  sudo kill -USR1 $(pgrep -f gpu-fan-control.py)   # print status

 === Uninstall ===

  sudo systemctl disable --now gpu-fan-control
  sudo rm /etc/systemd/system/gpu-fan-control.service /usr/local/bin/gpu-fan-control.py
  sudo systemctl daemon-reload
  # Fan control returns to GPU firmware automatically on script exit.

 === Arguments ===

  --gpu INDEX            GPU index (default: 0)
  --interval SECONDS     Polling interval (default: 2.0)
  --hwmon-chip NAME      Motherboard Super I/O chip for external fans (optional)
  --hwmon-pwm NAME       PWM sysfs name, e.g. pwm1 (default: pwm1)
  --no-gpu-fans          Don't control GPU built-in fans (hwmon only mode)
  --no-hwmon             Don't control motherboard fans (GPU fans only mode)
  --dry-run              Print actions without changing fan speeds
  --dynamic-tdp          Enable incremental TDP stepping controller.
                         Uses a rolling 2-minute temperature average to make
                         decisions, stepping the power limit in 1W increments
                         with a 30s cooldown between steps (2W/min traverse):

                           Step DOWN  when avg_temp >= 80C AND firmware shows
                                      stress (SwPowerCap >= 75% OR throttle
                                      oscillations >= 30/min).
                           Step UP    when avg_temp <= 77C AND no firmware
                                      stress — gradual recovery.
                           Deadband   between 77C and 80C: no change.

                         Fan boost scales linearly with reduction depth:
                         0% at baseline, +10% at 30W or more below baseline.
                         Reduced TDP plus extra cooling is more effective at
                         preventing crashes than either alone, and the fine
                         1W granularity lets the controller settle at the
                         actual equilibrium power rather than overshooting.
  --version              Show version and exit

 === Signals ===

  SIGUSR1   Print current status to stderr/journal
  SIGHUP    Reserved for future config reload

 === Fan curve ===

  Default curve tuned for RTX PRO 6000 at 600W TDP. Linear interpolation
  between points. Hysteresis on cooling (3C band) prevents oscillation.
  Emergency override at >= 88C bypasses hysteresis.

    Temp (C) | GPU fans (%) | External fans (PWM/255)
    ---------|--------------|------------------------
    <= 30    | 30 (min)     | 50  (~20%)
    40       | 40           | 80  (~31%)
    50       | 50           | 130 (~51%)
    60       | 60           | 200 (~78%)
    65       | 65           | 235 (~92%)
    >= 70    | 70           | 255 (100%) — case fans max first
    75       | 80           | 255 (100%)
    80       | 90           | 255 (100%)
    85       | 95           | 255 (100%)
    >= 88    | 100          | 255 (100%)

  Override via environment variables:
    GPU_FAN_CURVE="30:30,50:50,70:70,80:90,85:100"
    HWMON_FAN_CURVE="30:50,50:110,70:180,80:240,85:255"
  Format: "temp:value,temp:value,..." — values sorted by temp, linearly interpolated.

 === Safety guarantees ===

  1. Failsafe: if NVML becomes unresponsive, all fans go to 100% and any
     dynamic-TDP reduction is reverted (best-effort).
  2. Cleanup: on exit (SIGTERM, SIGINT, crash), restores firmware auto control
     for both GPU fans and hwmon PWM, and restores the original power limit.
  3. Emergency: >= 88C forces 100% regardless of curve or hysteresis.
  4. Min clamp: GPU fans never set below hardware-reported minimum (typically 30%).
  5. TDP clamp: dynamic-TDP reductions never go below the hardware minimum
     (queried via nvmlDeviceGetPowerManagementLimitConstraints).
  6. Lock file: prevents duplicate instances via advisory flock on /run/*.pid.
  7. Dry run: --dry-run previews all actions without touching hardware.
  8. Signal-first init: handlers installed BEFORE the lock, so SIGTERM during
     startup triggers clean shutdown rather than default kill.
  9. Input validation: fan curves parsed from environment are range-checked
     (temp [0-120C], value [0-255]) and malformed entries dropped with warnings.
 10. Hwmon pwm name is validated to match `^pwm\\d+$` before touching sysfs.

 === Design notes ===

  Single-threaded main loop. Signal handlers only set flags (threading.Event).
  All mutable state written from the main thread only. Python GIL guarantees
  atomic reads for print_status() called from the signal handler.

  GPU fans: nvmlDeviceSetFanSpeed_v2 (manual policy via nvmlDeviceSetFanControlPolicy).
  Hwmon fans: sysfs pwm_enable=1 (manual) + pwm write. Original mode saved for restore.

  All shell-free: no subprocess calls. NVML direct for GPU, sysfs direct for hwmon.

 === Posting as a gist ===

  gh gist create --public \
    -d "Userspace GPU fan controller for Blackwell thermal mitigation (RTX PRO 6000 / RTX 50-series)" \
    gpu-fan-control.py
 """

 from __future__ import annotations

 import argparse
 import fcntl
 import os
 import re
 import signal
 import sys
 import threading
 import time
 from collections import deque
 from pathlib import Path
 from typing import IO, Any, Callable, Deque

 PWM_NAME_RE = re.compile(r"^pwm(\d+)$")

 try:
    import pynvml
 except ImportError:
    print(
        "[gpu-fan-control] ERROR: pynvml not found.\n"
        "  Install: sudo /usr/bin/python3 -m pip install --break-system-packages nvidia-ml-py",
        file=sys.stderr,
    )
    sys.exit(1)

 VERSION = "1.9.0"
 LOG_PREFIX = "[gpu-fan-control]"
 LOCK_FILE = Path("/run/gpu-fan-control.pid")

 # NVML clocks_event_reasons bit masks (from nvml.h)
 NVML_CLOCK_EVENT_SW_POWER_CAP = 0x04
 NVML_FAN_POLICY_MANUAL = 1
 NVML_FAN_POLICY_AUTO = 0

 # ---------------------------------------------------------------------------
 # Default fan curves
 # ---------------------------------------------------------------------------
 # Format: list of (temp_C, fan_value) sorted ascending by temp.
 # GPU fans: value is percentage (30-100, clamped to hardware min/max).
 # Hwmon fans: value is PWM duty cycle (0-255).
 # Intermediate temps are linearly interpolated between adjacent points.

 DEFAULT_GPU_FAN_CURVE: list[tuple[int, int]] = [
    (30, 30),   # idle — minimum hardware speed
    (40, 40),
    (50, 50),
    (60, 60),
    (70, 70),   # moderate compute
    (75, 80),   # aggressive ramp begins
    (80, 90),
    (85, 95),   # approaching thermal limits
    (88, 100),  # emergency — full blast, 5C before GPU max (93C)
 ]

 DEFAULT_HWMON_FAN_CURVE: list[tuple[int, int]] = [
    (30, 50),   # ~20% — barely audible idle
    (40, 80),   # ~31%
    (50, 130),  # ~51%
    (60, 200),  # ~78%
    (65, 235),  # ~92%
    (70, 255),  # 100% — case fans at max before GPU fans need to get loud
    (75, 255),
    (80, 255),
    (85, 255),
    (88, 255),  # emergency
 ]

 # Thermal management constants
 HYSTERESIS_C = 3            # Degrees of hysteresis on cooling to prevent oscillation.
 EMERGENCY_TEMP_C = 88       # Override all curves — 100% fans, no hysteresis.
 FAILSAFE_GPU_PCT = 100      # Fan % when NVML is unresponsive.
 FAILSAFE_HWMON_PWM = 255    # PWM when NVML is unresponsive (max cooling).
 STATUS_LOG_INTERVAL_SEC = 300.0  # Log status every 5 minutes for liveness monitoring.
 MAX_CONSECUTIVE_FAILURES = 5     # Log escalating warnings after this many NVML failures.

 # Fan curve input validation ranges
 CURVE_TEMP_MIN_C = 0
 CURVE_TEMP_MAX_C = 120      # GPU thermal_shutdown is ~98C; give headroom for exotic chips.
 CURVE_PWM_MIN = 0
 CURVE_PWM_MAX = 255         # Hwmon PWM is 0-255; GPU % also caps below 255, so one range is fine.

 # Dynamic TDP throttle — detects GSP firmware stress patterns and temporarily
 # reduces the power limit to reduce crash probability. Based on statistical
 # analysis of 5 days of telemetry from a crashing RTX PRO 6000 Blackwell.
 TDP_WINDOW_SEC = 60.0            # Rolling window for precursor detection.
 TDP_SWCAP_RATE_THRESHOLD = 0.75  # Fraction of samples with SwPowerCap to trigger.
 TDP_OSC_RATE_THRESHOLD = 30      # Throttle state changes per minute to trigger.

 # --- Incremental stepping controller for dynamic TDP ---
 # Smooth, bidirectional controller driven by rolling average temperature:
 #   - Step DOWN on rising avg temp + firmware stress precursor.
 #   - Step UP on falling avg temp + no precursor.
 # Steps are small (TDP_STEP_W) to allow gradual, proportional response.
 # Recovery is temperature-driven (not precursor-gated) because a reduced-TDP
 # workload may continue to pin against the new cap and keep the precursor
 # firing, which would otherwise block recovery indefinitely.
 # Fans are boosted while TDP is reduced — cooling is the cheapest mitigation.

 TDP_STEP_W = 1                   # Watts per incremental step (down or up).
 TDP_STEP_COOLDOWN_SEC = 30.0     # Min seconds between step adjustments (2W/min traverse).
 TEMP_AVG_WINDOW_SEC = 120.0      # Rolling window for temperature averaging.
 TDP_STRESS_AVG_TEMP_C = 80       # Avg temp >= this AND precursor → step DOWN.
 TDP_RECOVERY_AVG_TEMP_C = 77     # Avg temp <= this AND no precursor → step UP.
 # Fan boost scales linearly with how far below baseline we've stepped:
 #   0W reduction   → 0% boost
 #   FULL_BOOST_W reduction or more → TDP_REDUCED_FAN_BOOST_PCT boost
 # Without scaling, a 1W reduction would apply the full boost, which is silly.
 TDP_REDUCED_FAN_BOOST_PCT = 10       # Max fan % boost when deeply reduced.
 TDP_FULL_BOOST_REDUCTION_W = 30      # Reduction at which the full boost kicks in.

 # Module-level lock file handle — prevent GC from releasing flock.
 _lock_fh: IO[str] | None = None


 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------

 def log(msg: str) -> None:
    """Write a log line to stderr (captured by journald when run as a service)."""
    print(f"{LOG_PREFIX} {msg}", file=sys.stderr, flush=True)


 def to_str(value: Any) -> str:
    """Coerce bytes or str from NVML into str (pynvml version-portable)."""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return str(value)


 def nvml_safe(func: Callable[..., Any], *args: Any, default: Any = 0) -> Any:
    """Call an NVML function, returning *default* on NVMLError.

    Use for non-critical reads where a default is acceptable (fan speed, power).
    Do NOT use for the temperature read in the main loop — that must raise on
    failure to trigger the failsafe path.
    """
    try:
        return func(*args)
    except pynvml.NVMLError:
        return default


 def parse_fan_curve(spec: str, name: str = "curve") -> list[tuple[int, int]]:
    """Parse 'temp:value,temp:value,...' into a sorted, deduplicated fan curve.

    - Drops malformed pairs with a warning.
    - Rejects temps outside [CURVE_TEMP_MIN_C, CURVE_TEMP_MAX_C].
    - Rejects values outside [CURVE_PWM_MIN, CURVE_PWM_MAX].
    - On duplicate temperatures, the LAST value wins (later entries override
      earlier ones — matches user intent for iterative curve editing).
    - Validates monotonically non-decreasing fan values across the sorted
      curve (fan should not decrease as temperature rises — logs a warning).
    """
    # Use a dict keyed on temp to deduplicate — later values overwrite earlier.
    by_temp: dict[int, int] = {}
    for pair in spec.split(","):
        pair = pair.strip()
        if not pair:
            continue
        parts = pair.split(":")
        if len(parts) != 2:
            log(f"WARNING: ignoring malformed {name} pair: '{pair}' (expected temp:value)")
            continue
        try:
            temp, value = int(parts[0]), int(parts[1])
        except ValueError:
            log(f"WARNING: ignoring non-integer {name} pair: '{pair}'")
            continue
        if not (CURVE_TEMP_MIN_C <= temp <= CURVE_TEMP_MAX_C):
            log(f"WARNING: {name} temperature {temp} outside "
                f"[{CURVE_TEMP_MIN_C}, {CURVE_TEMP_MAX_C}] — ignoring pair '{pair}'")
            continue
        if not (CURVE_PWM_MIN <= value <= CURVE_PWM_MAX):
            log(f"WARNING: {name} value {value} outside "
                f"[{CURVE_PWM_MIN}, {CURVE_PWM_MAX}] — ignoring pair '{pair}'")
            continue
        if temp in by_temp and by_temp[temp] != value:
            log(f"WARNING: {name} has duplicate temp {temp} — "
                f"overriding {by_temp[temp]} with {value}")
        by_temp[temp] = value

    curve = sorted(by_temp.items())
    for i in range(1, len(curve)):
        if curve[i][1] < curve[i - 1][1]:
            log(f"WARNING: {name} has decreasing value at {curve[i][0]}C "
                f"({curve[i][1]} < {curve[i - 1][1]}) — may cause oscillation")
    return curve


 def curve_lookup(curve: list[tuple[int, int]], temp: int) -> int:
    """Look up fan value for a temperature using linear interpolation.

    Returns the interpolated value between the two nearest curve points.
    Clamps to the first/last value for temps outside the curve range.
    Returns the last curve value for an empty curve (fail-safe: use max).
    """
    if not curve:
        log("WARNING: empty fan curve — returning max value")
        return 255  # Safe default: full speed.
    if temp <= curve[0][0]:
        return curve[0][1]
    if temp >= curve[-1][0]:
        return curve[-1][1]
    for i in range(len(curve) - 1):
        t_lo, v_lo = curve[i]
        t_hi, v_hi = curve[i + 1]
        if t_lo <= temp <= t_hi:
            if t_hi == t_lo:
                return v_hi
            frac = (temp - t_lo) / (t_hi - t_lo)
            return int(v_lo + frac * (v_hi - v_lo))
    return curve[-1][1]


 # ---------------------------------------------------------------------------
 # Hwmon fan controller — motherboard fan headers via sysfs
 # ---------------------------------------------------------------------------

 class HwmonFanController:
    """Control motherboard fan headers via Linux sysfs hwmon interface.

    Targets a specific Super I/O chip (e.g. nct6798) and a specific PWM
    output (e.g. pwm1). Saves the original control mode on init and restores
    it on shutdown so BIOS SmartFan resumes when this script exits.
    """

    def __init__(self, chip_name: str, pwm_name: str = "pwm1") -> None:
        self._hwmon_path: Path | None = None
        self._pwm_path: Path | None = None
        self._enable_path: Path | None = None
        self._rpm_path: Path | None = None
        self._original_enable: str = "5"  # Default: BIOS SmartFan mode.
        self._last_pwm: int = -1

        # Validate pwm_name format strictly: must be pwm<N> where N is an integer.
        match = PWM_NAME_RE.match(pwm_name)
        if not match:
            log(f"WARNING: invalid pwm name '{pwm_name}' (expected pwm<N>) — "
                f"external fan control disabled")
            return
        fan_idx = match.group(1)

        # Find the hwmon device by chip name.
        try:
            candidates = sorted(Path("/sys/class/hwmon").iterdir())
        except OSError as exc:
            log(f"WARNING: cannot enumerate /sys/class/hwmon ({exc}) — "
                f"external fan control disabled")
            return

        for hwmon in candidates:
            name_file = hwmon / "name"
            try:
                if name_file.exists() and name_file.read_text().strip() == chip_name:
                    self._hwmon_path = hwmon
                    break
            except OSError:
                continue

        if self._hwmon_path is None:
            log(f"WARNING: hwmon chip '{chip_name}' not found — external fan control disabled")
            return

        self._pwm_path = self._hwmon_path / pwm_name
        self._enable_path = self._hwmon_path / f"{pwm_name}_enable"
        self._rpm_path = self._hwmon_path / f"fan{fan_idx}_input"

        if not self._pwm_path.exists():
            log(f"WARNING: {self._pwm_path} not found — external fan control disabled")
            self._pwm_path = None
            return

        log(f"Hwmon: {chip_name} at {self._hwmon_path}, controlling {pwm_name}")

    @property
    def available(self) -> bool:
        """True if the hwmon PWM path was found and is writable."""
        return self._pwm_path is not None

    def enable_manual(self) -> None:
        """Switch to manual PWM control, saving the original mode for restore."""
        if not self.available or self._enable_path is None:
            return
        try:
            self._original_enable = self._enable_path.read_text().strip()
            self._enable_path.write_text("1")
            log(f"Hwmon: manual mode enabled (was mode {self._original_enable})")
        except OSError as exc:
            log(f"WARNING: could not enable manual hwmon control: {exc}")

    def restore_auto(self) -> None:
        """Restore the original fan control mode saved during enable_manual()."""
        if not self.available or self._enable_path is None:
            return
        try:
            self._enable_path.write_text(self._original_enable)
            # Verify the write took effect.
            actual = self._enable_path.read_text().strip()
            if actual == self._original_enable:
                log(f"Hwmon: restored to mode {self._original_enable}")
            else:
                log(f"WARNING: hwmon restore wrote {self._original_enable} "
                    f"but read back {actual}")
        except OSError as exc:
            log(f"WARNING: could not restore hwmon control: {exc}")

    def set_pwm(self, pwm: int) -> None:
        """Set fan PWM duty cycle (0-255). Skips write if value unchanged."""
        if not self.available or self._pwm_path is None:
            return
        pwm = max(0, min(255, pwm))
        if pwm == self._last_pwm:
            return
        try:
            self._pwm_path.write_text(str(pwm))
            self._last_pwm = pwm
        except OSError:
            pass  # Transient sysfs write failures are not fatal.

    def get_rpm(self) -> int:
        """Read current fan RPM from sysfs. Returns 0 on failure."""
        if self._rpm_path is None:
            return 0
        try:
            return int(self._rpm_path.read_text().strip())
        except (OSError, ValueError):
            return 0


 # ---------------------------------------------------------------------------
 # GPU fan controller — NVIDIA GPU fans via NVML
 # ---------------------------------------------------------------------------

 class GPUFanController:
    """Control GPU built-in fans via the NVML API.

    Switches fans to manual mode on init, sets speeds via
    nvmlDeviceSetFanSpeed_v2, and restores automatic firmware control on
    shutdown. Clamps fan speed to the hardware-reported minimum.
    """

    def __init__(self, gpu_index: int = 0) -> None:
        self._handle: Any = None
        self._num_fans: int = 0
        self._gpu_name: str = ""
        self._gpu_index = gpu_index
        self._min_fan_pct: int = 30  # Default; updated from hardware in init().
        self._max_fan_pct: int = 100
        self._last_speeds: list[int] = []
        self._manual_mode: bool = False

    def init(self) -> None:
        """Initialize NVML, detect GPU and fans, read hardware limits."""
        pynvml.nvmlInit()
        self._handle = pynvml.nvmlDeviceGetHandleByIndex(self._gpu_index)
        self._num_fans = int(nvml_safe(pynvml.nvmlDeviceGetNumFans, self._handle, default=0))
        self._gpu_name = to_str(nvml_safe(pynvml.nvmlDeviceGetName, self._handle, default="Unknown"))
        self._last_speeds = [-1] * self._num_fans

        # Read hardware min/max fan speed.
        min_max = nvml_safe(pynvml.nvmlDeviceGetMinMaxFanSpeed, self._handle,
                            default=[self._min_fan_pct, self._max_fan_pct])
        if isinstance(min_max, (list, tuple)) and len(min_max) >= 2:
            self._min_fan_pct = int(min_max[0])
            self._max_fan_pct = int(min_max[1])

        driver = to_str(nvml_safe(pynvml.nvmlSystemGetDriverVersion, default="Unknown"))
        log(f"GPU: {self._gpu_name} | driver: {driver}")
        log(f"GPU fans: {self._num_fans} detected, range {self._min_fan_pct}-{self._max_fan_pct}%")

    @property
    def num_fans(self) -> int:
        return self._num_fans

    @property
    def min_speed(self) -> int:
        """Hardware-reported minimum fan speed percentage."""
        return self._min_fan_pct

    @property
    def max_speed(self) -> int:
        """Hardware-reported maximum fan speed percentage."""
        return self._max_fan_pct

    def get_temp(self) -> int:
        """Read GPU die temperature in Celsius.

        Raises pynvml.NVMLError on failure — callers must handle this to
        trigger the failsafe path. Do NOT wrap in nvml_safe.
        """
        return int(pynvml.nvmlDeviceGetTemperature(
            self._handle, pynvml.NVML_TEMPERATURE_GPU
        ))

    def get_power(self) -> float:
        """Read GPU power draw in watts. Returns 0 on failure."""
        return int(nvml_safe(pynvml.nvmlDeviceGetPowerUsage, self._handle)) / 1000.0

    def get_fan_speeds(self) -> list[int]:
        """Read actual fan speed percentages from hardware."""
        return [
            int(nvml_safe(pynvml.nvmlDeviceGetFanSpeed_v2, self._handle, i))
            for i in range(self._num_fans)
        ]

    def enable_manual(self) -> None:
        """Set all GPU fans to manual (userspace-controlled) mode."""
        for i in range(self._num_fans):
            try:
                pynvml.nvmlDeviceSetFanControlPolicy(
                    self._handle, i, NVML_FAN_POLICY_MANUAL,
                )
            except pynvml.NVMLError as exc:
                log(f"WARNING: could not set GPU fan {i} to manual: {exc}")
        self._manual_mode = True
        log(f"GPU fans: manual mode enabled ({self._num_fans} fans)")

    def restore_auto(self) -> None:
        """Restore all GPU fans to automatic firmware control.

        Called on clean shutdown. Safe to call multiple times.
        """
        if not self._manual_mode:
            return
        for i in range(self._num_fans):
            try:
                pynvml.nvmlDeviceSetFanControlPolicy(
                    self._handle, i, NVML_FAN_POLICY_AUTO,
                )
            except pynvml.NVMLError:
                pass
            try:
                pynvml.nvmlDeviceSetDefaultFanSpeed_v2(self._handle, i)
            except pynvml.NVMLError:
                pass
        self._manual_mode = False
        log("GPU fans: auto mode restored")

    def set_speed(self, pct: int, quiet: bool = False) -> None:
        """Set all GPU fans to a speed percentage.

        Clamps to hardware min/max. Skips NVML write if value unchanged.
        If *quiet*, suppress per-fan failure warnings (used by failsafe path
        where NVML is already known to be unresponsive and logging N warnings
        per sample would flood the journal).
        """
        pct = max(self._min_fan_pct, min(self._max_fan_pct, pct))
        for i in range(self._num_fans):
            if pct == self._last_speeds[i]:
                continue
            try:
                pynvml.nvmlDeviceSetFanSpeed_v2(self._handle, i, pct)
                self._last_speeds[i] = pct
            except pynvml.NVMLError as exc:
                if not quiet:
                    log(f"WARNING: could not set GPU fan {i} to {pct}%: {exc}")

    # --- Public accessors for orchestrator (avoid private handle access) ---

    def get_power_limit(self) -> int:
        """Return current NVML power limit in milliwatts. Raises on failure."""
        return int(pynvml.nvmlDeviceGetPowerManagementLimit(self._handle))

    def get_power_limit_constraints(self) -> tuple[int, int]:
        """Return (min_mw, max_mw) allowed power limits for this GPU."""
        lo, hi = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self._handle)
        return int(lo), int(hi)

    def set_power_limit(self, limit_mw: int) -> None:
        """Set GPU power limit in milliwatts. Raises on failure."""
        pynvml.nvmlDeviceSetPowerManagementLimit(self._handle, limit_mw)

    def get_throttle_bitmask(self) -> int:
        """Return the clocks_event_reasons bitmask. Returns 0 on failure."""
        return int(nvml_safe(
            pynvml.nvmlDeviceGetCurrentClocksEventReasons, self._handle, default=0,
        ))

    def shutdown_nvml(self) -> None:
        """Restore auto fan control and shut down NVML cleanly."""
        self.restore_auto()
        try:
            pynvml.nvmlShutdown()
        except pynvml.NVMLError as exc:
            log(f"WARNING: nvmlShutdown failed: {exc}")


 # ---------------------------------------------------------------------------
 # Fan controller orchestrator
 # ---------------------------------------------------------------------------

 class FanController:
    """Orchestrates GPU and hwmon fan control based on GPU temperature.

    Architecture:
        - Single-threaded main loop (no background threads).
        - Signal handler only sets threading.Event flags.
        - All hardware writes happen from the main thread.

    Lifecycle:
        controller = FanController(...)
        controller.run()  # blocks until shutdown.set()
        # run() handles init, control loop, and cleanup internally.
    """

    def __init__(
        self,
        gpu_index: int = 0,
        interval: float = 2.0,
        gpu_curve: list[tuple[int, int]] | None = None,
        hwmon_curve: list[tuple[int, int]] | None = None,
        hwmon_chip: str = "",
        hwmon_pwm: str = "pwm1",
        control_gpu: bool = True,
        control_hwmon: bool = True,
        dry_run: bool = False,
        dynamic_tdp: bool = False,
    ) -> None:
        self._interval = interval
        self._gpu_curve = gpu_curve or DEFAULT_GPU_FAN_CURVE
        self._hwmon_curve = hwmon_curve or DEFAULT_HWMON_FAN_CURVE
        self._control_gpu = control_gpu
        self._dry_run = dry_run
        self._dynamic_tdp = dynamic_tdp

        # GPU fan controller is always constructed; whether it's actively used
        # depends on `_control_gpu`. NVML init happens later in run().
        self._gpu = GPUFanController(gpu_index)

        # Hwmon is optional: only construct if both requested AND chip is given.
        # _control_hwmon is derived from actual availability so downstream code
        # can use a single check.
        self._hwmon: HwmonFanController | None = None
        if control_hwmon and hwmon_chip:
            candidate = HwmonFanController(hwmon_chip, hwmon_pwm)
            if candidate.available:
                self._hwmon = candidate
        self._control_hwmon: bool = self._hwmon is not None

        # Shutdown coordination — threading.Event is safe for signal handlers.
        self.shutdown = threading.Event()

        # Statistics (written from main thread only).
        self._start_time = time.monotonic()
        self._prev_temp: int | None = None         # None until first successful sample.
        self._total_adjustments: int = 0
        self._consecutive_nvml_failures: int = 0
        self._peak_temp: int = 0
        self._peak_power: float = 0.0
        self._last_status_time: float = 0.0

        # Dynamic TDP state (only used when dynamic_tdp=True).
        # Rolling windows of (monotonic_ts, value) for throttle analysis.
        self._swcap_history: Deque[tuple[float, bool]] = deque()
        self._throttle_transitions: Deque[tuple[float, str]] = deque()
        # Rolling temperature window for averaging.
        self._temp_history: Deque[tuple[float, int]] = deque()
        self._prev_throttle_str: str | None = None  # None until first sample.

        # Power-limit state (NVML-managed).
        self._original_tdp_mw: int | None = None    # None = not yet initialized.
        self._min_tdp_mw: int = 0                   # Hardware lower bound (from NVML).
        self._current_tdp_mw: int = 0

        # Step controller state.
        self._last_tdp_step_time: float = 0.0       # For TDP_STEP_COOLDOWN_SEC gating.
        self._tdp_last_update: float = 0.0          # For reduced-time accounting.
        self._total_tdp_steps_down: int = 0
        self._total_tdp_steps_up: int = 0
        self._total_tdp_seconds: float = 0.0        # Cumulative time spent reduced.

    # ------------------------------------------------------------------
    # Fan curve logic
    # ------------------------------------------------------------------

    def _apply_hysteresis(self, temp: int, target: int,
                          curve: list[tuple[int, int]]) -> int:
        """On cooling, keep the higher fan speed within the hysteresis band.

        Prevents fan oscillation when the temperature hovers near a curve point.
        Bypassed in emergency (>= EMERGENCY_TEMP_C) — respond immediately.
        Caller must ensure self._prev_temp is not None.
        """
        assert self._prev_temp is not None
        if temp >= EMERGENCY_TEMP_C:
            return target
        if temp >= self._prev_temp:
            return target  # Temperature rising — no delay.
        # Temperature falling — check hysteresis band.
        hysteresis_target = curve_lookup(curve, temp + HYSTERESIS_C)
        return max(target, hysteresis_target)

    def _compute_fan_speeds(self, temp: int) -> tuple[int, int]:
        """Compute GPU fan % and hwmon PWM for a given temperature.

        When dynamic TDP has actively reduced the power limit, boost the GPU
        fan speed by TDP_REDUCED_FAN_BOOST_PCT — we're trying to cool the card
        harder, and fan headroom is the cheapest mitigation we have.
        Hwmon fans already tend to hit 100% by 70C in the default curve, so
        the boost mainly affects the GPU built-in fans.
        """
        gpu_pct = curve_lookup(self._gpu_curve, temp)
        hwmon_pwm = curve_lookup(self._hwmon_curve, temp)

        # Apply hysteresis only once we have a previous temperature to compare.
        if self._prev_temp is not None:
            gpu_pct = self._apply_hysteresis(temp, gpu_pct, self._gpu_curve)
            hwmon_pwm = self._apply_hysteresis(temp, hwmon_pwm, self._hwmon_curve)

        # Dynamic TDP fan boost: if we've stepped the limit down, run fans
        # harder to help drive avg temp back below the recovery threshold.
        # Boost scales linearly with reduction depth: 0% at baseline, full
        # TDP_REDUCED_FAN_BOOST_PCT at TDP_FULL_BOOST_REDUCTION_W below. With
        # 1W steps, a flat-on-any-reduction boost would fire at 1W below
        # baseline and cost noise/wear for no real thermal need.
        if (self._dynamic_tdp
                and self._original_tdp_mw is not None
                and self._current_tdp_mw < self._original_tdp_mw):
            reduction_w = (self._original_tdp_mw - self._current_tdp_mw) / 1000
            boost_pct = int(min(
                TDP_REDUCED_FAN_BOOST_PCT,
                TDP_REDUCED_FAN_BOOST_PCT * reduction_w / TDP_FULL_BOOST_REDUCTION_W,
            ))
            gpu_pct += boost_pct
            hwmon_pwm = min(255, hwmon_pwm + boost_pct * 255 // 100)

        # Clamp GPU fan to hardware [min, max]. The max clamp matters when the
        # boost pushes the curve value above 100% — without it, prev_gpu_pct
        # in the main loop caches the unclamped value and the log line would
        # print e.g. "GPU 110%", while set_speed() silently clamps to 100.
        gpu_pct = max(self._gpu.min_speed, min(self._gpu.max_speed, gpu_pct))

        return gpu_pct, hwmon_pwm

    # ------------------------------------------------------------------
    # Dynamic TDP (precursor-based power limit throttling)
    # ------------------------------------------------------------------

    def _update_precursor_windows(self, throttle_str: str) -> None:
        """Append the current throttle state to the rolling windows.

        Evicts entries older than TDP_WINDOW_SEC. Tracks:
          - Every sample (for SwPowerCap rate).
          - Transitions only (for oscillation rate). The first sample is NOT
            counted as a transition (we have no prior state to compare against).
        """
        now = time.monotonic()
        cutoff = now - TDP_WINDOW_SEC

        self._swcap_history.append((now, throttle_str == "SwPowerCap"))
        # Only count a transition if we have a prior state AND it actually changed.
        if self._prev_throttle_str is None:
            self._prev_throttle_str = throttle_str  # First sample: prime, don't count.
        elif throttle_str != self._prev_throttle_str:
            self._throttle_transitions.append((now, throttle_str))
            self._prev_throttle_str = throttle_str

        while self._swcap_history and self._swcap_history[0][0] < cutoff:
            self._swcap_history.popleft()
        while self._throttle_transitions and self._throttle_transitions[0][0] < cutoff:
            self._throttle_transitions.popleft()

    def _precursor_detected(self) -> tuple[bool, str]:
        """Return (fired, reason) if GSP stress patterns exceed thresholds.

        Triggers (either):
          - SwPowerCap sample rate >= TDP_SWCAP_RATE_THRESHOLD
          - Throttle transition rate >= TDP_OSC_RATE_THRESHOLD / minute
        """
        if len(self._swcap_history) < 10:
            return False, ""  # Insufficient data for statistical signal.

        swcap_count = sum(1 for _, v in self._swcap_history if v)
        swcap_rate = swcap_count / len(self._swcap_history)
        osc_per_min = len(self._throttle_transitions) * 60.0 / TDP_WINDOW_SEC

        if swcap_rate >= TDP_SWCAP_RATE_THRESHOLD:
            return True, f"SwPowerCap rate {swcap_rate * 100:.0f}% over {TDP_WINDOW_SEC:.0f}s"
        if osc_per_min >= TDP_OSC_RATE_THRESHOLD:
            return True, f"Throttle oscillations {osc_per_min:.0f}/min"
        return False, ""

    def _init_dynamic_tdp(self) -> bool:
        """One-time: read baseline power limit and NVML constraints.

        Returns True on success. Disables dynamic TDP on any error.
        """
        try:
            self._original_tdp_mw = self._gpu.get_power_limit()
            self._current_tdp_mw = self._original_tdp_mw
        except pynvml.NVMLError as exc:
            log(f"ERROR: could not read power limit, disabling dynamic TDP: {exc}")
            self._dynamic_tdp = False
            return False
        try:
            self._min_tdp_mw, _ = self._gpu.get_power_limit_constraints()
        except pynvml.NVMLError:
            # Constraints query is optional; fall back to a safe floor.
            self._min_tdp_mw = max(100_000, self._original_tdp_mw // 4)
        log(f"Dynamic TDP: baseline={self._original_tdp_mw / 1000:.0f}W "
            f"(hardware min={self._min_tdp_mw / 1000:.0f}W)")
        self._tdp_last_update = time.monotonic()
        return True

    def _apply_tdp(self, limit_mw: int, label: str) -> bool:
        """Apply a power limit. Returns True on success."""
        try:
            self._gpu.set_power_limit(limit_mw)
            self._current_tdp_mw = limit_mw
            return True
        except pynvml.NVMLError as exc:
            log(f"WARNING: could not {label} power limit to {limit_mw / 1000:.0f}W: {exc}")
            return False

    def _update_temp_window(self, temp: int) -> None:
        """Append *temp* to the rolling window and evict entries older than
        TEMP_AVG_WINDOW_SEC. Caller maintains timing.
        """
        now = time.monotonic()
        cutoff = now - TEMP_AVG_WINDOW_SEC
        self._temp_history.append((now, temp))
        while self._temp_history and self._temp_history[0][0] < cutoff:
            self._temp_history.popleft()

    def _avg_temp(self) -> float:
        """Return rolling mean temperature. 0 if no samples yet."""
        if not self._temp_history:
            return 0.0
        return sum(t for _, t in self._temp_history) / len(self._temp_history)

    def _maybe_adjust_tdp(self, temp: int) -> None:
        """Incrementally step the TDP limit based on rolling-average temperature.

        Behaviour:

          - Step DOWN by TDP_STEP_W when:
                avg_temp >= TDP_STRESS_AVG_TEMP_C AND precursor firing
                AND cooldown elapsed AND above hardware min.
          - Step UP by TDP_STEP_W when:
                avg_temp <= TDP_RECOVERY_AVG_TEMP_C AND precursor NOT firing
                AND cooldown elapsed AND below original baseline.
          - Between RECOVERY and STRESS temps: no change (deadband).

        The deadband between RECOVERY_TEMP and STRESS_TEMP prevents oscillation.
        Cooldown between steps prevents rapid thrashing within a single workload
        transient. Recovery is temperature-gated rather than precursor-gated
        because a workload throttled to a lower cap may continue pinning against
        that cap and keeping the precursor active, which would otherwise block
        recovery indefinitely.
        """
        if not self._dynamic_tdp or self._dry_run:
            return
        if self._original_tdp_mw is None:
            if not self._init_dynamic_tdp():
                return
        assert self._original_tdp_mw is not None

        now = time.monotonic()
        self._update_temp_window(temp)
        avg_temp = self._avg_temp()
        is_reduced = self._current_tdp_mw < self._original_tdp_mw

        # Accumulate time spent reduced using actual elapsed time.
        if is_reduced and self._tdp_last_update > 0:
            self._total_tdp_seconds += (now - self._tdp_last_update)
        self._tdp_last_update = now

        # Don't evaluate until we have a meaningful average.
        if len(self._temp_history) < 10:
            return

        # Enforce cooldown between adjustments.
        if now - self._last_tdp_step_time < TDP_STEP_COOLDOWN_SEC:
            return

        fired, reason = self._precursor_detected()

        # --- STEP DOWN: avg temp in stress zone AND firmware stressed ---
        if fired and avg_temp >= TDP_STRESS_AVG_TEMP_C:
            target_mw = max(
                self._min_tdp_mw,
                self._current_tdp_mw - TDP_STEP_W * 1000,
            )
            if target_mw >= self._current_tdp_mw:
                return  # Already at the floor.
            if self._apply_tdp(target_mw, "step down"):
                self._last_tdp_step_time = now
                self._total_tdp_steps_down += 1
                log(f"STEP DOWN: {reason}, avg_temp={avg_temp:.1f}C → "
                    f"TDP {self._current_tdp_mw / 1000:.0f}W "
                    f"(step #{self._total_tdp_steps_down} down)")
            return

        # --- STEP UP: avg temp in recovery zone AND no stress ---
        if not fired and avg_temp <= TDP_RECOVERY_AVG_TEMP_C and is_reduced:
            target_mw = min(
                self._original_tdp_mw,
                self._current_tdp_mw + TDP_STEP_W * 1000,
            )
            if target_mw <= self._current_tdp_mw:
                return  # Already at baseline.
            if self._apply_tdp(target_mw, "step up"):
                self._last_tdp_step_time = now
                self._total_tdp_steps_up += 1
                log(f"STEP UP: avg_temp={avg_temp:.1f}C, no precursor → "
                    f"TDP {self._current_tdp_mw / 1000:.0f}W "
                    f"(step #{self._total_tdp_steps_up} up)")

    def _restore_tdp(self) -> None:
        """Restore the original power limit. Safe to call multiple times."""
        if (
            self._original_tdp_mw is not None
            and self._current_tdp_mw != self._original_tdp_mw
        ):
            if self._apply_tdp(self._original_tdp_mw, "restore"):
                log(f"Dynamic TDP: restored to original {self._original_tdp_mw / 1000:.0f}W")

    # ------------------------------------------------------------------
    # Failsafe
    # ------------------------------------------------------------------

    def _apply_failsafe(self) -> None:
        """Set all controllable fans to 100% and restore TDP.

        Called when NVML is unresponsive. Conservative: maximize cooling and
        remove any TDP restriction we may have applied (the firmware will
        handle its own power limits once we're out of the way).

        Logging is gated on _consecutive_nvml_failures to avoid flooding the
        journal — we log on the first few failures and on recovery, but stay
        silent during sustained outages (fan writes are still attempted every
        iteration to catch transient recovery).
        """
        # Only log the "FAILSAFE: ..." banner while we're still under the
        # warning threshold. The NVML-read-failure path already escalates;
        # this keeps the failsafe log in sync with it.
        verbose = self._consecutive_nvml_failures <= MAX_CONSECUTIVE_FAILURES
        if verbose:
            log("FAILSAFE: NVML failed — setting all fans to maximum")

        if self._control_gpu and not self._dry_run:
            self._gpu.set_speed(FAILSAFE_GPU_PCT, quiet=True)
        if self._control_hwmon and self._hwmon and not self._dry_run:
            self._hwmon.set_pwm(FAILSAFE_HWMON_PWM)

        # Best-effort TDP restore so we don't leave the card throttled if we
        # die here. Silent during sustained failure to avoid log flood.
        if (self._dynamic_tdp and not self._dry_run
                and self._original_tdp_mw is not None
                and self._current_tdp_mw != self._original_tdp_mw):
            try:
                self._gpu.set_power_limit(self._original_tdp_mw)
                self._current_tdp_mw = self._original_tdp_mw
                if verbose:
                    log("FAILSAFE: TDP restored to baseline")
            except pynvml.NVMLError as exc:
                if verbose:
                    log(f"FAILSAFE: TDP restore also failed: {exc}")

    # ------------------------------------------------------------------
    # Status
    # ------------------------------------------------------------------

    def print_status(self) -> None:
        """Print a one-line status summary. Safe from signal handler (GIL).

        Best-effort: if NVML is unresponsive (e.g. during a GPU crash), shows
        cached values rather than crashing the status print itself.
        """
        elapsed = int(time.monotonic() - self._start_time)
        h, remainder = divmod(elapsed, 3600)
        m = remainder // 60
        try:
            temp = self._gpu.get_temp()
            speeds = self._gpu.get_fan_speeds()
        except pynvml.NVMLError:
            temp = 0
            speeds = []
        hwmon_rpm = self._hwmon.get_rpm() if self._hwmon else 0
        tdp_info = ""
        if self._dynamic_tdp and self._original_tdp_mw is not None:
            current_w = self._current_tdp_mw / 1000
            state = "REDUCED" if self._current_tdp_mw < self._original_tdp_mw else "normal"
            avg_t = self._avg_temp()
            avg_str = f"{avg_t:.1f}C" if avg_t > 0 else "—"
            tdp_info = (f" | TDP={current_w:.0f}W ({state}, avg={avg_str}) | "
                        f"steps: {self._total_tdp_steps_down}↓/{self._total_tdp_steps_up}↑ "
                        f"({self._total_tdp_seconds:.0f}s reduced)")
        log(
            f"Status: {h}h{m}m | {temp}C | "
            f"GPU fans={speeds} | hwmon={hwmon_rpm}RPM | "
            f"adj={self._total_adjustments} | "
            f"peak: {self._peak_temp}C/{self._peak_power:.0f}W{tdp_info}"
        )

    def _maybe_log_status(self) -> None:
        """Log periodic status for liveness monitoring."""
        now = time.monotonic()
        if now - self._last_status_time >= STATUS_LOG_INTERVAL_SEC:
            self._last_status_time = now
            self.print_status()

    # ------------------------------------------------------------------
    # Main loop
    # ------------------------------------------------------------------

    def run(self) -> None:
        """Initialize hardware, run the fan control loop, clean up on exit.

        Ordering is critical for safety:

        1. NVML init happens first (may raise → clean exit, nothing to undo).
        2. Validation of num_fans (may exit cleanly, only NVML to shut down).
        3. enable_manual and the main loop live inside a single try/finally
           so that the cleanup path ALWAYS runs once we've changed hardware
           state — even if a signal arrives mid-log, mid-loop, or mid-ramp.

        The cleanup path in the finally block is idempotent (restore_auto
        guards against already-restored state via _manual_mode flag).
        """
        # Phase 1: NVML init. No hardware state changed yet — safe to exit on
        # failure without running cleanup.
        try:
            self._gpu.init()
        except pynvml.NVMLError as exc:
            log(f"ERROR: NVML init failed: {exc}")
            sys.exit(1)

        # Phase 2: sanity check. Only NVML is initialized; clean shutdown
        # covers it.
        if self._control_gpu and self._gpu.num_fans == 0:
            log("ERROR: --no-gpu-fans not set but NVML reports 0 fans. "
                "Nothing to control. Exiting. (Use --no-gpu-fans to suppress.)")
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError:
                pass
            sys.exit(1)

        # Phase 3: hardware state changes and main loop — protected by
        # try/finally so cleanup runs under any exit condition.
        prev_gpu_pct = -1
        prev_hwmon_pwm = -1

        try:
            # Switch to manual control. MUST be inside the try so that a
            # signal arriving between enable_manual and the loop body is
            # still caught by the finally block and hardware is restored.
            if self._control_gpu and not self._dry_run:
                self._gpu.enable_manual()
            if self._control_hwmon and self._hwmon and not self._dry_run:
                self._hwmon.enable_manual()

            log(f"v{VERSION} started | interval={self._interval}s | "
                f"gpu_fans={'ON' if self._control_gpu else 'OFF'} | "
                f"hwmon={'ON' if self._control_hwmon else 'OFF'} | "
                f"dry_run={self._dry_run} | "
                f"dynamic_tdp={'ON' if self._dynamic_tdp else 'OFF'} | "
                f"emergency={EMERGENCY_TEMP_C}C | hysteresis={HYSTERESIS_C}C")
            if self._dynamic_tdp:
                log(f"Dynamic TDP (stepping controller): "
                    f"step down {TDP_STEP_W}W when avg_temp>={TDP_STRESS_AVG_TEMP_C}C AND "
                    f"precursor (SwPowerCap>{TDP_SWCAP_RATE_THRESHOLD * 100:.0f}% "
                    f"OR osc>{TDP_OSC_RATE_THRESHOLD}/min); step up when "
                    f"avg_temp<={TDP_RECOVERY_AVG_TEMP_C}C AND no precursor. "
                    f"Cooldown {TDP_STEP_COOLDOWN_SEC:.0f}s between steps. "
                    f"Temp avg window {TEMP_AVG_WINDOW_SEC:.0f}s. "
                    f"Fan boost +{TDP_REDUCED_FAN_BOOST_PCT}% while reduced.")
            log(f"GPU curve: {self._gpu_curve}")
            if self._control_hwmon:
                log(f"Hwmon curve: {self._hwmon_curve}")

            while not self.shutdown.is_set():
                # --- Read temperature ---
                try:
                    temp = self._gpu.get_temp()  # Raises on NVML failure.
                    power = self._gpu.get_power()

                    # Reset failure counter on success.
                    if self._consecutive_nvml_failures > 0:
                        log(f"NVML recovered after {self._consecutive_nvml_failures} failures")
                    self._consecutive_nvml_failures = 0

                except pynvml.NVMLError as exc:
                    self._consecutive_nvml_failures += 1
                    if self._consecutive_nvml_failures <= MAX_CONSECUTIVE_FAILURES:
                        log(f"NVML read failed ({self._consecutive_nvml_failures}x): {exc}")
                    elif self._consecutive_nvml_failures == MAX_CONSECUTIVE_FAILURES + 1:
                        log(f"NVML still failing — suppressing further warnings")
                    self._apply_failsafe()
                    self.shutdown.wait(self._interval)
                    continue

                # --- Track peaks ---
                if temp > self._peak_temp:
                    self._peak_temp = temp
                if power > self._peak_power:
                    self._peak_power = power

                # --- Dynamic TDP: read throttle, update windows, maybe adjust ---
                if self._dynamic_tdp:
                    throttle_bitmask = self._gpu.get_throttle_bitmask()
                    # SwPowerCap is the single most correlated signal with crashes
                    # from 5 days of telemetry. Other bits tracked via bitmask state
                    # (any transition counts toward oscillation rate).
                    if throttle_bitmask & NVML_CLOCK_EVENT_SW_POWER_CAP:
                        throttle_str = "SwPowerCap"
                    elif throttle_bitmask != 0:
                        throttle_str = f"0x{throttle_bitmask:x}"
                    else:
                        throttle_str = "None"
                    self._update_precursor_windows(throttle_str)
                    self._maybe_adjust_tdp(temp)

                # --- Compute target fan speeds ---
                gpu_pct, hwmon_pwm = self._compute_fan_speeds(temp)

                # --- Apply ---
                changed = False

                if gpu_pct != prev_gpu_pct:
                    if self._dry_run:
                        log(f"DRY RUN: {temp}C → GPU fans {gpu_pct}%")
                    elif self._control_gpu:
                        self._gpu.set_speed(gpu_pct)
                    prev_gpu_pct = gpu_pct
                    changed = True

                if hwmon_pwm != prev_hwmon_pwm:
                    if self._dry_run:
                        log(f"DRY RUN: {temp}C → hwmon PWM {hwmon_pwm}/255")
                    elif self._control_hwmon and self._hwmon:
                        self._hwmon.set_pwm(hwmon_pwm)
                    prev_hwmon_pwm = hwmon_pwm
                    changed = True

                if changed:
                    self._total_adjustments += 1
                    speeds = self._gpu.get_fan_speeds()
                    hwmon_rpm = self._hwmon.get_rpm() if self._hwmon else 0
                    log(f"{temp}C {power:.0f}W → GPU {gpu_pct}% (actual {speeds}) "
                        f"| hwmon PWM {hwmon_pwm}/255 ({hwmon_rpm}RPM)")

                self._prev_temp = temp
                self._maybe_log_status()
                self.shutdown.wait(self._interval)

        except KeyboardInterrupt:
            pass
        finally:
            self.shutdown.set()
            log("Shutting down — restoring automatic fan control...")
            # Print final status BEFORE touching NVML — status reads need the
            # library initialized. After shutdown_nvml() the reads will fail.
            self.print_status()
            if not self._dry_run:
                # Restore TDP first (needs live NVML handle).
                if self._dynamic_tdp:
                    self._restore_tdp()
                # NVML was inited unconditionally in Phase 1 (temperature reads
                # need it even in hwmon-only mode), so shut it down
                # unconditionally here to keep the symmetry. restore_auto() is
                # a no-op if we never enabled manual mode (guarded by the
                # _manual_mode flag inside GPUFanController), so this is safe
                # under --no-gpu-fans as well.
                self._gpu.shutdown_nvml()
                if self._control_hwmon and self._hwmon:
                    self._hwmon.restore_auto()
            log("Stopped")


 # ---------------------------------------------------------------------------
 # Lock file
 # ---------------------------------------------------------------------------

 def _acquire_lock() -> None:
    """Prevent duplicate instances via advisory flock.

    The lock is held for the lifetime of the process (module-level _lock_fh
    keeps the file descriptor open). flock is released automatically by the
    kernel when the process exits, even on crashes. The PID is written for
    diagnostic purposes only — it is NOT used for lock detection.

    Open ordering matters: we open in append mode (no truncation), try to
    acquire flock, and ONLY truncate the file once we own the lock. Opening
    in "w" mode would truncate a concurrent holder's PID even if we lose
    the flock race. The append-then-truncate pattern preserves the original
    PID in the file if another instance already holds the lock.
    """
    global _lock_fh
    try:
        LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
        # Append mode: does not truncate the file on open. Creates if absent.
        _lock_fh = open(LOCK_FILE, "a+")  # noqa: SIM115 — kept open intentionally.
    except OSError as exc:
        log(f"ERROR: cannot open {LOCK_FILE}: {exc}")
        sys.exit(1)
    try:
        fcntl.flock(_lock_fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        # Another instance holds the lock — DO NOT modify the file.
        log(f"ERROR: another instance is already running (lock held on {LOCK_FILE})")
        _lock_fh.close()
        _lock_fh = None
        sys.exit(1)
    # We own the lock. Safe to truncate and write our PID.
    try:
        _lock_fh.seek(0)
        _lock_fh.truncate()
        _lock_fh.write(f"{os.getpid()}\n")
        _lock_fh.flush()
        try:
            os.fsync(_lock_fh.fileno())
        except OSError:
            pass  # fsync failure on tmpfs is not critical.
    except OSError as exc:
        # Writing the PID is diagnostic; lock itself is held.
        log(f"WARNING: could not write PID to {LOCK_FILE}: {exc}")


 # ---------------------------------------------------------------------------
 # Entry point
 # ---------------------------------------------------------------------------

 def main() -> None:
    parser = argparse.ArgumentParser(
        description="Userspace GPU fan controller for Blackwell thermal mitigation",
    )
    parser.add_argument("--gpu", type=int, default=0,
                        help="GPU index (default: 0)")
    parser.add_argument("--interval", type=float, default=2.0,
                        help="Polling interval in seconds (default: 2.0)")
    parser.add_argument("--hwmon-chip", type=str, default="",
                        help="Motherboard hwmon chip for external fans (e.g. nct6798)")
    parser.add_argument("--hwmon-pwm", type=str, default="pwm1",
                        help="Hwmon PWM sysfs name (default: pwm1)")
    parser.add_argument("--no-gpu-fans", action="store_true",
                        help="Don't control GPU built-in fans (hwmon only)")
    parser.add_argument("--no-hwmon", action="store_true",
                        help="Don't control motherboard fans (GPU fans only)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print actions without changing fan speeds")
    parser.add_argument("--dynamic-tdp", action="store_true",
                        help="Enable dynamic TDP stepping controller: "
                             f"steps power limit by {TDP_STEP_W}W (down under stress, "
                             "up during recovery) based on rolling temperature average "
                             "and firmware stress precursors. Also boosts fan speeds "
                             f"by {TDP_REDUCED_FAN_BOOST_PCT}%% while TDP is reduced.")
    parser.add_argument("--version", action="version",
                        version=f"gpu-fan-control.py {VERSION}")
    args = parser.parse_args()

    if args.interval <= 0:
        log(f"ERROR: interval must be positive (got: {args.interval})")
        sys.exit(1)

    if args.no_gpu_fans and args.no_hwmon:
        log("ERROR: --no-gpu-fans and --no-hwmon together leave nothing to control.")
        sys.exit(1)

    # Startup sanity: the deadband must have RECOVERY < STRESS, otherwise the
    # controller would immediately oscillate. Guard against a future edit of
    # the constants that accidentally inverts the relation.
    assert TDP_RECOVERY_AVG_TEMP_C < TDP_STRESS_AVG_TEMP_C, (
        f"TDP_RECOVERY_AVG_TEMP_C ({TDP_RECOVERY_AVG_TEMP_C}) must be less than "
        f"TDP_STRESS_AVG_TEMP_C ({TDP_STRESS_AVG_TEMP_C}) to define a valid deadband"
    )

    # Parse optional custom fan curves from environment variables. An empty
    # parse result means every entry was malformed — warn loudly rather than
    # silently falling back to the built-in default.
    gpu_curve = None
    hwmon_curve = None
    env_gpu = os.environ.get("GPU_FAN_CURVE", "")
    env_hwmon = os.environ.get("HWMON_FAN_CURVE", "")
    if env_gpu:
        gpu_curve = parse_fan_curve(env_gpu, "GPU_FAN_CURVE")
        if gpu_curve:
            log(f"Custom GPU curve from env: {gpu_curve}")
        else:
            log("WARNING: GPU_FAN_CURVE parsed to empty — using built-in default")
            gpu_curve = None
    if env_hwmon:
        hwmon_curve = parse_fan_curve(env_hwmon, "HWMON_FAN_CURVE")
        if hwmon_curve:
            log(f"Custom hwmon curve from env: {hwmon_curve}")
        else:
            log("WARNING: HWMON_FAN_CURVE parsed to empty — using built-in default")
            hwmon_curve = None

    controller = FanController(
        gpu_index=args.gpu,
        interval=args.interval,
        gpu_curve=gpu_curve,
        hwmon_curve=hwmon_curve,
        hwmon_chip=args.hwmon_chip,
        hwmon_pwm=args.hwmon_pwm,
        control_gpu=not args.no_gpu_fans,
        control_hwmon=not args.no_hwmon,
        dry_run=args.dry_run,
        dynamic_tdp=args.dynamic_tdp,
    )

    def _handle_signal(signum: int, _: Any) -> None:
        if signum == signal.SIGUSR1:
            controller.print_status()
        elif signum == signal.SIGHUP:
            log("SIGHUP received (no-op — reserved for future config reload)")
        else:
            controller.shutdown.set()

    # Install signal handlers BEFORE acquiring the lock so that a SIGTERM
    # arriving during lock acquisition triggers a clean shutdown instead of
    # the default kill action (which would leave the lock held until the
    # kernel releases it).
    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)
    signal.signal(signal.SIGUSR1, _handle_signal)
    signal.signal(signal.SIGHUP, _handle_signal)

    _acquire_lock()

    controller.run()


 if __name__ == "__main__":
    main()
No results found