#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (c) 2026 Tyler Wall
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
| """ | |
| gpu-telemetry.py — NVML GPU telemetry logger for diagnosing Blackwell GSP crashes | |
| Captures temp (all sensors), power, clocks, per-fan speed + target + policy, | |
| throttle reason bitmask, PCIe link state, and GPU events every 100ms. | |
| Auto-detects fan spikes, power limit breaches, fan-temperature inversions, | |
| and spurious throttle events. Uses NVML directly (not nvidia-smi) for lower | |
| overhead and deeper visibility into firmware behavior. | |
| Designed for NVIDIA RTX PRO 6000 / RTX 50-series Blackwell GPUs experiencing | |
| GSP firmware crashes (Xid 62/79/119/154). See: | |
| https://github.com/NVIDIA/open-gpu-kernel-modules/issues/1045 | |
| Platform: Linux only (requires fcntl, dmesg, lspci, setpci for crash snapshots). | |
| === Install === | |
| # 1. Install pynvml (system-wide for systemd) | |
| sudo /usr/bin/python3 -m pip install --break-system-packages nvidia-ml-py | |
| # 2. Copy script | |
| sudo cp gpu-telemetry.py /usr/local/bin/ | |
| # 3. Create systemd service | |
| sudo tee /etc/systemd/system/gpu-telemetry.service <<'UNIT' | |
| [Unit] | |
| Description=GPU Telemetry Logger (Blackwell crash diagnostics) | |
| After=nvidia-persistenced.service | |
| Wants=nvidia-persistenced.service | |
| [Service] | |
| Type=simple | |
| ExecStart=/usr/bin/python3 /usr/local/bin/gpu-telemetry.py --interval 0.1 --log-dir /var/log/gpu-telemetry | |
| Restart=on-failure | |
| RestartSec=10 | |
| [Install] | |
| WantedBy=multi-user.target | |
| UNIT | |
| # 4. Enable and start | |
| sudo systemctl daemon-reload | |
| sudo systemctl enable --now gpu-telemetry | |
| # 5. Verify | |
| sudo systemctl status gpu-telemetry | |
| tail -5 /var/log/gpu-telemetry/gpu-deep-$(date +%Y-%m-%d).csv | |
| === Uninstall === | |
| sudo systemctl disable --now gpu-telemetry | |
| sudo rm /etc/systemd/system/gpu-telemetry.service /usr/local/bin/gpu-telemetry.py | |
| sudo systemctl daemon-reload | |
| # Optionally: sudo rm -rf /var/log/gpu-telemetry | |
| === Arguments === | |
| --interval SECONDS Polling interval in seconds (default: 0.1) | |
| --log-dir PATH Log directory (default: /var/log/gpu-telemetry) | |
| --spike-threshold PCT Fan %/sample change to flag as spike (default: 15) | |
| --gpu INDEX GPU index for multi-GPU systems (default: 0) | |
| --version Show version and exit | |
| === Signals === | |
| SIGUSR1 Print status to stderr/journal | |
| SIGHUP Reopen log files (for external rotation) | |
| === Output files === | |
| gpu-deep-*.csv Telemetry CSV, daily rotation, 14-day retention | |
| events.log GPU events from NVML interrupt API, rotated at 50MB | |
| summary.log Periodic stats every 5 min, rotated at 10MB | |
| system-info.txt One-time hardware/driver snapshot (refreshed on driver change) | |
| crashes/*.log Auto-captured on NVML failure (full dmesg + PCIe state) | |
| spikes/*.log Auto-captured on fan spike (10s cooldown between snapshots) | |
| === CSV columns === | |
| timestamp, gpu_temp, gpu_tlimit, power_instant, power_limit, | |
| pstate, gpu_clock, mem_clock, gpu_util, mem_util, vram_used_mb, | |
| pcie_gen, pcie_width, | |
| throttle_bitmask, throttle_reasons, | |
| fan{N}_speed_pct, fan{N}_target_pct, (repeated per fan) | |
| sensor0_temp, sensor0_target, sensor0_controller, | |
| flags | |
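A quick way to pull only the anomalous rows back out of a day's CSV (a sketch,
not part of this script; it assumes only the stdlib csv module and the column
names above, with 'flags' as the last column written by the header row):

```python
import csv

def flagged_rows(path):
    # Yield (timestamp, flags) for every row whose 'flags' column is non-empty.
    with open(path, newline="") as fh:
        for row in csv.DictReader(fh):
            if row.get("flags"):
                yield row["timestamp"], row["flags"]
```

Usage: for ts, fl in flagged_rows("/var/log/gpu-telemetry/gpu-deep-2026-04-14.csv")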
=== Anomaly flags (in the 'flags' CSV column) ===

    FAN_SPIKE              Fan speed change >= spike-threshold %/sample
    TEMP_FAN_INV(TT@FF%)   Temp >= 85C but fans <= 55% — fan controller unresponsive
    SPUR_THROT(PP%ofLim)   SwPowerCap throttle at < 25% of power limit — GSP glitch
    PWR+NNW                Power exceeds limit by > 50W — limit enforcement failure
    TDP_REDUCED(WWW W)     Power limit was lowered between samples (external mitigation)
    TDP_RESTORED(WWW W)    Power limit was raised between samples (external mitigation)

=== Rate-limited events ===

    Fan spike snapshots: cooldown SPIKE_COOLDOWN_SEC (10s) between spike-*.log files.
    Crash snapshots:     cooldown CRASH_COOLDOWN_SEC (60s) between crash-*.log files
                         — Xid event floods produce 30-50+ events per crash; one
                         snapshot per crash is what matters.
    Suppressed crashes:  counted and shown in status; the first 3 suppressions
                         per crash are logged so the burst is visible.

=== Summary.log rate metrics ===

    Each summary entry includes both cumulative totals and per-window rates
    (per-minute, computed over the actual elapsed window). This makes
    acceleration visible:

        window=300s samples=... pwr_breaches=74692 (rate=14.8/min) ...

    Period counters reset each summary cycle.
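The per-window rate above reduces to one normalization step (an illustrative
sketch; the names here are not the script's internal counter names):

```python
def per_minute_rate(period_count, window_sec):
    # Events accumulated over the window, normalized to events per minute.
    if window_sec <= 0:
        return 0.0  # First cycle: no elapsed window yet.
    return period_count * 60.0 / window_sec

# 74 power-limit breaches over a 300s window -> 14.8/min
```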
=== Design notes ===

    Threading: one main thread (polling + CSV), one daemon thread (NVML events).
    Shutdown coordinated via threading.Event. Counters are written only from the
    main thread; the signal handler and event thread only read them or set the
    shutdown flag. Python's GIL ensures atomic int reads for print_status().

    All shell commands in run_cmd() are hardcoded string literals — no user input
    flows into shell=True. Used only in crash/spike snapshots (not hot path).

    Each NVML call is wrapped in nvml_safe() so a single field failure doesn't
    lose the entire sample — the row is written with defaults for failed fields.
"""
from __future__ import annotations

import argparse
import csv
import fcntl
import os
import shutil
import signal
import subprocess
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import IO, Any, Callable

try:
    import pynvml
except ImportError:
    print(
        "[gpu-telemetry] ERROR: pynvml not found.\n"
        "  Install: sudo /usr/bin/python3 -m pip install --break-system-packages nvidia-ml-py",
        file=sys.stderr,
    )
    sys.exit(1)

VERSION = "2.5.0"
LOCK_FILE = Path("/run/gpu-telemetry.pid")
LOG_PREFIX = "[gpu-telemetry]"

# Defaults (overridable via args or constants)
DEFAULT_INTERVAL = 0.1
DEFAULT_LOG_DIR = "/var/log/gpu-telemetry"
DEFAULT_SPIKE_THRESHOLD = 15
MIN_DISK_MB = 500
SPIKE_COOLDOWN_SEC = 10.0
CRASH_COOLDOWN_SEC = 60.0  # Minimum seconds between crash snapshots —
                           # Xid floods produce dozens of events per crash.
DISK_CHECK_INTERVAL_SEC = 30.0
LOG_RETENTION_DAYS = 14
EVENTS_LOG_MAX_BYTES = 50 * 1024 * 1024   # 50 MB
SUMMARY_LOG_MAX_BYTES = 10 * 1024 * 1024  # 10 MB
SUMMARY_INTERVAL_SEC = 300.0  # 5 minutes — also used as rate-metric window.

# Anomaly detection thresholds
FAN_TEMP_INVERSION_TEMP = 85   # flag if temp >= this AND fan% <= threshold below
FAN_TEMP_INVERSION_FAN = 55    # fan% ceiling for inversion detection
SPURIOUS_THROTTLE_POWER_PCT = 0.25  # flag SwPowerCap if power < 25% of limit
ANOMALY_LOG_COOLDOWN_SEC = 60.0  # max once per 60s per anomaly type to stderr

# TDP change detection thresholds
TDP_CHANGE_EPSILON_W = 5  # Power limit differences under this are treated as noise.

# Module-level lock file handle — prevent GC from releasing flock.
_lock_fh: IO[str] | None = None
# ---------------------------------------------------------------------------
# Throttle reason bitmask (from nvml.h nvmlClocksEventReasons)
# ---------------------------------------------------------------------------
THROTTLE_REASONS: dict[int, str] = {
    0x0000_0000_0000_0001: "GpuIdle",
    0x0000_0000_0000_0002: "AppClocks",
    0x0000_0000_0000_0004: "SwPowerCap",
    0x0000_0000_0000_0008: "HwSlowdown",
    0x0000_0000_0000_0010: "SyncBoost",
    0x0000_0000_0000_0020: "SwThermalSlowdown",
    0x0000_0000_0000_0040: "HwThermalSlowdown",
    0x0000_0000_0000_0080: "HwPowerBrakeSlowdown",
    0x0000_0000_0000_0100: "DisplayClocks",
    0x0000_0000_0000_0200: "UserDefinedClocks",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def log(msg: str) -> None:
    """Write a log line to stderr (captured by journald when run as a service)."""
    print(f"{LOG_PREFIX} {msg}", file=sys.stderr, flush=True)


def decode_throttle(bitmask: int) -> str:
    """Decode NVML clocks_event_reasons bitmask to a human-readable pipe-delimited string."""
    if bitmask == 0:
        return "None"
    reasons = [name for bit, name in THROTTLE_REASONS.items() if bitmask & bit]
    return "|".join(reasons) if reasons else f"Unknown(0x{bitmask:x})"


def nvml_safe(func: Callable[..., Any], *args: Any, default: Any = 0) -> Any:
    """Call an NVML function, returning *default* on any NVMLError.

    This allows each field in a sample to fail independently without losing
    the entire row. The caller sees the default and the CSV gets a zero/empty
    value for that field rather than an exception.
    """
    try:
        return func(*args)
    except pynvml.NVMLError:
        return default


def to_str(value: Any) -> str:
    """Coerce bytes or str from NVML into str.

    pynvml returns bytes on some versions and str on others for the same call.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return str(value)


def tail_lines(filepath: Path, n: int = 50) -> list[str]:
    """Read the last *n* complete lines without loading the entire file.

    Seeks to an estimated offset near EOF, reads forward, then discards the
    first (potentially partial) line to avoid returning a truncated CSV row.
    """
    try:
        with open(filepath, "rb") as f:
            f.seek(0, 2)
            size = f.tell()
            if size == 0:
                return []
            read_size = min(size, n * 300)
            start = max(0, size - read_size)
            f.seek(start)
            chunk = f.read().decode("utf-8", errors="replace")
        lines = chunk.splitlines()
        if start > 0 and lines:
            lines = lines[1:]  # Discard partial first line.
        return lines[-n:]
    except OSError:
        return ["(could not read log)"]


def run_cmd(cmd: str, timeout: int = 5) -> str:
    """Run a shell command, return combined stdout+stderr.

    All commands passed to this function are hardcoded string literals defined
    in this file — no user input or external data flows into the shell.
    """
    try:
        result = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout,
        )
        return result.stdout + result.stderr
    except subprocess.TimeoutExpired:
        return f"(timed out after {timeout}s: {cmd})\n"
    except OSError as exc:
        return f"(failed: {cmd}: {exc})\n"


def check_disk_space(path: Path) -> bool:
    """Return False if free space on *path*'s filesystem is below MIN_DISK_MB."""
    try:
        return shutil.disk_usage(path).free // (1024 * 1024) >= MIN_DISK_MB
    except OSError:
        return True  # Can't check → assume OK; let writes fail naturally.


def extract_pci_short(pci_bus_id: str) -> str:
    """Extract 'BB:DD.F' from '00000000:01:00.0' for lspci/setpci."""
    parts = pci_bus_id.split(":")
    if len(parts) >= 3:
        return f"{parts[-2]}:{parts[-1]}"
    return pci_bus_id


def _rotate_file(path: Path, max_bytes: int) -> None:
    """Rotate *path* to path.1 if it exceeds *max_bytes*."""
    try:
        if path.exists() and path.stat().st_size > max_bytes:
            rotated = path.with_suffix(path.suffix + ".1")
            path.rename(rotated)
    except OSError:
        pass
# ---------------------------------------------------------------------------
# Snapshot header — shared between spike, crash, and Xid snapshots
# ---------------------------------------------------------------------------
def _write_snapshot_header(
    f: IO[str],
    *,
    title: str,
    gpu_name: str,
    gpu_pci: str,
    error: str = "",
    uptime_sec: int = 0,
    total_samples: int = 0,
    peak_temp: int = 0,
    peak_power: float = 0.0,
    event_number: int = 0,
    total_power_breaches: int = 0,
    total_fan_inversions: int = 0,
    total_spurious_throttles: int = 0,
) -> None:
    """Write the common header block for any diagnostic snapshot."""
    f.write(f"=== {title} ===\n")
    f.write(f"Time: {datetime.now(timezone.utc).isoformat()}\n")
    f.write(f"Script: gpu-telemetry.py v{VERSION}\n")
    f.write(f"GPU: {gpu_name} ({gpu_pci})\n")
    if error:
        f.write(f"Error: {error}\n")
    f.write(f"Uptime: {uptime_sec}s, {total_samples} samples\n")
    f.write(f"Peaks: {peak_temp}C / {peak_power:.0f}W\n")
    f.write(f"Event #: {event_number}\n")
    f.write(f"Pwr breach:{total_power_breaches} "
            f"Fan inv:{total_fan_inversions} "
            f"Spur throt:{total_spurious_throttles}\n\n")
# ---------------------------------------------------------------------------
# GPUMonitor
# ---------------------------------------------------------------------------
class GPUMonitor:
    """Main telemetry collector.

    Lifecycle::

        monitor = GPUMonitor(...)
        monitor.run()  # blocks until shutdown.set()

    Threading model:
      - Main thread: _sample() → CSV writes, anomaly detection, snapshots.
      - Daemon thread: _run_event_listener() → NVML event subscription.
      - Signal handler: sets shutdown event or calls print_status().

    All mutable counters are written exclusively from the main thread.
    The signal handler only reads them (Python's GIL guarantees atomic int
    reads) or sets the threading.Event.
    """

    def __init__(
        self,
        log_dir: str,
        interval: float,
        fan_spike_threshold: int = DEFAULT_SPIKE_THRESHOLD,
        gpu_index: int = 0,
    ) -> None:
        self.log_dir = Path(log_dir)
        self.interval = interval
        self.fan_spike_threshold = fan_spike_threshold
        self.gpu_index = gpu_index
        # NVML state (populated by _init_nvml)
        self._handle: Any = None
        self._num_fans: int = 0
        self._gpu_name: str = ""
        self._gpu_pci: str = ""
        self._gpu_pci_short: str = "01:00.0"
        # Shutdown coordination
        self.shutdown = threading.Event()
        self._start_time = time.monotonic()
        # Counters (main thread writes, signal handler reads)
        self._total_samples: int = 0
        self._total_spikes: int = 0
        self._total_crashes: int = 0
        self._total_power_breaches: int = 0
        self._total_fan_inversions: int = 0
        self._total_spurious_throttles: int = 0
        self._total_tdp_changes: int = 0
        self._consecutive_failures: int = 0
        self._peak_temp: int = 0
        self._peak_power: float = 0.0
        # Per-period counters for rolling-rate reporting. Reset each summary cycle.
        self._period_samples: int = 0
        self._period_power_breaches: int = 0
        self._period_fan_inversions: int = 0
        self._period_spurious_throttles: int = 0
        self._period_tdp_changes: int = 0
        # Timing state
        self._disk_paused: bool = False
        self._last_disk_check: float = 0.0
        self._last_fan_inversion_log: float = 0.0
        self._last_spurious_throttle_log: float = 0.0
        self._last_summary_time: float = 0.0
        self._last_spike_time: float = 0.0
        self._last_crash_time: float = 0.0  # Cooldown for crash snapshots.
        self._crashes_suppressed: int = 0   # Count of cooldown-suppressed snapshots.
        # Per-sample state
        self._prev_fan_speeds: list[int] = []
        self._prev_throttle: int = 0
        self._prev_power_limit_w: float | None = None  # For TDP change detection.
        # SIGHUP flag
        self._reopen_logs: bool = False
        # File handles
        self._csv_fh: IO[str] | None = None
        self._csv_writer: Any = None
        self._csv_date: str = ""
        self._csv_path: Path | None = None
        self._csv_columns: int = 0
        self._events_fh: IO[str] | None = None

    # ------------------------------------------------------------------
    # NVML initialization
    # ------------------------------------------------------------------
    def _init_nvml(self) -> None:
        """Initialize NVML and cache static GPU properties."""
        pynvml.nvmlInit()
        self._handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)
        self._num_fans = int(nvml_safe(pynvml.nvmlDeviceGetNumFans, self._handle, default=0))
        self._prev_fan_speeds = [0] * self._num_fans
        self._gpu_name = to_str(nvml_safe(pynvml.nvmlDeviceGetName, self._handle, default="Unknown"))
        driver = to_str(nvml_safe(pynvml.nvmlSystemGetDriverVersion, default="Unknown"))
        try:
            pci_info = pynvml.nvmlDeviceGetPciInfo(self._handle)
            self._gpu_pci = to_str(pci_info.busId if hasattr(pci_info, "busId") else pci_info)
            self._gpu_pci_short = extract_pci_short(self._gpu_pci)
        except pynvml.NVMLError:
            self._gpu_pci = "unknown"
        power_limit_w = int(nvml_safe(
            pynvml.nvmlDeviceGetPowerManagementLimit, self._handle, default=0
        )) / 1000.0
        log(f"v{VERSION} | GPU: {self._gpu_name} | PCI: {self._gpu_pci}")
        log(f"Driver: {driver} | Fans: {self._num_fans} | Power limit: {power_limit_w:.0f}W")
        thresholds: dict[str, Any] = {}
        for label, idx in [("shutdown", 0), ("slowdown", 1), ("gpu_max", 3)]:
            thresholds[label] = nvml_safe(
                pynvml.nvmlDeviceGetTemperatureThreshold, self._handle, idx, default="N/A",
            )
        log(f"Thresholds: {thresholds}")
    # ------------------------------------------------------------------
    # System info
    # ------------------------------------------------------------------
    def _log_system_info(self) -> None:
        """Write a one-time hardware snapshot. Refreshed if the driver version changes."""
        info_file = self.log_dir / "system-info.txt"
        driver = to_str(nvml_safe(pynvml.nvmlSystemGetDriverVersion, default=""))
        try:
            if info_file.exists() and driver and driver in info_file.read_text():
                return
        except OSError:
            pass
        try:
            lines = [
                "=== GPU Telemetry — System Info ===",
                f"Generated: {datetime.now(timezone.utc).isoformat()}",
                f"Script: gpu-telemetry.py v{VERSION}",
                "",
                "--- GPU ---",
                f"  Name: {self._gpu_name}",
                f"  PCI: {self._gpu_pci}",
                f"  Serial: {to_str(nvml_safe(pynvml.nvmlDeviceGetSerial, self._handle, default='N/A'))}",
                f"  VBIOS: {to_str(nvml_safe(pynvml.nvmlDeviceGetVbiosVersion, self._handle, default='N/A'))}",
                f"  Driver: {driver}",
                f"  Fans: {self._num_fans}",
                "",
                "--- Thermal thresholds ---",
            ]
            for label, idx in [("shutdown", 0), ("slowdown", 1), ("gpu_max", 3)]:
                val = nvml_safe(pynvml.nvmlDeviceGetTemperatureThreshold, self._handle, idx, default="N/A")
                lines.append(f"  {label}: {val}{'C' if isinstance(val, int) else ''}")
            lines.extend([
                "",
                "--- OS ---",
                f"  {run_cmd('uname -a').strip()}",
                f"  {run_cmd('lsb_release -d 2>/dev/null || head -3 /etc/os-release 2>/dev/null').strip()}",
                "",
                "--- CPU ---",
                f"  {run_cmd('grep -m1 model.name /proc/cpuinfo 2>/dev/null').strip() or 'N/A'}",
                "",
                "--- Motherboard ---",
                f"  {run_cmd('cat /sys/devices/virtual/dmi/id/board_vendor 2>/dev/null').strip() or 'N/A'}",
                f"  {run_cmd('cat /sys/devices/virtual/dmi/id/board_name 2>/dev/null').strip() or 'N/A'}",
                "",
                "--- NVIDIA kernel modules ---",
                run_cmd("lsmod 2>/dev/null | grep -i nvidia").strip() or "N/A",
            ])
            info_file.write_text("\n".join(lines) + "\n")
            log(f"System info: {info_file}")
        except OSError as exc:
            log(f"WARNING: system info write failed: {exc}")
    # ------------------------------------------------------------------
    # CSV management
    # ------------------------------------------------------------------
    def _get_csv_fields(self) -> list[str]:
        fields = [
            "timestamp", "gpu_temp", "gpu_tlimit",
            "power_instant", "power_limit",
            "pstate", "gpu_clock", "mem_clock",
            "gpu_util", "mem_util", "vram_used_mb",
            "pcie_gen", "pcie_width",
            "throttle_bitmask", "throttle_reasons",
        ]
        for i in range(self._num_fans):
            fields.extend([f"fan{i}_speed_pct", f"fan{i}_target_pct"])
        fields.extend(["sensor0_temp", "sensor0_target", "sensor0_controller"])
        fields.append("flags")
        return fields

    def _rotate_csv(self) -> None:
        """Open today's CSV file. Close the previous day's. Prune old files."""
        today = datetime.now().strftime("%Y-%m-%d")
        if today == self._csv_date and not self._reopen_logs:
            return
        self._close_csv()
        self._csv_date = today
        filepath = self.log_dir / f"gpu-deep-{today}.csv"
        is_new = not filepath.exists() or filepath.stat().st_size == 0
        self._csv_path = filepath
        self._csv_fh = open(filepath, "a", newline="", buffering=1)  # noqa: SIM115
        self._csv_writer = csv.writer(self._csv_fh)
        fields = self._get_csv_fields()
        self._csv_columns = len(fields)
        if is_new:
            self._csv_writer.writerow(fields)
            log(f"New daily log: {filepath}")
        cutoff = time.time() - (LOG_RETENTION_DAYS * 86400)
        for f in self.log_dir.glob("gpu-deep-????-??-??.csv"):
            try:
                if f.stat().st_mtime < cutoff:
                    f.unlink()
            except OSError:
                pass
        self._reopen_logs = False

    def _close_csv(self) -> None:
        if self._csv_fh and not self._csv_fh.closed:
            self._csv_fh.close()
        self._csv_fh = None
        self._csv_writer = None

    def _flush_csv(self) -> None:
        """Flush CSV to disk. Called before crash snapshots so data is persisted."""
        if self._csv_fh and not self._csv_fh.closed:
            self._csv_fh.flush()

    # ------------------------------------------------------------------
    # Events log
    # ------------------------------------------------------------------
    def _write_event(self, line: str) -> None:
        """Append to events.log with size-based rotation."""
        try:
            events_path = self.log_dir / "events.log"
            if self._events_fh and not self._events_fh.closed:
                try:
                    if self._events_fh.tell() > EVENTS_LOG_MAX_BYTES:
                        self._events_fh.close()
                        _rotate_file(events_path, EVENTS_LOG_MAX_BYTES)
                        self._events_fh = None
                except OSError:
                    pass
            if self._events_fh is None or self._events_fh.closed or self._reopen_logs:
                self._events_fh = open(events_path, "a", buffering=1)  # noqa: SIM115
            self._events_fh.write(line + "\n")
        except OSError:
            pass
    # ------------------------------------------------------------------
    # Sampling — decomposed for readability
    # ------------------------------------------------------------------
    def _read_thermal(self) -> tuple[int, int | str]:
        """Read GPU temperature and distance to the thermal limit."""
        gpu_temp = int(nvml_safe(pynvml.nvmlDeviceGetTemperature,
                                 self._handle, pynvml.NVML_TEMPERATURE_GPU))
        gpu_max = int(nvml_safe(pynvml.nvmlDeviceGetTemperatureThreshold,
                                self._handle, 3, default=0))
        gpu_tlimit: int | str = (gpu_max - gpu_temp) if gpu_max else ""
        return gpu_temp, gpu_tlimit

    def _read_power(self) -> tuple[int, float, float]:
        """Read power usage (mW, W) and the current power limit (W)."""
        power_mw = int(nvml_safe(pynvml.nvmlDeviceGetPowerUsage, self._handle))
        power_w = power_mw / 1000.0
        # Re-read the current limit each sample — it can change at runtime via nvidia-smi -pl.
        power_limit_mw = int(nvml_safe(
            pynvml.nvmlDeviceGetPowerManagementLimit, self._handle, default=0
        ))
        power_limit_w = power_limit_mw / 1000.0
        return power_mw, power_w, power_limit_w

    def _read_clocks_and_util(self) -> tuple[int, int, int, int, int, int]:
        """Read P-state, clocks, utilization, and VRAM usage."""
        h = self._handle
        pstate = int(nvml_safe(pynvml.nvmlDeviceGetPerformanceState, h))
        gpu_clock = int(nvml_safe(pynvml.nvmlDeviceGetClockInfo, h, pynvml.NVML_CLOCK_GRAPHICS))
        mem_clock = int(nvml_safe(pynvml.nvmlDeviceGetClockInfo, h, pynvml.NVML_CLOCK_MEM))
        util = nvml_safe(pynvml.nvmlDeviceGetUtilizationRates, h, default=None)
        gpu_util = util.gpu if util else 0
        mem_util = util.memory if util else 0
        mem_info = nvml_safe(pynvml.nvmlDeviceGetMemoryInfo, h, default=None)
        vram_used_mb = int(mem_info.used) // (1024 * 1024) if mem_info else 0
        return pstate, gpu_clock, mem_clock, gpu_util, mem_util, vram_used_mb

    def _read_fans(self) -> tuple[list[int], bool]:
        """Read per-fan speed and target. Returns (fan_data, spike_detected)."""
        fan_data: list[int] = []
        spike_detected = False
        for i in range(self._num_fans):
            speed = int(nvml_safe(pynvml.nvmlDeviceGetFanSpeed_v2, self._handle, i))
            target = int(nvml_safe(pynvml.nvmlDeviceGetTargetFanSpeed, self._handle, i))
            fan_data.extend([speed, target])
            if self._total_samples > 2:
                if abs(speed - self._prev_fan_speeds[i]) >= self.fan_spike_threshold:
                    spike_detected = True
            self._prev_fan_speeds[i] = speed
        return fan_data, spike_detected

    def _read_thermal_sensors(self) -> tuple[int, int, int]:
        """Read the extended thermal sensor array (sensor 0)."""
        sensors = nvml_safe(pynvml.nvmlDeviceGetThermalSettings, self._handle, 0, default=[])
        s0 = sensors[0] if sensors else None
        return (
            s0.currentTemp if s0 else 0,
            s0.target if s0 else 0,
            s0.controller if s0 else 0,
        )

    def _check_anomalies(
        self, gpu_temp: int, power_mw: int, power_w: float,
        power_limit_mw: int, throttle: int, max_fan_speed: int, flags: list[str],
    ) -> None:
        """Run anomaly detectors and append flags."""
        now = time.monotonic()
        # Fan-temperature inversion: high temp + low fans.
        if (gpu_temp >= FAN_TEMP_INVERSION_TEMP
                and max_fan_speed <= FAN_TEMP_INVERSION_FAN
                and self._total_samples > 10):
            self._total_fan_inversions += 1
            self._period_fan_inversions += 1
            flags.append(f"TEMP_FAN_INV({gpu_temp}C@{max_fan_speed}%)")
            if now - self._last_fan_inversion_log >= ANOMALY_LOG_COOLDOWN_SEC:
                self._last_fan_inversion_log = now
                log(f"ANOMALY: {gpu_temp}C but fans at {max_fan_speed}% "
                    f"(expected >{FAN_TEMP_INVERSION_FAN}% above {FAN_TEMP_INVERSION_TEMP}C)")
        # Spurious SwPowerCap at low power.
        if (throttle & 0x04
                and power_limit_mw > 0
                and 0 < power_mw < power_limit_mw * SPURIOUS_THROTTLE_POWER_PCT
                and self._total_samples > 10):
            self._total_spurious_throttles += 1
            self._period_spurious_throttles += 1
            pct = (power_mw / power_limit_mw) * 100
            flags.append(f"SPUR_THROT({pct:.0f}%ofLim)")
            if now - self._last_spurious_throttle_log >= ANOMALY_LOG_COOLDOWN_SEC:
                self._last_spurious_throttle_log = now
                log(f"ANOMALY: SwPowerCap at {power_w:.0f}W "
                    f"({pct:.0f}% of {power_limit_mw / 1000:.0f}W limit)")
        # Power limit breach.
        if power_limit_mw > 0 and power_mw > power_limit_mw:
            self._total_power_breaches += 1
            self._period_power_breaches += 1
            overage_w = (power_mw - power_limit_mw) / 1000.0
            if overage_w > 50:
                flags.append(f"PWR+{overage_w:.0f}W")
    def _sample(self) -> None:
        """Collect one telemetry sample, run anomaly detection, write the CSV row."""
        ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S.%f")[:-3]
        flags: list[str] = []
        gpu_temp, gpu_tlimit = self._read_thermal()
        power_mw, power_w, power_limit_w = self._read_power()
        power_limit_mw = int(power_limit_w * 1000)
        pstate, gpu_clock, mem_clock, gpu_util, mem_util, vram_used_mb = self._read_clocks_and_util()
        pcie_gen = int(nvml_safe(pynvml.nvmlDeviceGetCurrPcieLinkGeneration, self._handle))
        pcie_width = int(nvml_safe(pynvml.nvmlDeviceGetCurrPcieLinkWidth, self._handle))
        # Throttle reasons with transition logging.
        throttle = int(nvml_safe(pynvml.nvmlDeviceGetCurrentClocksEventReasons, self._handle))
        throttle_str = decode_throttle(throttle)
        if throttle != self._prev_throttle and self._total_samples > 0:
            log(f"Throttle: {decode_throttle(self._prev_throttle)} → {throttle_str}")
        self._prev_throttle = throttle
        # Power limit change detection — catches external adjustments (e.g.
        # gpu-fan-control.py --dynamic-tdp reducing/restoring the limit).
        # Noise-floored by TDP_CHANGE_EPSILON_W to avoid flapping on rounding.
        if self._prev_power_limit_w is not None:
            delta_w = power_limit_w - self._prev_power_limit_w
            if abs(delta_w) >= TDP_CHANGE_EPSILON_W:
                direction = "RESTORED" if delta_w > 0 else "REDUCED"
                tag = f"TDP_{direction}({power_limit_w:.0f}W)"
                flags.append(tag)
                self._total_tdp_changes += 1
                self._period_tdp_changes += 1
                log(f"TDP CHANGE: {self._prev_power_limit_w:.0f}W → {power_limit_w:.0f}W "
                    f"({delta_w:+.0f}W)")
                self._write_event(
                    f"{datetime.now().isoformat()} type=TDP_CHANGE "
                    f"from={self._prev_power_limit_w:.0f}W to={power_limit_w:.0f}W "
                    f"delta={delta_w:+.0f}W"
                )
        self._prev_power_limit_w = power_limit_w
        fan_data, spike_detected = self._read_fans()
        if spike_detected:
            flags.append("FAN_SPIKE")
        # Max fan speed across all fans (fan_data is [speed0, target0, speed1, target1, ...]).
        max_fan_speed = max((fan_data[i * 2] for i in range(self._num_fans)), default=0)
        self._check_anomalies(gpu_temp, power_mw, power_w, power_limit_mw,
                              throttle, max_fan_speed, flags)
        s0_temp, s0_target, s0_ctrl = self._read_thermal_sensors()
        # Peaks.
        if gpu_temp > self._peak_temp:
            self._peak_temp = gpu_temp
        if power_w > self._peak_power:
            self._peak_power = power_w
        # Build and write the CSV row.
        row: list[Any] = [
            ts, gpu_temp, gpu_tlimit,
            f"{power_w:.2f}", f"{power_limit_w:.0f}",
            pstate, gpu_clock, mem_clock,
            gpu_util, mem_util, vram_used_mb,
            pcie_gen, pcie_width,
            f"0x{throttle:04x}", throttle_str,
        ]
        row.extend(fan_data)
        row.extend([s0_temp, s0_target, s0_ctrl])
        row.append("|".join(flags))
        if self._csv_writer:
            self._csv_writer.writerow(row)
        self._total_samples += 1
        self._period_samples += 1
        # Spike snapshot (rate-limited).
        if spike_detected:
            self._total_spikes += 1
            now = time.monotonic()
            if now - self._last_spike_time >= SPIKE_COOLDOWN_SEC:
                self._last_spike_time = now
                self._flush_csv()
                self._write_spike_snapshot(row)
            else:
                log(f"SPIKE #{self._total_spikes} (cooldown, no snapshot)")

    def _make_failure_row(self, error: str) -> list[Any]:
        """Build a CSV failure row padded to the correct column count."""
        ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S.%f")[:-3]
        padding = max(0, self._csv_columns - 2)
        return [ts] + [""] * padding + [f"NVML_FAILED:{error[:80]}"]
| # ------------------------------------------------------------------ | |
| # Snapshot writers | |
| # ------------------------------------------------------------------ | |
| def _snapshot_kwargs(self) -> dict[str, Any]: | |
| """Common keyword arguments for _write_snapshot_header.""" | |
| return dict( | |
| gpu_name=self._gpu_name, | |
| gpu_pci=self._gpu_pci, | |
| uptime_sec=int(time.monotonic() - self._start_time), | |
| total_samples=self._total_samples, | |
| peak_temp=self._peak_temp, | |
| peak_power=self._peak_power, | |
| total_power_breaches=self._total_power_breaches, | |
| total_fan_inversions=self._total_fan_inversions, | |
| total_spurious_throttles=self._total_spurious_throttles, | |
| ) | |
| def _write_spike_snapshot(self, row: list[Any]) -> None: | |
| spike_dir = self.log_dir / "spikes" | |
| spike_dir.mkdir(exist_ok=True) | |
| spike_file = spike_dir / f"spike-{datetime.now().strftime('%Y%m%d-%H%M%S')}.log" | |
| try: | |
| with open(spike_file, "w") as f: | |
| _write_snapshot_header( | |
| f, title="FAN SPIKE DETECTED", | |
| event_number=self._total_spikes, | |
| **self._snapshot_kwargs(), | |
| ) | |
| f.write("=== Current state ===\n") | |
| for name, val in zip(self._get_csv_fields(), row): | |
| f.write(f" {name}: {val}\n") | |
| f.write("\n=== Last 50 telemetry readings ===\n") | |
| if self._csv_path: | |
| for line in tail_lines(self._csv_path, 50): | |
| f.write(f" {line}\n") | |
| self._write_thermal_thresholds(f) | |
| self._write_fan_policies(f) | |
| log(f"SPIKE #{self._total_spikes} → {spike_file}") | |
| except OSError as exc: | |
| log(f"WARNING: spike snapshot failed: {exc}") | |
| def _write_crash_snapshot(self, error: str) -> None: | |
| """Capture full diagnostic snapshot when NVML fails or Xid event fires.""" | |
| crash_dir = self.log_dir / "crashes" | |
| crash_dir.mkdir(exist_ok=True) | |
| crash_file = crash_dir / f"crash-{datetime.now().strftime('%Y%m%d-%H%M%S')}.log" | |
| try: | |
| with open(crash_file, "w") as f: | |
| _write_snapshot_header( | |
| f, title="GPU CRASH DETECTED", error=error, | |
| event_number=self._total_crashes, | |
| **self._snapshot_kwargs(), | |
| ) | |
| f.write("=== Last 50 telemetry readings ===\n") | |
| if self._csv_path: | |
| for line in tail_lines(self._csv_path, 50): | |
| f.write(f" {line}\n") | |
| diag_sections = [ | |
| ("dmesg (nvidia/gpu errors)", | |
| "dmesg 2>/dev/null | grep -iE " | |
| "'nvidia|gpu|nvrm|xid|uvm|BUG|ca7d|ca7e|GPU_IS_LOST|" | |
| "fallen|reg.read|gsp|pmu.*halt' | tail -100"), | |
| ("dmesg (last 200 lines)", "dmesg 2>/dev/null | tail -200"), | |
| ("PCIe AER errors", | |
| "dmesg 2>/dev/null | grep -iE 'AER|corrected|uncorrect|pcie.*error' | tail -20"), | |
| ("PCIe device state", f"lspci -vvs {self._gpu_pci_short} 2>/dev/null"), | |
| ("GPU register test (ffff = off bus)", | |
| f"setpci -s {self._gpu_pci_short} VENDOR_ID 2>/dev/null"), | |
| ("nvidia-smi (may fail)", "timeout 5 nvidia-smi 2>&1"), | |
| ("NVIDIA kernel modules", "lsmod 2>/dev/null | grep -i nvidia"), | |
| ("System memory", "free -h 2>/dev/null"), | |
| ("System load", "uptime 2>/dev/null"), | |
| ] | |
| for title, cmd in diag_sections: | |
| f.write(f"\n=== {title} ===\n") | |
| f.write(run_cmd(cmd)) | |
| log(f"CRASH #{self._total_crashes} → {crash_file}") | |
| except OSError as exc: | |
| log(f"WARNING: crash snapshot failed: {exc}") | |
| def _write_thermal_thresholds(self, f: IO[str]) -> None: | |
| f.write("\n=== Thermal thresholds ===\n") | |
| for label, idx in [("shutdown", 0), ("slowdown", 1), ("gpu_max", 3)]: | |
| val = nvml_safe(pynvml.nvmlDeviceGetTemperatureThreshold, self._handle, idx, default="N/A") | |
| f.write(f" {label}: {val}{'C' if isinstance(val, int) else ''}\n") | |
| def _write_fan_policies(self, f: IO[str]) -> None: | |
| f.write("\n=== Fan control policies ===\n") | |
| for i in range(self._num_fans): | |
| policy = nvml_safe(pynvml.nvmlDeviceGetFanControlPolicy_v2, self._handle, i, default="N/A") | |
| f.write(f" fan[{i}]: policy={policy}\n") | |
| # ------------------------------------------------------------------ | |
| # Status and summary | |
| # ------------------------------------------------------------------ | |
| def print_status(self) -> None: | |
| """One-line status summary. Safe to call from signal handler (GIL protects reads). | |
| Shows cumulative totals plus the current-period rates so operators can | |
| distinguish "bad long ago" from "bad right now." | |
| """ | |
| elapsed = int(time.monotonic() - self._start_time) | |
| h, remainder = divmod(elapsed, 3600) | |
| m = remainder // 60 | |
| # Period rates are per SUMMARY_INTERVAL_SEC window; derive the label | |
| # from the constant so it stays correct if the interval changes. | |
| pb = self._period_power_breaches | |
| fi = self._period_fan_inversions | |
| sp = self._period_spurious_throttles | |
| win = f"{SUMMARY_INTERVAL_SEC // 60}m" | |
| log( | |
| f"Status: {h}h{m}m | {self._total_samples} samples | " | |
| f"{self._total_spikes} spk | {self._total_crashes} crash | " | |
| f"pwr {self._total_power_breaches} (+{pb}/{win}) | " | |
| f"inv {self._total_fan_inversions} (+{fi}/{win}) | " | |
| f"spur {self._total_spurious_throttles} (+{sp}/{win}) | " | |
| f"tdp_chg {self._total_tdp_changes} | " | |
| f"peak: {self._peak_temp}C/{self._peak_power:.0f}W" | |
| ) | |
| def _maybe_log_summary(self) -> None: | |
| """Log periodic stats every SUMMARY_INTERVAL_SEC. | |
| Writes both cumulative totals AND per-period rates so that acceleration | |
| is visible by diff. Period counters are reset after each summary. | |
| """ | |
| now = time.monotonic() | |
| if now - self._last_summary_time < SUMMARY_INTERVAL_SEC: | |
| return | |
| # Compute actual window length so rates are accurate even if scheduling | |
| # skewed from the nominal interval. | |
| if self._last_summary_time == 0.0: | |
| window_sec = now - self._start_time | |
| else: | |
| window_sec = now - self._last_summary_time | |
| window_sec = max(window_sec, 1.0) # Avoid divide-by-zero on rapid re-entry. | |
| self._last_summary_time = now | |
| if self._total_samples == 0: | |
| return | |
| self.print_status() | |
| summary_path = self.log_dir / "summary.log" | |
| _rotate_file(summary_path, SUMMARY_LOG_MAX_BYTES) | |
| try: | |
| elapsed = int(now - self._start_time) | |
| h, remainder = divmod(elapsed, 3600) | |
| m = remainder // 60 | |
| ts = datetime.now().isoformat() | |
| # Rates are per-minute, computed over the actual window duration. | |
| # Easier to reason about than per-sample rates. | |
| per_min = 60.0 / window_sec | |
| with open(summary_path, "a") as f: | |
| f.write( | |
| f"{ts} | {h}h{m}m | " | |
| f"window={window_sec:.0f}s | " | |
| f"samples={self._total_samples} (+{self._period_samples}) " | |
| f"spikes={self._total_spikes} " | |
| f"crashes={self._total_crashes} " | |
| f"pwr_breaches={self._total_power_breaches} " | |
| f"(rate={self._period_power_breaches * per_min:.1f}/min) " | |
| f"fan_inversions={self._total_fan_inversions} " | |
| f"(rate={self._period_fan_inversions * per_min:.1f}/min) " | |
| f"spurious_throttles={self._total_spurious_throttles} " | |
| f"(rate={self._period_spurious_throttles * per_min:.1f}/min) " | |
| f"tdp_changes={self._total_tdp_changes} " | |
| f"(period={self._period_tdp_changes}) " | |
| f"peak_temp={self._peak_temp}C " | |
| f"peak_power={self._peak_power:.0f}W\n" | |
| ) | |
| except OSError: | |
| pass | |
| finally: | |
| # Reset period counters for next window. | |
| self._period_samples = 0 | |
| self._period_power_breaches = 0 | |
| self._period_fan_inversions = 0 | |
| self._period_spurious_throttles = 0 | |
| self._period_tdp_changes = 0 | |
| # ------------------------------------------------------------------ | |
| # Event listener (background daemon thread) | |
| # ------------------------------------------------------------------ | |
| def _run_event_listener(self) -> None: | |
| """Subscribe to NVML GPU events and log them. | |
| Xid events trigger an immediate crash snapshot since they are | |
| precursors to full GSP failure. | |
| """ | |
| try: | |
| event_set = pynvml.nvmlEventSetCreate() | |
| supported = int(nvml_safe( | |
| pynvml.nvmlDeviceGetSupportedEventTypes, self._handle, default=0, | |
| )) | |
| event_mask = 0 | |
| for etype in [ | |
| pynvml.nvmlEventTypePState, | |
| pynvml.nvmlEventTypeXidCriticalError, | |
| pynvml.nvmlEventTypeClock, | |
| ]: | |
| if supported & int(etype): | |
| event_mask |= int(etype) | |
| try: | |
| recovery_bit = int(pynvml.nvmlEventTypeGpuRecoveryAction) | |
| if supported & recovery_bit: | |
| event_mask |= recovery_bit | |
| except AttributeError: | |
| pass | |
| if event_mask == 0: | |
| log("WARNING: No GPU events supported by this driver") | |
| return | |
| pynvml.nvmlDeviceRegisterEvents(self._handle, event_mask, event_set) | |
| log(f"Event listener started (mask=0x{event_mask:04x})") | |
| clock_burst_count = 0 | |
| clock_burst_start = 0.0 | |
| while not self.shutdown.is_set(): | |
| try: | |
| data = pynvml.nvmlEventSetWait_v2(event_set, 1000) | |
| except pynvml.NVMLError as exc: | |
| if "Timeout" in type(exc).__name__: | |
| continue | |
| if not self.shutdown.is_set(): | |
| log(f"Event error: {exc}") | |
| break | |
| ts = datetime.now().isoformat() | |
| now = time.monotonic() | |
| etype = int(data.eventType) | |
| edata = int(data.eventData) | |
| names: list[str] = [] | |
| is_clock_only = True | |
| is_critical = False | |
| if etype & int(pynvml.nvmlEventTypePState): | |
| names.append("PState") | |
| is_clock_only = False | |
| if etype & int(pynvml.nvmlEventTypeXidCriticalError): | |
| names.append(f"Xid({edata})") | |
| is_clock_only = False | |
| is_critical = True | |
| try: | |
| if etype & int(pynvml.nvmlEventTypeGpuRecoveryAction): | |
| names.append("GpuRecovery") | |
| is_clock_only = False | |
| is_critical = True | |
| except AttributeError: | |
| pass | |
| if etype & int(pynvml.nvmlEventTypeClock): | |
| names.append("Clock") | |
| if not names: | |
| continue | |
| # Suppress chatty Clock-only events; log bursts. | |
| if is_clock_only: | |
| if now - clock_burst_start > 1.0: | |
| clock_burst_count = 0 | |
| clock_burst_start = now | |
| clock_burst_count += 1 | |
| if clock_burst_count == 10: | |
| log(f"EVENT: {ts} ClockBurst(10 in 1.0s)") | |
| else: | |
| log(f"EVENT: {ts} {'|'.join(names)}") | |
| self._write_event(f"{ts} type=0x{etype:04x} data={edata} {'|'.join(names)}") | |
| # Xid and GpuRecovery events are crash precursors. | |
| # Always count them, but rate-limit the disk-heavy snapshot: | |
| # a single crash typically floods 30-50+ Xid events within a | |
| # few seconds, and one snapshot per crash is sufficient. | |
| if is_critical: | |
| self._total_crashes += 1 | |
| if now - self._last_crash_time >= CRASH_COOLDOWN_SEC: | |
| self._last_crash_time = now | |
| self._crashes_suppressed = 0 | |
| log(f"CRITICAL EVENT — capturing crash snapshot " | |
| f"(crash #{self._total_crashes})") | |
| self._flush_csv() | |
| self._write_crash_snapshot(f"Event: {'|'.join(names)}") | |
| else: | |
| self._crashes_suppressed += 1 | |
| # Log first few suppressions so the burst is visible, | |
| # then go quiet until the cooldown expires. | |
| if self._crashes_suppressed <= 3: | |
| log(f"CRITICAL EVENT #{self._total_crashes} " | |
| f"(snapshot suppressed by {CRASH_COOLDOWN_SEC:.0f}s cooldown)") | |
| try: | |
| pynvml.nvmlEventSetFree(event_set) | |
| except pynvml.NVMLError: | |
| pass | |
| except pynvml.NVMLError as exc: | |
| log(f"Event listener failed to start: {exc}") | |
| # ------------------------------------------------------------------ | |
| # Main loop | |
| # ------------------------------------------------------------------ | |
| def run(self) -> None: | |
| """Initialize, poll, and shut down. Blocks until shutdown is signaled.""" | |
| self.log_dir.mkdir(parents=True, exist_ok=True) | |
| self._init_nvml() | |
| self._log_system_info() | |
| event_thread = threading.Thread( | |
| target=self._run_event_listener, name="gpu-events", daemon=True, | |
| ) | |
| event_thread.start() | |
| log(f"Polling every {self.interval}s → {self.log_dir}/") | |
| prev_failures = 0 | |
| try: | |
| while not self.shutdown.is_set(): | |
| # Disk space guard (throttled check). | |
| now = time.monotonic() | |
| if now - self._last_disk_check >= DISK_CHECK_INTERVAL_SEC: | |
| self._last_disk_check = now | |
| if not check_disk_space(self.log_dir): | |
| if not self._disk_paused: | |
| log(f"WARNING: disk < {MIN_DISK_MB}MB, pausing writes") | |
| self._disk_paused = True | |
| self.shutdown.wait(60) | |
| continue | |
| elif self._disk_paused: | |
| log("Disk space recovered, resuming") | |
| self._disk_paused = False | |
| if self._reopen_logs: | |
| self._reopen_logs = False # One-shot: clear the flag or every iteration reopens. | |
| self._csv_date = "" # Force CSV reopen. | |
| try: | |
| self._rotate_csv() | |
| self._sample() | |
| if prev_failures > 0: | |
| log(f"GPU RECOVERED after {prev_failures} failures") | |
| prev_failures = 0 | |
| self._consecutive_failures = 0 | |
| self._maybe_log_summary() | |
| except pynvml.NVMLError as exc: | |
| self._consecutive_failures += 1 | |
| prev_failures = self._consecutive_failures | |
| if self._consecutive_failures == 1: | |
| log(f"NVML FAILED — GPU crash? {exc}") | |
| self._flush_csv() | |
| self._total_crashes += 1 | |
| self._write_crash_snapshot(str(exc)) | |
| if self._csv_writer and self._csv_columns > 0: | |
| self._csv_writer.writerow(self._make_failure_row(str(exc))) | |
| if self._consecutive_failures == 5: | |
| log("GPU still unresponsive — backing off to 30s") | |
| if self._consecutive_failures >= 5: | |
| self.shutdown.wait(30) | |
| continue | |
| self.shutdown.wait(min(self.interval * 5, 5.0)) | |
| continue | |
| self.shutdown.wait(self.interval) | |
| except KeyboardInterrupt: | |
| pass | |
| finally: | |
| self.shutdown.set() | |
| self.print_status() | |
| self._close_csv() | |
| if self._events_fh and not self._events_fh.closed: | |
| self._events_fh.close() | |
| try: | |
| pynvml.nvmlShutdown() | |
| except Exception: | |
| pass | |
| log("Stopped") | |
| # --------------------------------------------------------------------------- | |
| # Lock file | |
| # --------------------------------------------------------------------------- | |
| def _acquire_lock() -> None: | |
| """Prevent duplicate instances via flock (auto-released on process exit).""" | |
| global _lock_fh | |
| try: | |
| LOCK_FILE.parent.mkdir(parents=True, exist_ok=True) | |
| _lock_fh = open(LOCK_FILE, "w") # noqa: SIM115 | |
| fcntl.flock(_lock_fh, fcntl.LOCK_EX | fcntl.LOCK_NB) | |
| _lock_fh.write(str(os.getpid())) | |
| _lock_fh.flush() | |
| except OSError: | |
| log(f"ERROR: already running or cannot create {LOCK_FILE}") | |
| sys.exit(1) | |
| # --------------------------------------------------------------------------- | |
| # Entry point | |
| # --------------------------------------------------------------------------- | |
| def main() -> None: | |
| parser = argparse.ArgumentParser( | |
| description="NVML GPU telemetry for diagnosing Blackwell GSP crashes", | |
| ) | |
| parser.add_argument("--interval", type=float, default=DEFAULT_INTERVAL, | |
| help=f"Seconds between samples (default: {DEFAULT_INTERVAL})") | |
| parser.add_argument("--log-dir", type=str, default=DEFAULT_LOG_DIR, | |
| help=f"Log directory (default: {DEFAULT_LOG_DIR})") | |
| parser.add_argument("--spike-threshold", type=int, default=DEFAULT_SPIKE_THRESHOLD, | |
| help=f"Fan %%/sample delta to flag (default: {DEFAULT_SPIKE_THRESHOLD})") | |
| parser.add_argument("--gpu", type=int, default=0, | |
| help="GPU index (default: 0)") | |
| parser.add_argument("--version", action="version", | |
| version=f"gpu-telemetry.py {VERSION}") | |
| args = parser.parse_args() | |
| if args.interval <= 0: | |
| log(f"ERROR: interval must be positive (got: {args.interval})") | |
| sys.exit(1) | |
| _acquire_lock() | |
| monitor = GPUMonitor(args.log_dir, args.interval, args.spike_threshold, args.gpu) | |
| def _handle_signal(signum: int, _: Any) -> None: | |
| if signum == signal.SIGUSR1: | |
| monitor.print_status() | |
| elif signum == signal.SIGHUP: | |
| log("SIGHUP — reopening logs") | |
| monitor._reopen_logs = True | |
| else: | |
| monitor.shutdown.set() | |
| signal.signal(signal.SIGTERM, _handle_signal) | |
| signal.signal(signal.SIGINT, _handle_signal) | |
| signal.signal(signal.SIGUSR1, _handle_signal) | |
| signal.signal(signal.SIGHUP, _handle_signal) | |
| monitor.run() | |
| if __name__ == "__main__": | |
| main() |
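`_acquire_lock()` leans on a useful property of `flock`: the kernel drops the lock automatically when the holding process exits, so there is no stale-lock cleanup. A minimal standalone sketch of the same single-instance pattern (the demo path and `try_lock` name are illustrative, not from the script):

```python
import fcntl
import os

def try_lock(path: str):
    """Return an open handle holding an exclusive flock, or None if taken."""
    fh = open(path, "w")
    try:
        # LOCK_NB fails immediately with BlockingIOError (a subclass of
        # OSError) instead of blocking when another holder exists.
        fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        fh.close()
        return None
    fh.write(str(os.getpid()))  # Record the holder's pid for operators.
    fh.flush()
    return fh

# flock is tied to the open file description, so even a second open() of
# the same path in the same process conflicts with the first lock:
first = try_lock("/tmp/flock-demo.lock")
second = try_lock("/tmp/flock-demo.lock")
print(first is not None, second is None)  # → True True
```

Note the handle must stay referenced for the lifetime of the process (the script keeps it in a module-level `_lock_fh`), since garbage-collecting it closes the file and releases the lock.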