Created
May 21, 2026 18:59
-
-
Save Starttoaster/0e1a952330ad070c49402c896390283b to your computer and use it in GitHub Desktop.
smartctl_exporter alerts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # PrometheusRule custom resource (monitoring.coreos.com/v1) carrying the smartctl_exporter alert rules that follow. | |
| apiVersion: monitoring.coreos.com/v1 | |
| kind: PrometheusRule | |
| metadata: | |
| name: smart-health-alerts | |
| namespace: monitoring | |
| spec: | |
| groups: | |
| # Rule group "smart.rules"; the alert definitions below are entries of its rules list. | |
| - name: smart.rules | |
| rules: | |
| # General health status alerts | |
| # Fires when smartctl_device_smart_status has been 0 for 5 minutes. | |
| # NOTE(review): presumably 0 means the drive failed its overall SMART health check - confirm against the exporter. | |
| # Labelled severity=critical and sendto=discord; the annotations reference the instance_name and device labels. | |
| - alert: SMARTHealthStatusDegraded | |
| expr: | | |
| smartctl_device_smart_status == 0 | |
| for: 5m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "Drive health status degraded on {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "SMART health check indicates drive failure or imminent failure." | |
| # Fires when a drive's current temperature reading stays above 70 for 1 minute. | |
| # The selector is restricted to temperature_type="current": the exporter publishes other | |
| # temperature_type series (e.g. trip/limit values) under the same metric name, and those are | |
| # static limits rather than live readings, so comparing them to 70 would raise false alerts. | |
| - alert: SMARTHighDeviceTemperature | |
| expr: | | |
| smartctl_device_temperature{temperature_type="current"} > 70 | |
| for: 1m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "High temperature on {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "Drive temperature is above 70°C for more than 1 minute." | |
| # Fires when a (device, instance_name) pair that reported smartctl_device at some point in the | |
| # last hour has had no current smartctl_device series for 5 minutes. | |
| # The look-back window (1h) must be longer than the "for" duration: with a look-back equal to | |
| # "for" (e.g. "offset 5m" with "for: 5m") the left-hand side disappears before the pending | |
| # period can complete, so the alert would never fire. | |
| # The alert resolves on its own once the device has been gone for longer than the look-back window. | |
| - alert: SMARTDeviceMissing | |
| expr: | | |
| count by (device,instance_name) (last_over_time(smartctl_device[1h])) | |
| unless | |
| count by (device,instance_name) (smartctl_device) | |
| for: 5m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "SMART device missing from {{ $labels.instance_name }} - device {{ $labels.device }}" | |
| description: "A drive or server may have gone offline." | |
| # SATA device Specific | |
| # Fires (warning) when the raw Reallocated_Sector_Ct or Current_Pending_Sector count rose during the last day. | |
| # Only attribute_value_type="raw" is selected, matching the other attribute rules in this group: | |
| # the same metric also carries non-raw series per attribute, and a falling normalized value | |
| # would be read by increase() as a counter reset and reported as a spurious positive increase. | |
| # NOTE(review): Current_Pending_Sector can legitimately drop again; increase() treats such a drop | |
| # as a counter reset, so the alert may linger for up to the 1d window after a decrease. | |
| - alert: SATAReallocationEventsDetected | |
| expr: | | |
| increase(smartctl_device_attribute{attribute_name=~"Reallocated_Sector_Ct|Current_Pending_Sector",attribute_value_type="raw"}[1d]) > 0 | |
| for: 5m | |
| labels: | |
| severity: warning | |
| sendto: discord | |
| annotations: | |
| summary: "Sector reallocation detected on {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "New bad sectors have been detected and reallocated in the last 24 hours." | |
| # Fires when a drive's wear indicator has stayed in the critical range for 10 minutes: | |
| # - normalized Wear_Leveling_Count or Media_Wearout_Indicator below 20, or | |
| # - smartctl_device_percentage_used above 80. | |
| # The ATA attributes are read from attribute_value_type="value" (the normalized value, which | |
| # counts down as the drive wears). The raw field is typically an erase-cycle count or 0 and would | |
| # satisfy "< 20" on a brand-new drive. | |
| # NOTE(review): normalized scales are vendor-specific; confirm 20 is a sensible floor for the drives in use. | |
| - alert: SATAWearLevelCritical | |
| expr: | | |
| ( | |
| smartctl_device_attribute{attribute_name="Wear_Leveling_Count",attribute_value_type="value"} < 20 or | |
| smartctl_device_attribute{attribute_name="Media_Wearout_Indicator",attribute_value_type="value"} < 20 or | |
| smartctl_device_percentage_used > 80 | |
| ) | |
| for: 10m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "Drive wear level critical on {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "Drive wear level has reached a critical threshold, indicating the drive is nearing end of life." | |
| # Fires when the normalized Bad_Block_Count value has been at or below the drive's own | |
| # threshold ("thresh") for 5 minutes, matched per (device, instance_name). | |
| # SMART thresholds are defined against the normalized value, not the raw counter, so the | |
| # comparison uses attribute_value_type="value"; comparing the raw count (0 on a healthy drive) | |
| # with the threshold would alert on healthy drives. | |
| # {{ $value }} in the description is the normalized value from the left-hand side. | |
| - alert: SATAHighBadBlockCount | |
| expr: | | |
| smartctl_device_attribute{attribute_name="Bad_Block_Count",attribute_value_type="value"} <= on(device,instance_name) | |
| smartctl_device_attribute{attribute_name="Bad_Block_Count",attribute_value_type="thresh"} | |
| for: 5m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "Bad block count at or below threshold on {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "Normalized Bad_Block_Count value ({{ $value }}) is at or below the failure threshold for device {{ $labels.device }}." | |
| # NVMe specific alerts | |
| # Early-warning tier: fires when available spare has been below twice the drive's spare | |
| # threshold for 24 hours. | |
| # Severity is "warning" so this tier is distinguishable from NVMeCriticalSpareBlocks, which | |
| # covers the case where spare drops below the threshold itself. | |
| - alert: NVMeWarningSpareBlocks | |
| expr: | | |
| smartctl_device_available_spare < (smartctl_device_available_spare_threshold * 2) | |
| for: 24h | |
| labels: | |
| severity: warning | |
| sendto: discord | |
| annotations: | |
| summary: "Low spare blocks on NVMe drive {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "Available spare blocks have dropped below twice the drive's spare threshold." | |
| # Fires when smartctl_device_available_spare has been below smartctl_device_available_spare_threshold for 24 hours. | |
| # The two metrics are compared directly, so they are matched on their full label sets. | |
| # NOTE(review): a 24h hold is long for a critical alert - confirm this delay is intended. | |
| - alert: NVMeCriticalSpareBlocks | |
| expr: | | |
| smartctl_device_available_spare < smartctl_device_available_spare_threshold | |
| for: 24h | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "Critically low spare blocks on NVMe drive {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "Available spare blocks have dropped below the drive's spare threshold." | |
| # Fires when smartctl_device_critical_warning has been greater than 0 for 1 minute. | |
| # NOTE(review): presumably the metric mirrors the NVMe critical-warning field, where any non-zero value flags a problem - confirm. | |
| - alert: NVMeCriticalControllerWarning | |
| expr: | | |
| smartctl_device_critical_warning > 0 | |
| for: 1m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "NVMe controller detected a critical warning {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "Critical warnings can indicate a failing controller." | |
| # Fires when smartctl_device_percentage_used has been above 90 for 1 minute. | |
| # The same metric is also checked at a lower bound (> 80) by SATAWearLevelCritical. | |
| - alert: NVMeDeviceWritePercentageUsed | |
| expr: | | |
| smartctl_device_percentage_used > 90 | |
| for: 1m | |
| labels: | |
| severity: critical | |
| sendto: discord | |
| annotations: | |
| summary: "NVMe percentage used exceeded 90% {{ $labels.instance_name }} {{ $labels.device }}" | |
| description: "May need to buy a new drive to replace this one soon." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment