Skip to content

Instantly share code, notes, and snippets.

@Starttoaster
Created May 21, 2026 18:59
Show Gist options
  • Select an option

  • Save Starttoaster/0e1a952330ad070c49402c896390283b to your computer and use it in GitHub Desktop.

Select an option

Save Starttoaster/0e1a952330ad070c49402c896390283b to your computer and use it in GitHub Desktop.
smartctl_exporter alerts
---
# PrometheusRule (Prometheus Operator CRD) defining disk-health alerts on top
# of smartctl_exporter metrics.
#
# Conventions used by every rule below:
#   * `sendto: discord` is attached as a routing label (presumably matched by
#     an Alertmanager route - confirm against the Alertmanager config).
#   * Summaries identify the drive via the `instance_name` and `device` labels.
#     NOTE(review): `instance_name` is not a default exporter label; it is
#     assumed to be added by relabeling - confirm.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smart-health-alerts
  namespace: monitoring
spec:
  groups:
    - name: smart.rules
      rules:
        # ---------------------------------------------------------------
        # General health status alerts
        # ---------------------------------------------------------------

        # Fires when the drive's overall SMART self-assessment reports 0.
        - alert: SMARTHealthStatusDegraded
          expr: |
            smartctl_device_smart_status == 0
          for: 5m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "Drive health status degraded on {{ $labels.instance_name }} {{ $labels.device }}"
            description: "SMART health check indicates drive failure or imminent failure."

        # Only the live reading is compared against the limit. Without the
        # `temperature_type` selector, any other temperature series exposed
        # for the device (e.g. a trip point) above 70 would fire forever.
        # NOTE(review): assumes the deployed exporter labels the live reading
        # with temperature_type="current" - confirm.
        - alert: SMARTHighDeviceTemperature
          expr: |
            smartctl_device_temperature{temperature_type="current"} > 70
          for: 1m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "High temperature on {{ $labels.instance_name }} {{ $labels.device }}"
            description: "Drive temperature is above 70°C for more than 1 minute."

        # Devices that were reported 15 minutes ago but are not reported now.
        # The look-back (`offset`) must be longer than `for`: the expression
        # is only true while the offset side still sees the vanished series,
        # so with an offset equal to `for` the hold time can never be reached
        # and the alert never fires. With 15m/5m the alert fires roughly 5
        # minutes after the device disappears and clears about 10 minutes
        # later if the device does not come back.
        - alert: SMARTDeviceMissing
          expr: |
            count by (device,instance_name) (smartctl_device offset 15m)
            unless
            count by (device,instance_name) (smartctl_device)
          for: 5m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "SMART device missing from {{ $labels.instance_name }} - device {{ $labels.device }}"
            description: "A drive or server may have gone offline."

        # ---------------------------------------------------------------
        # SATA device specific
        # ---------------------------------------------------------------

        # Growth of the raw reallocated / pending sector counters over a day.
        # Restricted to the raw series: the normalized `value`/`worst` series
        # move downwards as a drive degrades, which increase() would treat as
        # a counter reset, and `thresh` is a constant.
        - alert: SATAReallocationEventsDetected
          expr: |
            increase(
              smartctl_device_attribute{
                attribute_name=~"Reallocated_Sector_Ct|Current_Pending_Sector",
                attribute_value_type="raw"
              }[1d]
            ) > 0
          for: 5m
          labels:
            severity: warning
            sendto: discord
          annotations:
            summary: "Sector reallocation detected on {{ $labels.instance_name }} {{ $labels.device }}"
            description: "New reallocated or pending bad sectors have been detected in the last 24 hours."

        # Wear indicators are evaluated on the normalized SMART value, which
        # counts down as the drive wears. The raw value is vendor specific
        # (often an erase-cycle count that starts near 0), so comparing raw
        # against a low limit would fire on brand-new drives.
        - alert: SATAWearLevelCritical
          expr: |
            (
              smartctl_device_attribute{attribute_name="Wear_Leveling_Count",attribute_value_type="value"} < 20 or
              smartctl_device_attribute{attribute_name="Media_Wearout_Indicator",attribute_value_type="value"} < 20 or
              smartctl_device_percentage_used > 80
            )
          for: 10m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "Drive wear level critical on {{ $labels.instance_name }} {{ $labels.device }}"
            description: "Drive wear level has reached a critical threshold, indicating the drive is nearing end of life."

        # SMART failure thresholds apply to the normalized value, not to the
        # raw count: an attribute is failing once value <= thresh. The two
        # series differ only in `attribute_value_type`, hence the explicit
        # on(device,instance_name) matching.
        - alert: SATAHighBadBlockCount
          expr: |
            smartctl_device_attribute{attribute_name="Bad_Block_Count",attribute_value_type="value"}
              <= on(device,instance_name)
            smartctl_device_attribute{attribute_name="Bad_Block_Count",attribute_value_type="thresh"}
          for: 5m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "Bad block count at or below threshold on {{ $labels.instance_name }} {{ $labels.device }}"
            description: >-
              Normalized Bad_Block_Count value ({{ $value }}) is at or below
              the drive's failure threshold for device {{ $labels.device }}.

        # ---------------------------------------------------------------
        # NVMe specific alerts
        # ---------------------------------------------------------------

        # Early warning: spare capacity below twice the drive's own threshold.
        # Severity is `warning` so it is distinguishable from
        # NVMeCriticalSpareBlocks, which covers the below-threshold case.
        - alert: NVMeWarningSpareBlocks
          expr: |
            smartctl_device_available_spare < (smartctl_device_available_spare_threshold * 2)
          for: 24h
          labels:
            severity: warning
            sendto: discord
          annotations:
            summary: "Low spare blocks on NVMe drive {{ $labels.instance_name }} {{ $labels.device }}"
            description: "Available spare blocks have dropped below twice the drive's spare threshold."

        # Spare capacity has fallen below the drive-reported threshold.
        - alert: NVMeCriticalSpareBlocks
          expr: |
            smartctl_device_available_spare < smartctl_device_available_spare_threshold
          for: 24h
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "Critically low spare blocks on NVMe drive {{ $labels.instance_name }} {{ $labels.device }}"
            description: "Available spare blocks have dropped below the drive's spare threshold."

        # Any non-zero critical-warning value reported by the controller.
        - alert: NVMeCriticalControllerWarning
          expr: |
            smartctl_device_critical_warning > 0
          for: 1m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "NVMe controller detected a critical warning {{ $labels.instance_name }} {{ $labels.device }}"
            description: "Critical warnings can indicate a failing controller."

        # Drive-reported endurance consumption above 90%.
        - alert: NVMeDeviceWritePercentageUsed
          expr: |
            smartctl_device_percentage_used > 90
          for: 1m
          labels:
            severity: critical
            sendto: discord
          annotations:
            summary: "NVMe percentage used exceeded 90% {{ $labels.instance_name }} {{ $labels.device }}"
            description: "May need to buy a new drive to replace this one soon."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment