Created
August 8, 2020 21:10
-
-
Save hagen1778/20ba5af021db40569000608992fecb7d to your computer and use it in GitHub Desktop.
Migrating data from Prometheus to VM. Prometheus rules config
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
groups:
# Per-node CPU recording rules built from node_cpu_seconds_total.
# The group is evaluated every 10s; later rules read series recorded by earlier ones.
- name: CPU rules
  interval: 10s
  rules:
    # The count of CPUs per node, useful for getting CPU time as a percent of total.
    # Counts the idle-mode series once the cpu and mode labels are dropped.
    - record: instance:node_cpus:count
      expr: >
        count without (cpu, mode) (
          node_cpu_seconds_total{mode="idle"}
        )
    # CPU in use, per CPU: one minus the 1m idle rate, with the mode label dropped.
    - record: instance_cpu:node_cpu_seconds_not_idle:rate1m
      expr: >
        sum without (mode) (
          1 - rate(node_cpu_seconds_total{mode="idle"}[1m])
        )
    # CPU in use by mode, summed across CPUs.
    # Split recording for iowait to avoid reset bugs: every mode except iowait
    # uses rate() here; iowait is recorded by the next rule under the same name.
    - record: instance_mode:node_cpu_seconds:rate1m
      expr: >
        sum without (cpu) (
          rate(node_cpu_seconds_total{mode!="iowait"}[1m])
        )
    # iowait half of the split: uses deriv() and keeps only positive slopes.
    - record: instance_mode:node_cpu_seconds:rate1m
      expr: >
        sum without (cpu) (
          deriv(node_cpu_seconds_total{mode="iowait"}[1m]) > 0
        )
    # CPU in use ratio: the per-CPU non-idle value averaged across CPUs.
    - record: instance:node_cpu_utilization:ratio
      expr: >
        avg without (cpu) (
          instance_cpu:node_cpu_seconds_not_idle:rate1m
        )
    # CPU summaries: min / avg / max of the utilization ratio after dropping
    # the fqdn and instance labels.
    - record: job:node_cpu_utilization:min_ratio
      expr: >
        min without (fqdn, instance) (
          instance:node_cpu_utilization:ratio
        )
    - record: job:node_cpu_utilization:avg_ratio
      expr: >
        avg without (fqdn, instance) (
          instance:node_cpu_utilization:ratio
        )
    - record: job:node_cpu_utilization:max_ratio
      expr: >
        max without (fqdn, instance) (
          instance:node_cpu_utilization:ratio
        )
# Rules for calculating and alerting on long-term node utilization issues.
# Only recording rules are defined in this group: each one summarises an
# instance-level CPU or memory utilization ratio over a 300s window
# (maximum, average and 0.95 quantile). The group is evaluated every 20s.
- name: Utilization
  interval: 20s
  rules:
    # CPU utilization over the last 300s.
    - record: instance:cpu_utilization:ratio_max
      expr: max_over_time(instance:node_cpu_utilization:ratio[300s])
    - record: instance:cpu_utilization:ratio_avg
      expr: avg_over_time(instance:node_cpu_utilization:ratio[300s])
    - record: instance:cpu_utilization:ratio_q95
      expr: quantile_over_time(0.95, instance:node_cpu_utilization:ratio[300s])
    # Memory utilization over the last 300s.
    - record: instance:memory_utilization:ratio_max
      expr: max_over_time(instance:node_memory_utilization:ratio[300s])
    - record: instance:memory_utilization:ratio_avg
      expr: avg_over_time(instance:node_memory_utilization:ratio[300s])
    - record: instance:memory_utilization:ratio_q95
      expr: quantile_over_time(0.95, instance:node_memory_utilization:ratio[300s])
# Memory availability and utilization ratios per instance.
# No interval is set on this group.
- name: Node memory
  rules:
    # Available memory as a fraction of total memory. Uses
    # node_memory_MemAvailable_bytes where that series exists; otherwise falls
    # back (via `or`) to Buffers + Cached + MemFree + Slab.
    - record: instance:node_memory_available:ratio
      expr: >
        (
          node_memory_MemAvailable_bytes or
          (
            node_memory_Buffers_bytes +
            node_memory_Cached_bytes +
            node_memory_MemFree_bytes +
            node_memory_Slab_bytes
          )
        ) /
        node_memory_MemTotal_bytes
    # Utilization is the complement of the available ratio recorded above.
    - record: instance:node_memory_utilization:ratio
      expr: 1 - instance:node_memory_available:ratio
# Filesystem and disk recording rules, plus a count of node targets that are up.
# No interval is set on this group.
- name: Node filesystem rules
  rules:
    # Available / total bytes for devices matching /dev/... or tank/dataset.
    - record: instance:node_filesystem_avail:ratio
      expr: node_filesystem_avail_bytes{device=~"(/dev/.+|tank/dataset)"} / node_filesystem_size_bytes{device=~"(/dev/.+|tank/dataset)"}
    # Completed writes / reads (irate over 1m) for sd* devices, summed across devices.
    - record: instance:node_disk_writes_completed:irate1m
      expr: sum(irate(node_disk_writes_completed_total{device=~"sd.*"}[1m])) WITHOUT (device)
    - record: instance:node_disk_reads_completed:irate1m
      expr: sum(irate(node_disk_reads_completed_total{device=~"sd.*"}[1m])) WITHOUT (device)
    # Disk I/O time irate averaged per node. The node label is brought in by
    # joining on (namespace, pod) with node_namespace_pod:kube_pod_info:,
    # a series that is not recorded anywhere in this file.
    - expr: |-
        avg by (node) (
          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
        * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    # Weighted disk I/O time irate averaged over all matching series (no grouping).
    # The record name starts with ':' and is therefore quoted.
    - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
      record: ':node_disk_saturation:avg_irate'
    # Same weighted I/O time irate, averaged per node using the kube_pod_info join.
    - expr: |-
        avg by (node) (
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
        * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    # Used fraction, (size - avail) / size, for ext2/3/4, btrfs, xfs and zfs
    # filesystems; max per (instance, namespace, pod, device).
    # The record name ends with ':' and is therefore quoted.
    - expr: |-
        max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    # Available fraction, avail / size, for the same filesystem types and grouping.
    - expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    # Number of job="node" targets with a non-empty type label that report
    # up == 1, counted after dropping the instance and fqdn labels.
    - record: instance:up:count
      expr: count(up{job="node",type!=""} == 1) WITHOUT (instance, fqdn)
# Instance-level recording rules over node exporter metrics: CPU, load,
# memory, major page faults, disk I/O time and network traffic.
# All rates use a 1m window. No interval is set on this group.
- name: node-exporter.examples
  rules:
    # Number of CPUs: collapse the mode label first, then count the
    # remaining per-cpu series once the cpu label is dropped.
    - record: instance:node_num_cpu:sum
      expr: |-
        count without (cpu) (
          count without (mode) (
            node_cpu_seconds_total
          )
        )
    # CPU utilisation: one minus the idle rate averaged across CPUs.
    - expr: |-
        1 - avg without (cpu, mode) (
          rate(node_cpu_seconds_total{mode="idle"}[1m])
        )
      record: instance:node_cpu_utilisation:rate1m
    # 1-minute load average divided by the CPU count recorded above.
    - expr: |-
        (
          node_load1
        /
          instance:node_num_cpu:sum
        )
      record: instance:node_load1_per_cpu:ratio
    # Memory utilisation: one minus MemAvailable / MemTotal.
    - expr: |-
        1 - (
          node_memory_MemAvailable_bytes
        /
          node_memory_MemTotal_bytes
        )
      record: instance:node_memory_utilisation:ratio
    # Rate of node_vmstat_pgmajfault.
    - expr: rate(node_vmstat_pgmajfault[1m])
      record: instance:node_vmstat_pgmajfault:rate1m
    # Per-device I/O time and weighted I/O time rates for devices whose name
    # matches the nvme/rbd/sd/vd/xvd/dm-/dasd pattern.
    - expr: rate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
      record: instance_device:node_disk_io_time_seconds:rate1m
    - expr: rate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
    # Network receive / transmit byte rates, summed across every device except "lo".
    - expr: |-
        sum without (device) (
          rate(node_network_receive_bytes_total{ device!="lo"}[1m])
        )
      record: instance:node_network_receive_bytes_excluding_lo:rate1m
    - expr: |-
        sum without (device) (
          rate(node_network_transmit_bytes_total{device!="lo"}[1m])
        )
      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
    # Network receive / transmit drop rates, summed across every device except "lo".
    - expr: |-
        sum without (device) (
          rate(node_network_receive_drop_total{device!="lo"}[1m])
        )
      record: instance:node_network_receive_drop_excluding_lo:rate1m
    - expr: |-
        sum without (device) (
          rate(node_network_transmit_drop_total{device!="lo"}[1m])
        )
      record: instance:node_network_transmit_drop_excluding_lo:rate1m
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment