Update prometheus alert rules with upstream

certbot_on_virtu
Alexandre Iooss 2021-06-07 21:44:52 +02:00 committed by Yohann D'ANELLO
parent f37d195543
commit 0b4c77eb0c
Signed by: _ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed files with 218 additions and 54 deletions

View File

@ -1,74 +1,235 @@
{{ ansible_header | comment }} {{ ansible_header | comment }}
{# As this is also Jinja2 it will conflict without a raw block #} {# As this is also using brackets it will conflict without a raw block #}
{# Rules can change depending on the Prometheus Node exporter version #}
{% raw %} {% raw %}
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
# We remove descriptions as we only send summary on IRC.
# UPS, APT and RADIUS configuration is made by Crans.
groups: groups:
- name: alert.rules - name: alert.rules
rules: rules:
# Alert for any instance that is unreachable for >3 minutes. ##############################
- alert: InstanceDown # Prometheus self-monitoring #
##############################
- alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
- alert: PrometheusTargetMissing
expr: up == 0 expr: up == 0
for: 3m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !" summary: {{ $labels.instance }} ({{ $labels.job }}) est manquant
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
- alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target empty (instance {{ $labels.instance }})
# This already happened in 2021 at Crans
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
#####################
# Host and hardware #
#####################
# Alert for out of memory # Alert for out of memory
# Do not take into account memory not used by apps # Do not take into account memory not used by apps
- alert: OutOfMemory - alert: HostOutOfMemory
expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10 expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%." summary: Host unusual disk read rate (instance {{ $labels.instance }})
# Alert for out of disk space - alert: HostUnusualDiskWriteRate
- alert: OutOfDiskSpace expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10 for: 2m
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%." summary: Host unusual disk write rate (instance {{ $labels.instance }})
# Alert for out of inode space on disk - alert: HostOutOfDiskSpace
- alert: OutOfInodes expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: {{ $labels.mountpoint }} sur {{ $labels.instance }} arrive à saturation ({{ $value }}%)
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
- alert: HostOutOfInodes
expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10 expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}." summary: Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}
# Alert for high CPU usage - alert: HostHighCpuLoad
- alert: CpuBusy
expr: node_load5 > 9 expr: node_load5 > 9
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Charge sur {{ $labels.instance }} à {{ $value }}." summary: Charge sur {{ $labels.instance }} à {{ $value }}
# Check mdadm software RAID - alert: HostSystemdServiceCrashed
- alert: SoftwareRAIDDegraded
expr: node_md_disks-node_md_disks_active > 0
for: 3m
labels:
severity: warning
annotations:
summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."
# Check systemd unit (> buster)
- alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} == 1 expr: node_systemd_unit_state{state="failed"} == 1
for: 10m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" summary: {{ $labels.name }} a crashé sur {{ $labels.instance }}
# 0B is so hot
# En pratique c'est mauvais de tourner des disques trop chauds
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_crit_alarm_celsius == 1
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
# This happened in June 2021 at Crans
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
############
# Blackbox #
############
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
#######
# UPS #
#######
# Check UPS # Check UPS
- alert: UpsOutputSourceChanged - alert: UpsOutputSourceChanged
@ -77,8 +238,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La source d'alimentation de {{ $labels.instance }} a changé !" summary: La source d'alimentation de {{ $labels.instance }} a changé !
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsBatteryStatusChanged - alert: UpsBatteryStatusChanged
expr: upsBatteryStatus != 2 expr: upsBatteryStatus != 2
@ -86,8 +246,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "L'état de la batterie de {{ $labels.instance }} a changé !" summary: L'état de la batterie de {{ $labels.instance }} a changé !
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsTemperatureWarning - alert: UpsTemperatureWarning
expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26) expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
@ -95,8 +254,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C." summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsTemperatureCritical - alert: UpsTemperatureCritical
expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30) expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
@ -104,8 +262,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !" summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsHighHumidity - alert: UpsHighHumidity
expr: xupsEnvRemoteHumidity > 65 expr: xupsEnvRemoteHumidity > 65
@ -113,8 +270,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%." summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsVeryHighHumidity - alert: UpsVeryHighHumidity
expr: xupsEnvRemoteHumidity > 85 expr: xupsEnvRemoteHumidity > 85
@ -122,8 +278,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !" summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsHighLoad - alert: UpsHighLoad
expr: upsOutputPercentLoad > 70 expr: upsOutputPercentLoad > 70
@ -131,8 +286,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" summary: La charge de {{ $labels.instance }} est de {{ $value }}%
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsWrongInputVoltage - alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
@ -140,8 +294,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsWrongOutputVoltage - alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245) expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
@ -149,8 +302,11 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
#########
# Other #
#########
- alert: AptAutoremovePending - alert: AptAutoremovePending
expr: apt_autoremove_pending > 0 expr: apt_autoremove_pending > 0
@ -158,7 +314,15 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}." summary: {{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}
- alert: AptOrphans
expr: apt_orphans > 10
for: 5m
labels:
severity: warning
annotations:
summary: {{ $value }} paquet(s) APT sont orphelins sur {{ $labels.instance }}
- alert: MailqNotEmpty - alert: MailqNotEmpty
expr: postfix_mailq_length > 25 expr: postfix_mailq_length > 25
@ -166,7 +330,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}." summary: {{ $value }} mails dans la mailq sur {{ $labels.instance }}
- alert: NoRadiusLogin - alert: NoRadiusLogin
expr: rate(radiusd_access_ok[3m]) == 0 expr: rate(radiusd_access_ok[3m]) == 0
@ -174,7 +338,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Personne ne vient taper le RADIUS." summary: Personne ne vient taper le RADIUS
- alert: TooManyReallocatedSectors - alert: TooManyReallocatedSectors
expr: smartmon_reallocated_sector_ct_raw_value > 1e3 expr: smartmon_reallocated_sector_ct_raw_value > 1e3
@ -182,6 +346,6 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués." summary: {{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués
{% endraw %} {% endraw %}