Update prometheus alert rules with upstream

2021-06-07 21:44:52 +02:00 · 2021-06-07 21:44:52 +02:00 · 0b4c77eb0c
parent f37d195543
commit 0b4c77eb0c
1 changed files with 218 additions and 54 deletions
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@ -1,74 +1,235 @@
 {{ ansible_header | comment }}
-{# As this is also Jinja2 it will conflict without a raw block #}
-{# Depending of Prometheus Node exporter version, rules can change depending of version #}
+{# As this is also using brackets it will conflict without a raw block #}
 {% raw %}
+# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
+# We remove descriptions as we only send summary on IRC.
+# UPS, APT and RADIUS configuration is made by Crans.
+
 groups:
 - name: alert.rules
  rules:

-  # Alert for any instance that is unreachable for >3 minutes.
-  - alert: InstanceDown
+  ##############################
+  # Prometheus self-monitoring #
+  ##############################
+
+  - alert: PrometheusJobMissing
+    expr: absent(up{job="prometheus"})
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus job missing (instance {{ $labels.instance }})
+
+  - alert: PrometheusTargetMissing
    expr: up == 0
-    for: 3m
+    for: 0m
    labels:
      severity: critical
    annotations:
-      summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $labels.instance }} ({{ $labels.job }}) est manquant"
+
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus too many restarts (instance {{ $labels.instance }})
+
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+
+  - alert: PrometheusTargetEmpty
+    expr: prometheus_sd_discovered_targets == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target empty (instance {{ $labels.instance }})
+
+  # This already happened in 2021 at Crans
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+
+  #####################
+  # Host and hardware #
+  #####################

  # Alert for out of memory
  # Do not take into account memory not used by apps
-  - alert: OutOfMemory
+  - alert: HostOutOfMemory
    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
+
+  - alert: HostUnusualDiskReadRate
+    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
-      summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."
+      summary: Host unusual disk read rate (instance {{ $labels.instance }})

-  # Alert for out of disk space
-  - alert: OutOfDiskSpace
-    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
-    for: 5m
+  - alert: HostUnusualDiskWriteRate
+    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+    for: 2m
    labels:
      severity: warning
    annotations:
-      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."
+      summary: Host unusual disk write rate (instance {{ $labels.instance }})

-  # Alert for out of inode space on disk
-  - alert: OutOfInodes
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $labels.mountpoint }} sur {{ $labels.instance }} arrive à saturation ({{ $value }}%)"
+
+  - alert: HostDiskWillFillIn24Hours
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+
+  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
-      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
+      summary: Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}

-  # Alert for high CPU usage
-  - alert: CpuBusy
+  - alert: HostHighCpuLoad
    expr: node_load5 > 9
    for: 10m
    labels:
      severity: warning
    annotations:
-      summary: "Charge sur {{ $labels.instance }} à {{ $value }}."
+      summary: Charge sur {{ $labels.instance }} à {{ $value }}

-  # Check mdadm software RAID
-  - alert: SoftwareRAIDDegraded
-    expr: node_md_disks-node_md_disks_active > 0
-    for: 3m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."
-
-  # Check systemd unit (> buster)
-  - alert: SystemdServiceFailed
+  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 10m
+    for: 0m
    labels:
      severity: warning
    annotations:
-      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $labels.name }} a crashé sur {{ $labels.instance }}"
+
+  # 0B is so hot
+  # En pratique c'est mauvais de tourner des disques trop chauds
+  - alert: HostPhysicalComponentTooHot
+    expr: node_hwmon_temp_celsius > 75
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host physical component too hot (instance {{ $labels.instance }})
+
+  - alert: HostNodeOvertemperatureAlarm
+    expr: node_hwmon_temp_crit_alarm_celsius == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+
+  - alert: HostRaidDiskFailure
+    expr: node_md_disks{state="failed"} > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
+
+  - alert: HostOomKillDetected
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host OOM kill detected (instance {{ $labels.instance }})
+
+  - alert: HostEdacCorrectableErrorsDetected
+    expr: increase(node_edac_correctable_errors_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+
+  - alert: HostEdacUncorrectableErrorsDetected
+    expr: node_edac_uncorrectable_errors_total > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+
+  # This happened in June 2021 at Crans
+  - alert: HostConntrackLimit
+    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host conntrack limit (instance {{ $labels.instance }})
+
+  ############
+  # Blackbox #
+  ############
+
+  - alert: BlackboxProbeFailed
+    expr: probe_success == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox probe failed (instance {{ $labels.instance }})
+
+  - alert: BlackboxSlowProbe
+    expr: avg_over_time(probe_duration_seconds[1m]) > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox slow probe (instance {{ $labels.instance }})
+
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
+
+  #######
+  # UPS #
+  #######

  # Check UPS
  - alert: UpsOutputSourceChanged
@ -77,8 +238,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La source d'alimentation de {{ $labels.instance }} a changé !

  - alert: UpsBatteryStatusChanged
    expr: upsBatteryStatus != 2
@ -86,8 +246,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: L'état de la batterie de {{ $labels.instance }} a changé !

  - alert: UpsTemperatureWarning
    expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
@ -95,8 +254,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C

  - alert: UpsTemperatureCritical
    expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
@ -104,8 +262,7 @@ groups:
    labels:
      severity: critical
    annotations:
-      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C

  - alert: UpsHighHumidity
    expr: xupsEnvRemoteHumidity > 65
@ -113,8 +270,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%

  - alert: UpsVeryHighHumidity
    expr: xupsEnvRemoteHumidity > 85
@ -122,8 +278,7 @@ groups:
    labels:
      severity: critical
    annotations:
-      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%

  - alert: UpsHighLoad
    expr: upsOutputPercentLoad > 70
@ -131,8 +286,7 @@ groups:
    labels:
      severity: critical
    annotations:
-      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La charge de {{ $labels.instance }} est de {{ $value }}%

  - alert: UpsWrongInputVoltage
    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
@ -140,8 +294,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V

  - alert: UpsWrongOutputVoltage
    expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
@ -149,8 +302,11 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
+
+  #########
+  # Other #
+  #########

  - alert: AptAutoremovePending
    expr: apt_autoremove_pending > 0
@ -158,7 +314,15 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
+
+  - alert: AptOrphans
+    expr: apt_orphans > 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $value }} paquet(s) APT sont orphelins sur {{ $labels.instance }}"

  - alert: MailqNotEmpty
    expr: postfix_mailq_length > 25
@ -166,7 +330,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}"

  - alert: NoRadiusLogin
    expr: rate(radiusd_access_ok[3m]) == 0
@ -174,7 +338,7 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "Personne ne vient taper le RADIUS."
+      summary: Personne ne vient taper le RADIUS

  - alert: TooManyReallocatedSectors
    expr: smartmon_reallocated_sector_ct_raw_value > 1e3
@ -182,6 +346,6 @@ groups:
    labels:
      severity: warning
    annotations:
-      summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."
+      # Keep quoted: a value starting with "{{" is parsed as a YAML flow mapping.
+      summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"

 {% endraw %}