{{ ansible_header | comment }}
{# The alert annotations below use the same double-brace delimiters as Jinja2, so the rules are wrapped in a raw block and emitted verbatim. #}
{# Rules may need adjusting depending on the Prometheus node exporter version in use. #}
{% raw %}
groups:
  - name: alert.rules
    rules:

      # Fires when a target has been unreachable (up == 0) for more than
      # 3 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"

      # Low memory: the counters summed below are treated as memory not used
      # by applications; alert when they stay under 10% of MemTotal for
      # 5 minutes.
      - alert: OutOfMemory
        expr: >-
          (node_memory_MemFree_bytes + node_memory_Cached_bytes
          + node_memory_Buffers_bytes + node_memory_PageTables_bytes
          + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes
          + node_memory_Slab_bytes)
          / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."

      # Low disk space: under 10% free on a filesystem for 5 minutes.
      # Only series with fstype="ext4" are matched.
      - alert: OutOfDiskSpace
        expr: >-
          node_filesystem_free_bytes{fstype="ext4"}
          / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."

      # Low inode count: under 10% of inodes free on a filesystem for
      # 5 minutes. Only series with fstype="ext4" are matched.
      - alert: OutOfInodes
        expr: >-
          node_filesystem_files_free{fstype="ext4"}
          / node_filesystem_files{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."

      # High load: node_load5 above 5 for 10 minutes; the threshold is 7 for
      # zbee.adm.crans.org.
      - alert: CpuBusy
        expr: >-
          node_load5{instance="zbee.adm.crans.org"} > 7
          or node_load5{instance!="zbee.adm.crans.org"} > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Charge sur {{ $labels.instance }} à {{ $value }}."

      # mdadm software RAID: node_md_disks exceeds node_md_disks_active for
      # 3 minutes; the difference is reported as the number of lost disks.
      - alert: SoftwareRAIDDegraded
        expr: node_md_disks-node_md_disks_active > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."

      # A systemd unit has been in the "failed" state for 10 minutes.
      # NOTE(review): the original note says "> buster" - presumably this
      # needs hosts newer than Debian buster; confirm.
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"

      # UPS checks. Each rule carries a Grafana dashboard URL as its
      # description.

      # Output source differs from 3 for 5 minutes.
      # NOTE(review): 3 is presumably the "normal" output source - confirm
      # against the UPS MIB.
      - alert: UpsOutputSourceChanged
        expr: upsOutputSource != 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Battery status differs from 2 for 5 minutes.
      # NOTE(review): 2 is presumably the "normal" battery status - confirm
      # against the UPS MIB.
      - alert: UpsBatteryStatusChanged
        expr: upsBatteryStatus != 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Remote temperature probe below 10 or above 26 (reported as degrees
      # Celsius in the summary) for 5 minutes.
      - alert: UpsTemperatureWarning
        expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Same probe, critical range: below 0 or above 30 for 5 minutes.
      - alert: UpsTemperatureCritical
        expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Remote humidity probe above 65% for 5 minutes.
      - alert: UpsHighHumidity
        expr: xupsEnvRemoteHumidity > 65
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Same probe, critical threshold: above 85% for 5 minutes.
      - alert: UpsVeryHighHumidity
        expr: xupsEnvRemoteHumidity > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Output load above 70% for 5 minutes.
      - alert: UpsHighLoad
        expr: upsOutputPercentLoad > 70
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Input voltage outside the 210-250 range (volts, per the summary) for
      # 5 minutes.
      - alert: UpsWrongInputVoltage
        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # Output voltage outside the 225-235 range (volts, per the summary)
      # for 5 minutes.
      - alert: UpsWrongOutputVoltage
        expr: (upsOutputVoltage < 225) or (upsOutputVoltage > 235)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      # apt_autoremove_pending has been above 0 for 5 minutes.
      - alert: AptAutoremovePending
        expr: apt_autoremove_pending > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."

      # More than 5 mails in the Postfix queue for 1 minute.
      - alert: MailqNotEmpty
        expr: postfix_mailq_length > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."

      # NTP drift rules, currently disabled (they need the NTP plugin on the
      # node). Kept verbatim so they can be re-enabled by uncommenting.
      # - alert: ntp_drifting
      #   expr: node_ntp_drift_seconds > 0.05
      #   for: 3m
      #   labels:
      #     severity: critical
      #   annotations:
      #     summary: "Décalage NTP trop élevé sur {{ $labels.instance }}"
      #     description: "Le décalage NTP est trop élevé ({{ $value }} > 0.05)"
      # - alert: ntp_drifting
      #   expr: node_ntp_drift_seconds > 0.01
      #   for: 1m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: "Décalage NTP élevé sur {{ $labels.instance }}"
      #     description: "Le décalage NTP est élevé ({{ $value }} > 0.01)"
{% endraw %}