From b5a17c57da8a5f9322ff4b58596e2654542e5198 Mon Sep 17 00:00:00 2001
From: Yohann D'ANELLO
Date: Tue, 9 Aug 2022 16:17:30 +0200
Subject: [PATCH] [Prometheus] Add new alerts

Signed-off-by: Yohann D'ANELLO
---
Review notes (this section is ignored by `git am`):

- PostgresqlTooManyConnections: both sides of the comparison are now
  aggregated by (instance, job, server). Previously the left side only
  carried `datname` while the right side carried the exporter labels, so
  one-to-one vector matching never produced a sample and the alert could
  not fire.
- PostgresqlNotEnoughConnections: aggregate by (instance, datname) so the
  `instance` label used in the summary survives the aggregation.
- UpsTooHighPower: `sum()` without a `by` clause drops every label, and a
  metric name is not a label anyway; the summary now prints `$value`.
- Only payload of added lines was modified; line counts per hunk are
  unchanged.

 host_vars/fyre.adm.crans.org.yml              |   2 +-
 .../templates/prometheus/alert.rules.yml.j2   | 184 +++++++++++-------
 2 files changed, 114 insertions(+), 72 deletions(-)

diff --git a/host_vars/fyre.adm.crans.org.yml b/host_vars/fyre.adm.crans.org.yml
index 6618c3a0..3c87c29d 100644
--- a/host_vars/fyre.adm.crans.org.yml
+++ b/host_vars/fyre.adm.crans.org.yml
@@ -219,7 +219,7 @@ loc_prometheus:
         metrics_path: '/snmp'
         params:
           module:
-            - eatonups
+            - apc
         relabel_configs:
           - source_labels:
               - __address__
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index cc1f7a96..e11d6f77 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -1,9 +1,9 @@
 {{ ansible_header | comment }}
 {# As this is also using brackets it will conflict without a raw block #}
 {% raw %}
-# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
+# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
 # We remove descriptions as we only send summary on IRC.
-# UPS, APT and RADIUS configuration is made by Crans.
+# UPS, APT and printer configuration are made by Crans.
 
 groups:
 - name: alert.rules
@@ -151,7 +151,7 @@ groups:
       summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
 
   - alert: HostOomKillDetected
-    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    expr: increase(node_vmstat_oom_kill[5m]) > 0
     for: 0m
     labels:
       severity: warning
@@ -174,6 +174,14 @@ groups:
     annotations:
       summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
 
+  - alert: HostNetworkBondDegraded
+    expr: (node_bonding_active - node_bonding_slaves) != 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+
   # This happend in June 2021 at Crans
   - alert: HostConntrackLimit
     expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
@@ -183,6 +191,30 @@ groups:
     annotations:
       summary: Host conntrack limit (instance {{ $labels.instance }})
 
+  - alert: HostClockSkew
+    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host clock skew (instance {{ $labels.instance }})
+
+  - alert: HostClockNotSynchronising
+    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host clock not synchronising (instance {{ $labels.instance }})
+
+  - alert: HostRequiresReboot
+    expr: node_reboot_required > 0
+    for: 4h
+    labels:
+      severity: info
+    annotations:
+      summary: Host requires reboot (instance {{ $labels.instance }})
+
   ############
   # Blackbox #
   ############
@@ -203,82 +235,92 @@ groups:
     annotations:
       summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
 
+  ##############
+  # PostgreSQL #
+  ##############
+
+  - alert: PostgresqlDown
+    expr: pg_up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Postgresql down (instance {{ $labels.instance }})
+
+  - alert: PostgresqlTableNotAutoVacuumed
+    expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
+
+  - alert: PostgresqlTableNotAutoAnalyzed
+    expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
+
+  - alert: PostgresqlTooManyConnections
+    expr: sum by (instance, job, server) (pg_stat_activity_count{datname!~"template.*|postgres"}) > min by (instance, job, server) (pg_settings_max_connections * 0.8)
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql too many connections (instance {{ $labels.instance }})
+
+  - alert: PostgresqlNotEnoughConnections
+    expr: sum by (instance, datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql not enough connections (instance {{ $labels.instance }})
+
+  - alert: PostgresqlDeadLocks
+    expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql dead locks (instance {{ $labels.instance }})
+
+  - alert: PostgresqlHighRollbackRate
+    expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Postgresql high rollback rate (instance {{ $labels.instance }})
+
+
+  ########
+  # Bird #
+  ########
+
+  # Check BGP routes
+  - alert: BGPRoutesMissing
+    expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Pas de route BGP importée depuis {{ $labels.name }}
+
   #######
   # UPS #
   #######
 
   # Check UPS
-  - alert: UpsOutputSourceChanged
-    expr: upsOutputSource != 3
-    for: 5m
+  - alert: UpsTooHighPower
+    expr: sum(rPDUIdentDevicePowerWatts) > 2000
+    for: 3m
     labels:
       severity: warning
     annotations:
-      summary: La source d'alimentation de {{ $labels.instance }} a changé !
-
-  - alert: UpsBatteryStatusChanged
-    expr: upsBatteryStatus != 2
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: L'état de la batterie de {{ $labels.instance }} a changé !
-
-  - alert: UpsTemperatureWarning
-    expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
-
-  - alert: UpsTemperatureCritical
-    expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
-
-  - alert: UpsHighHumidity
-    expr: xupsEnvRemoteHumidity > 65
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
-
-  - alert: UpsVeryHighHumidity
-    expr: xupsEnvRemoteHumidity > 85
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
-
-  - alert: UpsHighLoad
-    expr: upsOutputPercentLoad > 70
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: La charge de {{ $labels.instance }} est de {{ $value }}%
-
-  - alert: UpsWrongInputVoltage
-    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
-
-  - alert: UpsWrongOutputVoltage
-    expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
+      summary: La puissance totale tirée est trop grande ({{ $value }} W)
 
   #######
   # iLO #