[Prometheus] Add new alerts

Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>
alerts
Yohann D'ANELLO 2022-08-09 16:17:30 +02:00
parent 98228407a6
commit b5a17c57da
Signed by: _ynerant
GPG Key ID: 3A75C55819C8CF85
2 changed files with 114 additions and 72 deletions

View File

@ -219,7 +219,7 @@ loc_prometheus:
metrics_path: '/snmp' metrics_path: '/snmp'
params: params:
module: module:
- eatonups - apc
relabel_configs: relabel_configs:
- source_labels: - source_labels:
- __address__ - __address__

View File

@ -1,9 +1,9 @@
{{ ansible_header | comment }} {{ ansible_header | comment }}
{# As this is also using brackets it will conflict without a raw block #} {# As this is also using brackets it will conflict without a raw block #}
{% raw %} {% raw %}
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07 # Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
# We remove descriptions as we only send summary on IRC. # We remove descriptions as we only send summary on IRC.
# UPS, APT and RADIUS configuration is made by Crans. # UPS, APT and printer configuration are made by Crans.
groups: groups:
- name: alert.rules - name: alert.rules
@ -151,7 +151,7 @@ groups:
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s) summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
- alert: HostOomKillDetected - alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0 expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -174,6 +174,14 @@ groups:
annotations: annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
- alert: HostNetworkBondDegraded
expr: (node_bonding_active - node_bonding_slaves) != 0
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
# This happened in June 2021 at Crans # This happened in June 2021 at Crans
- alert: HostConntrackLimit - alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
@ -183,6 +191,30 @@ groups:
annotations: annotations:
summary: Host conntrack limit (instance {{ $labels.instance }}) summary: Host conntrack limit (instance {{ $labels.instance }})
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
- alert: HostClockNotSynchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
- alert: HostRequiresReboot
expr: node_reboot_required > 0
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
############ ############
# Blackbox # # Blackbox #
############ ############
@ -203,82 +235,92 @@ groups:
annotations: annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
##############
# PostgreSQL #
##############
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
- alert: PostgresqlTableNotAutoVacuumed
expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
- alert: PostgresqlTableNotAutoAnalyzed
expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
# Fires when the connections summed per exporter instance stay above 80% of
# max_connections for 2 minutes. Both sides of the comparison are aggregated by
# `instance` so that their label sets match (a `sum by (datname)` on the left
# never matches the labels of pg_settings_max_connections) and so that the
# instance label printed in the summary is populated.
- alert: PostgresqlTooManyConnections
  expr: sum by (instance) (pg_stat_activity_count{datname!~"template.*|postgres"}) > min by (instance) (pg_settings_max_connections) * 0.8
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Postgresql too many connections (instance {{ $labels.instance }})
# Fires when a database (template and postgres databases excluded) keeps fewer
# than 5 connections for 2 minutes. The aggregation keeps `instance` next to
# `datname` so that the instance label printed in the summary is not empty.
- alert: PostgresqlNotEnoughConnections
  expr: sum by (instance, datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Postgresql not enough connections (instance {{ $labels.instance }})
- alert: PostgresqlDeadLocks
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
- alert: PostgresqlHighRollbackRate
expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
########
# Bird #
########
# Check BGP routes
- alert: BGPRoutesMissing
expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
for: 5m
labels:
severity: warning
annotations:
summary: Pas de route BGP importée depuis {{ $labels.name }}
####### #######
# UPS # # UPS #
####### #######
# Check UPS # Check UPS
# sum() without a `by` clause drops every label, so the summary of the new rule prints the summed power through $value.
- alert: UpsOutputSourceChanged - alert: UpsTooHighPower
expr: upsOutputSource != 3 expr: sum(rPDUIdentDevicePowerWatts) > 2000
for: 5m for: 3m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: La source d'alimentation de {{ $labels.instance }} a changé ! summary: La puissance totale tirée est trop grande ({{ $value }} W)
- alert: UpsBatteryStatusChanged
expr: upsBatteryStatus != 2
for: 5m
labels:
severity: warning
annotations:
summary: L'état de la batterie de {{ $labels.instance }} a changé !
- alert: UpsTemperatureWarning
expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
for: 5m
labels:
severity: warning
annotations:
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
- alert: UpsTemperatureCritical
expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
for: 5m
labels:
severity: critical
annotations:
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
- alert: UpsHighHumidity
expr: xupsEnvRemoteHumidity > 65
for: 5m
labels:
severity: warning
annotations:
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
- alert: UpsVeryHighHumidity
expr: xupsEnvRemoteHumidity > 85
for: 5m
labels:
severity: critical
annotations:
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 5m
labels:
severity: critical
annotations:
summary: La charge de {{ $labels.instance }} est de {{ $value }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
- alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
for: 5m
labels:
severity: warning
annotations:
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
####### #######
# iLO # # iLO #