parent
98228407a6
commit
b5a17c57da
|
@ -219,7 +219,7 @@ loc_prometheus:
|
||||||
metrics_path: '/snmp'
|
metrics_path: '/snmp'
|
||||||
params:
|
params:
|
||||||
module:
|
module:
|
||||||
- eatonups
|
- apc
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
- source_labels:
|
- source_labels:
|
||||||
- __address__
|
- __address__
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
{{ ansible_header | comment }}
|
{{ ansible_header | comment }}
|
||||||
{# As this is also using brackets it will conflict without a raw block #}
|
{# As this is also using brackets it will conflict without a raw block #}
|
||||||
{% raw %}
|
{% raw %}
|
||||||
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
|
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
|
||||||
# We remove descriptions as we only send summary on IRC.
|
# We remove descriptions as we only send summary on IRC.
|
||||||
# UPS, APT and RADIUS configuration is made by Crans.
|
# UPS, APT and printer configuration are made by Crans.
|
||||||
|
|
||||||
groups:
|
groups:
|
||||||
- name: alert.rules
|
- name: alert.rules
|
||||||
|
@ -151,7 +151,7 @@ groups:
|
||||||
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
|
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
|
||||||
|
|
||||||
- alert: HostOomKillDetected
|
- alert: HostOomKillDetected
|
||||||
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
expr: increase(node_vmstat_oom_kill[5m]) > 0
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -174,6 +174,14 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: HostNetworkBondDegraded
|
||||||
|
expr: (node_bonding_active - node_bonding_slaves) != 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||||
|
|
||||||
# This happened in June 2021 at Crans
|
# This happened in June 2021 at Crans
|
||||||
- alert: HostConntrackLimit
|
- alert: HostConntrackLimit
|
||||||
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
|
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
|
||||||
|
@ -183,6 +191,30 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: HostClockSkew
|
||||||
|
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock skew (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: HostClockNotSynchronising
|
||||||
|
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: HostRequiresReboot
|
||||||
|
expr: node_reboot_required > 0
|
||||||
|
for: 4h
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||||
|
|
||||||
############
|
############
|
||||||
# Blackbox #
|
# Blackbox #
|
||||||
############
|
############
|
||||||
|
@ -203,82 +235,92 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
##############
|
||||||
|
# PostgreSQL #
|
||||||
|
##############
|
||||||
|
|
||||||
|
- alert: PostgresqlDown
|
||||||
|
expr: pg_up == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql down (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: PostgresqlTableNotAutoVacuumed
|
||||||
|
expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: PostgresqlTableNotAutoAnalyzed
|
||||||
|
expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: PostgresqlTooManyConnections
|
||||||
|
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql too many connections (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: PostgresqlNotEnoughConnections
|
||||||
|
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: PostgresqlDeadLocks
|
||||||
|
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
- alert: PostgresqlHighRollbackRate
|
||||||
|
expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
|
||||||
|
|
||||||
|
|
||||||
|
########
|
||||||
|
# Bird #
|
||||||
|
########
|
||||||
|
|
||||||
|
# Check BGP routes
|
||||||
|
- alert: BGPRoutesMissing
|
||||||
|
expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Pas de route BGP importée depuis {{ $labels.name }}
|
||||||
|
|
||||||
#######
|
#######
|
||||||
# UPS #
|
# UPS #
|
||||||
#######
|
#######
|
||||||
|
|
||||||
# Check UPS
|
# Check UPS
|
||||||
- alert: UpsOutputSourceChanged
|
- alert: UpsTooHighPower
|
||||||
expr: upsOutputSource != 3
|
expr: sum(rPDUIdentDevicePowerWatts) > 2000
|
||||||
for: 5m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: La source d'alimentation de {{ $labels.instance }} a changé !
|
summary: La puissance totale tirée est trop grande ({{ $labels.rPDUIdentDevicePowerWatts }} W)
|
||||||
|
|
||||||
- alert: UpsBatteryStatusChanged
|
|
||||||
expr: upsBatteryStatus != 2
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: L'état de la batterie de {{ $labels.instance }} a changé !
|
|
||||||
|
|
||||||
- alert: UpsTemperatureWarning
|
|
||||||
expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
|
|
||||||
|
|
||||||
- alert: UpsTemperatureCritical
|
|
||||||
expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
|
|
||||||
|
|
||||||
- alert: UpsHighHumidity
|
|
||||||
expr: xupsEnvRemoteHumidity > 65
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
|
|
||||||
|
|
||||||
- alert: UpsVeryHighHumidity
|
|
||||||
expr: xupsEnvRemoteHumidity > 85
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
|
|
||||||
|
|
||||||
- alert: UpsHighLoad
|
|
||||||
expr: upsOutputPercentLoad > 70
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: La charge de {{ $labels.instance }} est de {{ $value }}%
|
|
||||||
|
|
||||||
- alert: UpsWrongInputVoltage
|
|
||||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
|
|
||||||
|
|
||||||
- alert: UpsWrongOutputVoltage
|
|
||||||
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
|
|
||||||
|
|
||||||
#######
|
#######
|
||||||
# iLO #
|
# iLO #
|
||||||
|
|
Loading…
Reference in New Issue