parent
98228407a6
commit
b5a17c57da
|
@ -219,7 +219,7 @@ loc_prometheus:
|
|||
metrics_path: '/snmp'
|
||||
params:
|
||||
module:
|
||||
- eatonups
|
||||
- apc
|
||||
relabel_configs:
|
||||
- source_labels:
|
||||
- __address__
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
{{ ansible_header | comment }}
|
||||
{# As this is also using brackets it will conflict without a raw block #}
|
||||
{% raw %}
|
||||
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
|
||||
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
|
||||
# We remove descriptions as we only send summary on IRC.
|
||||
# UPS, APT and RADIUS configuration is made by Crans.
|
||||
# UPS, APT and printer configuration are made by Crans.
|
||||
|
||||
groups:
|
||||
- name: alert.rules
|
||||
|
@ -151,7 +151,7 @@ groups:
|
|||
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||
expr: increase(node_vmstat_oom_kill[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -174,6 +174,14 @@ groups:
|
|||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: (node_bonding_active - node_bonding_slaves) != 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
|
||||
# This happened in June 2021 at Crans
|
||||
- alert: HostConntrackLimit
|
||||
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
|
||||
|
@ -183,6 +191,30 @@ groups:
|
|||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: node_reboot_required > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
|
||||
############
|
||||
# Blackbox #
|
||||
############
|
||||
|
@ -203,82 +235,92 @@ groups:
|
|||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
|
||||
##############
|
||||
# PostgreSQL #
|
||||
##############
|
||||
|
||||
- alert: PostgresqlDown
|
||||
expr: pg_up == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql down (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlTableNotAutoVacuumed
|
||||
expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlTableNotAutoAnalyzed
|
||||
expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlTooManyConnections
|
||||
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql too many connections (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlNotEnoughConnections
|
||||
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlHighRollbackRate
|
||||
expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
|
||||
|
||||
|
||||
########
|
||||
# Bird #
|
||||
########
|
||||
|
||||
# Check BGP routes
|
||||
- alert: BGPRoutesMissing
|
||||
expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Pas de route BGP importée depuis {{ $labels.name }}
|
||||
|
||||
#######
|
||||
# UPS #
|
||||
#######
|
||||
|
||||
# Check UPS
|
||||
- alert: UpsOutputSourceChanged
|
||||
expr: upsOutputSource != 3
|
||||
for: 5m
|
||||
- alert: UpsTooHighPower
|
||||
expr: sum(rPDUIdentDevicePowerWatts) > 2000
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La source d'alimentation de {{ $labels.instance }} a changé !
|
||||
|
||||
- alert: UpsBatteryStatusChanged
|
||||
expr: upsBatteryStatus != 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: L'état de la batterie de {{ $labels.instance }} a changé !
|
||||
|
||||
- alert: UpsTemperatureWarning
|
||||
expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
|
||||
|
||||
- alert: UpsTemperatureCritical
|
||||
expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
|
||||
|
||||
- alert: UpsHighHumidity
|
||||
expr: xupsEnvRemoteHumidity > 65
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
|
||||
|
||||
- alert: UpsVeryHighHumidity
|
||||
expr: xupsEnvRemoteHumidity > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
|
||||
|
||||
- alert: UpsHighLoad
|
||||
expr: upsOutputPercentLoad > 70
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: La charge de {{ $labels.instance }} est de {{ $value }}%
|
||||
|
||||
- alert: UpsWrongInputVoltage
|
||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
|
||||
|
||||
- alert: UpsWrongOutputVoltage
|
||||
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
|
||||
summary: La puissance totale tirée est trop grande ({{ $labels.rPDUIdentDevicePowerWatts }} W)
|
||||
|
||||
#######
|
||||
# iLO #
|
||||
|
|
Loading…
Reference in New Issue