{{ ansible_header | comment }}
{# As this file also uses curly braces, it would conflict with Jinja2 without a raw block #}
{% raw %}
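
# Note: everything inside this raw block uses Prometheus/Alertmanager template
# syntax ({{ $labels.instance }}, {{ $value }}), which shares its {{ }} delimiters
# with Jinja2; the raw block keeps Ansible from trying to expand them.
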
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
# We remove descriptions as we only send the summary to IRC.
# UPS, APT and printer configuration are made by Crans.

groups:
- name: alert.rules
  rules:

  ##############################
  # Prometheus self-monitoring #
  ##############################

  - alert: PrometheusJobMissing
    expr: absent(up{job="prometheus"})
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus job missing (instance {{ $labels.instance }})

  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} ({{ $labels.job }}) is missing"

  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
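
  # Fires when Prometheus, Pushgateway or Alertmanager restarts more than twice
  # within 15 minutes (changes() counts jumps in process_start_time_seconds).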
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus too many restarts (instance {{ $labels.instance }})

  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})

  - alert: PrometheusTargetEmpty
    expr: prometheus_sd_discovered_targets == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus target empty (instance {{ $labels.instance }})

  # This already happened in 2021 at Crans
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})

  #####################
  # Host and hardware #
  #####################

  # Alert for out of memory
  # Do not take into account memory not used by apps
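  # Roughly, the sum below counts memory that is free or held by caches and the
  # kernel rather than by applications; the alert fires when that drops below
  # 10 % of total RAM.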
  - alert: HostOutOfMemory
    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: RAM on {{ $labels.instance }} is reaching saturation ({{ $value }}%)

  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} is reaching saturation ({{ $value }}%)"
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Almost no inodes left ({{ $value }}% remaining) in {{ $labels.mountpoint }} on {{ $labels.instance }}

  - alert: HostHighCpuLoad
    expr: node_load5 > 9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Load on {{ $labels.instance }} is at {{ $value }}

  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.name }} has crashed on {{ $labels.instance }}"

  # 0B is so hot
  # In practice it is bad to run disks this hot
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: The RAID array on {{ $labels.instance }} has lost {{ $value }} disk(s)

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})

  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
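
  # Fires when the number of active members of a bonding interface differs from
  # the number of configured slaves, i.e. the bond is running degraded.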
  - alert: HostNetworkBondDegraded
    expr: (node_bonding_active - node_bonding_slaves) != 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Bond Degraded (instance {{ $labels.instance }})

  # This happened in June 2021 at Crans
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
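
  # Fires when the kernel clock offset is more than 50 ms away from zero and
  # deriv() shows it is not converging back.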
  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})

  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})

  - alert: HostRequiresReboot
    expr: node_reboot_required > 0
    for: 4h
    labels:
      severity: info
    annotations:
      summary: Host requires reboot (instance {{ $labels.instance }})

  ############
  # Blackbox #
  ############

  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Blackbox probe failed (instance {{ $labels.instance }})
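
  # 86400 s = 1 day, so this fires once a probed certificate has less than
  # 20 days of validity left.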
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})

  ##############
  # PostgreSQL #
  ##############

  - alert: PostgresqlDown
    expr: pg_up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Postgresql down (instance {{ $labels.instance }})
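
  # 60 * 60 * 24 * 10 = 10 days: fires when a user table that has been autovacuumed
  # before has not been autovacuumed for more than 10 days (the autoanalyze alert
  # below uses the same threshold).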
  - alert: PostgresqlTableNotAutoVacuumed
    expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})

  - alert: PostgresqlTableNotAutoAnalyzed
    expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
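
  # Fires when the connections to a single database (templates and the postgres
  # maintenance database excluded) exceed 80 % of max_connections.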
  - alert: PostgresqlTooManyConnections
    expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Postgresql too many connections (instance {{ $labels.instance }})

  - alert: PostgresqlDeadLocks
    expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql dead locks (instance {{ $labels.instance }})

  - alert: PostgresqlHighRollbackRate
    expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql high rollback rate (database {{ $labels.datname }}, instance {{ $labels.instance }})

  ########
  # Bird #
  ########

  # Check BGP routes
  - alert: BGPRoutesMissing
    expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: No BGP route imported from {{ $labels.name }}

  #######
  # UPS #
  #######

  # Check UPS
  - alert: UpsTooHighPower
    expr: sum(rPDUIdentDevicePowerWatts) > 2000
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: Total power draw is too high ({{ $value }} W)

  #######
  # iLO #
  #######

  - alert: IloResilientMemoryDegraded
    expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: >-
        RAM is no longer resilient
        ({{ $labels.cpqHeResilientMemCondition }}) on {{ $labels.instance }}

  - alert: IloBiosSelfTestDegraded
    expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        An error was detected during the server POST
        ({{ $labels.cpqHeHWBiosCondition }}) on {{ $labels.instance }}

  - alert: IloBatteryDegraded
    expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: >-
        The battery is degraded
        ({{ $labels.cpqHeSysBatteryCondition }}) on {{ $labels.instance }}

  - alert: IloTemperatureSensorDegraded
    expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The temperature sensor is degraded
        ({{ $labels.cpqHeTemperatureCondition }}) on {{ $labels.instance }}

  - alert: IloFanDegraded
    expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The fan is degraded
        ({{ $labels.cpqHeFltTolFanCondition }}) on {{ $labels.instance }}

  - alert: IloPowerSupplyDegraded
    expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The power supply is degraded
        ({{ $labels.cpqHeFltTolPowerSupplyStatus }}) on {{ $labels.instance }}

  - alert: IloOverrideSwitchState
    expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The security override switch is not in its default position,
        authentication is bypassed on {{ $labels.instance }}

  ###########
  # Printer #
  ###########

  - alert: PrinterWarning
    expr: deviceAlertDescription >= 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        Printer {{ $labels.instance }} has an error message: {{ $labels.deviceAlertDescription }}

  - alert: PrinterDoorOpen
    expr: prtCoverStatus{prtCoverStatus="coverClosed"} != 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        Door #{{ $labels.prtCoverIndex }} is open on {{ $labels.instance }}

  - alert: PrinterTonerLow
    expr: prtMarkerSuppliesLevel < 1000
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        Toner #{{ $labels.prtMarkerSuppliesIndex }} on {{ $labels.instance }} is low ({{ $value }} sheets remaining)

  - alert: PrinterTonerMissing
    expr: prtMarkerSuppliesLevel == 0
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Toner #{{ $labels.prtMarkerSuppliesIndex }} on {{ $labels.instance }} is empty or missing

  - alert: PrinterPaperJam
    expr: prtgenStatusPaperJam != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Paper jam in printer {{ $labels.instance }}

  - alert: PrinterPaperEmpty
    expr: prtgenStatusInputEmpty != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Printer {{ $labels.instance }} is out of paper

  - alert: PrinterCoverOpen
    expr: prtgenStatusCoverOpen != 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        The cover of printer {{ $labels.instance }} is open

  - alert: PrinterPaperTonerError
    expr: prtgenStatusTonerError != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        There is a toner problem in printer {{ $labels.instance }}

  - alert: PrinterSystemError
    expr: prtgenStatusSrvcReqd != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Printer {{ $labels.instance }} requires servicing

  #########
  # Other #
  #########

  - alert: AptAutoremovePending
    expr: apt_autoremove_pending > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} APT package(s) are no longer needed on {{ $labels.instance }}"

  - alert: AptObsolete
    expr: apt_obsolete > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} APT package(s) are orphaned on {{ $labels.instance }}"

  - alert: MailqNotEmpty
    expr: sum by(instance) (postfix_showq_message_size_bytes_count) > 30
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} mails in the mail queue on {{ $labels.instance }}"

  - alert: NoRadiusLogin
    expr: rate(radiusd_access_ok[3m]) == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: No one is authenticating against the RADIUS server

  - alert: TooManyReallocatedSectors
    expr: smartmon_reallocated_sector_ct_raw_value > 1e3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.disk }} on {{ $labels.instance }} has {{ $value }} reallocated sectors"
{% endraw %}