{{ ansible_header | comment }}
{# As this file also uses curly braces, it would conflict with Jinja2 without a raw block #}
{% raw %}
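
# Note: everything inside this raw block uses Prometheus/Alertmanager template
# syntax ({{ $labels.instance }}, {{ $value }}), which shares its {{ }} delimiters
# with Jinja2; the raw block keeps Ansible from trying to expand them.
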
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
# We remove descriptions as we only send the summary to IRC.
# UPS, APT and printer configuration are made by Crans.

groups:
- name: alert.rules
  rules:

  ##############################
  # Prometheus self-monitoring #
  ##############################

  - alert: PrometheusJobMissing
    expr: absent(up{job="prometheus"})
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus job missing (instance {{ $labels.instance }})

  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} ({{ $labels.job }}) is missing"

  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
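
  # Fires when Prometheus, Pushgateway or Alertmanager restarts more than twice
  # within 15 minutes (changes() counts jumps in process_start_time_seconds).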
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Prometheus too many restarts (instance {{ $labels.instance }})

  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})

  - alert: PrometheusTargetEmpty
    expr: prometheus_sd_discovered_targets == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus target empty (instance {{ $labels.instance }})

  # This already happened in 2021 at Crans
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})

  #####################
  # Host and hardware #
  #####################

  # Alert for out of memory
  # Do not take into account memory not used by apps
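  # Roughly, the sum below counts memory that is free or held by caches and the
  # kernel rather than by applications; the alert fires when that drops below
  # 10 % of total RAM.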
  - alert: HostOutOfMemory
    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: RAM on {{ $labels.instance }} is reaching saturation ({{ $value }}%)

  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.mountpoint }} on {{ $labels.instance }} is reaching saturation ({{ $value }}%)"
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Almost no inodes left ({{ $value }}% remaining) in {{ $labels.mountpoint }} on {{ $labels.instance }}

  - alert: HostHighCpuLoad
    expr: node_load5 > 9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Load on {{ $labels.instance }} is at {{ $value }}

  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.name }} has crashed on {{ $labels.instance }}"

  # 0B is so hot
  # In practice it is bad to run disks this hot
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: The RAID array on {{ $labels.instance }} has lost {{ $value }} disk(s)

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})

  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
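
  # Fires when the number of active members of a bonding interface differs from
  # the number of configured slaves, i.e. the bond is running degraded.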
  - alert: HostNetworkBondDegraded
    expr: (node_bonding_active - node_bonding_slaves) != 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Bond Degraded (instance {{ $labels.instance }})

  # This happened in June 2021 at Crans
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
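
  # Fires when the kernel clock offset is more than 50 ms away from zero and
  # deriv() shows it is not converging back.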
  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})

  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})

  - alert: HostRequiresReboot
    expr: node_reboot_required > 0
    for: 4h
    labels:
      severity: info
    annotations:
      summary: Host requires reboot (instance {{ $labels.instance }})

  ############
  # Blackbox #
  ############

  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Blackbox probe failed (instance {{ $labels.instance }})
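
  # 86400 s = 1 day, so this fires once a probed certificate has less than
  # 20 days of validity left.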
  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})

  ##############
  # PostgreSQL #
  ##############

  - alert: PostgresqlDown
    expr: pg_up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Postgresql down (instance {{ $labels.instance }})
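
  # 60 * 60 * 24 * 10 = 10 days: fires when a user table that has been autovacuumed
  # before has not been autovacuumed for more than 10 days (the autoanalyze alert
  # below uses the same threshold).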
  - alert: PostgresqlTableNotAutoVacuumed
    expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})

  - alert: PostgresqlTableNotAutoAnalyzed
    expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
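
  # Fires when the connections to a single database (templates and the postgres
  # maintenance database excluded) exceed 80 % of max_connections.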
  - alert: PostgresqlTooManyConnections
    expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Postgresql too many connections (instance {{ $labels.instance }})

  - alert: PostgresqlDeadLocks
    expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql dead locks (instance {{ $labels.instance }})

  - alert: PostgresqlHighRollbackRate
    expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Postgresql high rollback rate (database {{ $labels.datname }}, instance {{ $labels.instance }})

  ########
  # Bird #
  ########

  # Check BGP routes
  - alert: BGPRoutesMissing
    expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: No BGP route imported from {{ $labels.name }}

  #######
  # UPS #
  #######

  # Check UPS
  - alert: UpsTooHighPower
    expr: sum(rPDUIdentDevicePowerWatts) > 2000
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: Total power draw is too high ({{ $value }} W)

  #######
  # iLO #
  #######

  - alert: IloResilientMemoryDegraded
    expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: >-
        RAM is no longer resilient
        ({{ $labels.cpqHeResilientMemCondition }}) on {{ $labels.instance }}

  - alert: IloBiosSelfTestDegraded
    expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        An error was detected during the server POST
        ({{ $labels.cpqHeHWBiosCondition }}) on {{ $labels.instance }}

  - alert: IloBatteryDegraded
    expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: >-
        The battery is degraded
        ({{ $labels.cpqHeSysBatteryCondition }}) on {{ $labels.instance }}

  - alert: IloTemperatureSensorDegraded
    expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The temperature sensor is degraded
        ({{ $labels.cpqHeTemperatureCondition }}) on {{ $labels.instance }}

  - alert: IloFanDegraded
    expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The fan is degraded
        ({{ $labels.cpqHeFltTolFanCondition }}) on {{ $labels.instance }}

  - alert: IloPowerSupplyDegraded
    expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The power supply is degraded
        ({{ $labels.cpqHeFltTolPowerSupplyStatus }}) on {{ $labels.instance }}

  - alert: IloOverrideSwitchState
    expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        The security override switch is not in its default position,
        authentication is bypassed on {{ $labels.instance }}

  ###########
  # Printer #
  ###########

  - alert: PrinterWarning
    expr: deviceAlertDescription >= 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        Printer {{ $labels.instance }} has an error message: {{ $labels.deviceAlertDescription }}

  - alert: PrinterDoorOpen
    expr: prtCoverStatus{prtCoverStatus="coverClosed"} != 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        Door #{{ $labels.prtCoverIndex }} is open on {{ $labels.instance }}

  - alert: PrinterTonerLow
    expr: prtMarkerSuppliesLevel < 1000
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        Toner #{{ $labels.prtMarkerSuppliesIndex }} on {{ $labels.instance }} is low ({{ $value }} sheets remaining)

  - alert: PrinterTonerMissing
    expr: prtMarkerSuppliesLevel == 0
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Toner #{{ $labels.prtMarkerSuppliesIndex }} on {{ $labels.instance }} is empty or missing

  - alert: PrinterPaperJam
    expr: prtgenStatusPaperJam != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Paper jam in printer {{ $labels.instance }}

  - alert: PrinterPaperEmpty
    expr: prtgenStatusInputEmpty != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Printer {{ $labels.instance }} is out of paper

  - alert: PrinterCoverOpen
    expr: prtgenStatusCoverOpen != 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: >-
        The cover of printer {{ $labels.instance }} is open

  - alert: PrinterPaperTonerError
    expr: prtgenStatusTonerError != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        There is a toner problem in printer {{ $labels.instance }}

  - alert: PrinterSystemError
    expr: prtgenStatusSrvcReqd != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: >-
        Printer {{ $labels.instance }} requires servicing

  #########
  # Other #
  #########

  - alert: AptAutoremovePending
    expr: apt_autoremove_pending > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} APT package(s) are no longer needed on {{ $labels.instance }}"

  - alert: AptObsolete
    expr: apt_obsolete > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} APT package(s) are orphaned on {{ $labels.instance }}"

  - alert: MailqNotEmpty
    expr: sum by(instance) (postfix_showq_message_size_bytes_count) > 30
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} mails in the mail queue on {{ $labels.instance }}"

  - alert: NoRadiusLogin
    expr: rate(radiusd_access_ok[3m]) == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: No one is authenticating against the RADIUS server

  - alert: TooManyReallocatedSectors
    expr: smartmon_reallocated_sector_ct_raw_value > 1e3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.disk }} on {{ $labels.instance }} has {{ $value }} reallocated sectors"
{% endraw %}