From 0bd8604717252fae980a12b83cd7aca467985820 Mon Sep 17 00:00:00 2001 From: shirenn Date: Wed, 23 Nov 2022 16:35:30 +0100 Subject: [PATCH] [prometheus] Synchronizing configuration --- group_vars/prometheus.yml | 2 +- host_vars/fyre.adm.crans.org.yml | 92 +++++++- .../templates/prometheus/alert.rules.yml.j2 | 196 ++++++++++-------- 3 files changed, 198 insertions(+), 92 deletions(-) diff --git a/group_vars/prometheus.yml b/group_vars/prometheus.yml index ec112bf2..85f8bee5 100644 --- a/group_vars/prometheus.yml +++ b/group_vars/prometheus.yml @@ -13,7 +13,7 @@ glob_service_prometheus_target: options: "" config: ldap: - server: "ldaps://{{ query('ldap', 'ip4', 'ldap-adh', 'adm') }}" + server: "ldaps://{{ query('ldap', 'ip4', 'ldap-adm', 'adm') }}" glob_ninjabot: config: diff --git a/host_vars/fyre.adm.crans.org.yml b/host_vars/fyre.adm.crans.org.yml index 745f01ff..c71e5b34 100644 --- a/host_vars/fyre.adm.crans.org.yml +++ b/host_vars/fyre.adm.crans.org.yml @@ -9,7 +9,7 @@ loc_prometheus: - job_name: servers file_sd_configs: - files: - - '/etc/prometheus/targets_node.json' + - '/etc/prometheus/targets/node.json' relabel_configs: - source_labels: [__address__] target_label: __param_target @@ -24,7 +24,19 @@ loc_prometheus: - job_name: nginx file_sd_configs: - files: - - '/etc/prometheus/targets_nginx.json' + - '/etc/prometheus/targets/nginx.json' + relabel_configs: + - source_labels: [__address__] + target_label: instance + - source_labels: [instance] + target_label: __address__ + replacement: '$1:9117' + + apache: + config: + - job_name: apache + file_sd_configs: + - files: ['/etc/prometheus/targets/apache.json'] relabel_configs: - source_labels: [__address__] target_label: instance @@ -33,7 +45,7 @@ loc_prometheus: replacement: '$1:9117' blackbox: - file: targets_blackbox.json + file: targets/blackbox.json targets: - https://crans.org/ - https://www.crans.org/ @@ -60,7 +72,7 @@ loc_prometheus: - job_name: blackbox file_sd_configs: - files: - - 
'/etc/prometheus/targets_blackbox.json' + - '/etc/prometheus/targets/blackbox.json' metrics_path: /probe params: module: [http_2xx] # Look for a HTTP 200 response. @@ -77,7 +89,7 @@ loc_prometheus: - job_name: blackbox_icmp file_sd_configs: - files: - - '/etc/prometheus/targets_icmp.json' + - '/etc/prometheus/targets/icmp.json' metrics_path: /probe params: module: [icmp] # Look for a ICMP ping @@ -94,7 +106,7 @@ loc_prometheus: - job_name: bird file_sd_configs: - files: - - '/etc/prometheus/targets_bird.json' + - '/etc/prometheus/targets/bird.json' relabel_configs: - source_labels: [__address__] target_label: __param_target @@ -104,12 +116,72 @@ loc_prometheus: target_label: __address__ replacement: '$1:9324' + bind: + config: + - job_name: bind + file_sd_configs: + - files: + - '/etc/prometheus/targets/bind.json' + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9119' + + postfix: + config: + - job_name: postfix + file_sd_configs: + - files: + - '/etc/prometheus/targets/postfix.json' + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9154' + + postgres: + config: + - job_name: postgres + file_sd_configs: + - files: + - '/etc/prometheus/targets/postgres.json' + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9187' + + mysql: + config: + - job_name: mysql + file_sd_configs: + - files: + - '/etc/prometheus/targets/mysql.json' + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + 
target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9104' + mtail: config: - job_name: mtail file_sd_configs: - files: - - '/etc/prometheus/targets_mtail.json' + - '/etc/prometheus/targets/mtail.json' relabel_configs: - source_labels: [__address__] target_label: __param_target @@ -124,7 +196,7 @@ loc_prometheus: - job_name: ilo_snmp file_sd_configs: - files: - - '/etc/prometheus/targets_ilo_snmp.json' + - '/etc/prometheus/targets/ilo_snmp.json' metrics_path: '/snmp' params: module: @@ -142,11 +214,11 @@ loc_prometheus: - job_name: ups_snmp file_sd_configs: - files: - - '/etc/prometheus/targets_ups_snmp.json' + - '/etc/prometheus/targets/ups_snmp.json' metrics_path: '/snmp' params: module: - - eatonups + - apc relabel_configs: - source_labels: - __address__ diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index cc1f7a96..529c9bd8 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -1,9 +1,9 @@ {{ ansible_header | comment }} {# As this is also using brackets it will conflict without a raw block #} {% raw %} -# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07 +# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09 # We remove descriptions as we only send summary on IRC. -# UPS, APT and RADIUS configuration is made by Crans. +# UPS, APT and printer configuration are made by Crans. 
groups: - name: alert.rules @@ -151,7 +151,7 @@ groups: summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s) - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[1m]) > 0 + expr: increase(node_vmstat_oom_kill[5m]) > 0 for: 0m labels: severity: warning @@ -174,6 +174,14 @@ groups: annotations: summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + - alert: HostNetworkBondDegraded + expr: (node_bonding_active - node_bonding_slaves) != 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + # This happend in June 2021 at Crans - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 @@ -183,6 +191,30 @@ groups: annotations: summary: Host conntrack limit (instance {{ $labels.instance }}) + - alert: HostClockSkew + expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + + - alert: HostClockNotSynchronising + expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + + - alert: HostRequiresReboot + expr: node_reboot_required > 0 + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + ############ # Blackbox # ############ @@ -203,82 +235,84 @@ groups: annotations: summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + ############## + # PostgreSQL # + ############## + + - alert: PostgresqlDown + expr: pg_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ 
$labels.instance }}) + + - alert: PostgresqlTableNotAutoVacuumed + expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }}) + + - alert: PostgresqlTableNotAutoAnalyzed + expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto analyzed (instance {{ $labels.instance }}) + + - alert: PostgresqlTooManyConnections + expr: sum by (instance) (pg_stat_activity_count{datname!~"template.*|postgres"}) > min by (instance) (pg_settings_max_connections) * 0.8 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{ $labels.instance }}) + + - alert: PostgresqlDeadLocks + expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{ $labels.instance }}) + + - alert: PostgresqlHighRollbackRate + expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.2 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql high rollback rate (database {{ $labels.datname }}, instance {{ $labels.instance }}) + + + ######## + # Bird # + ######## + + # Check BGP routes + - alert: BGPRoutesMissing + expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5 + for: 5m + labels: + severity: warning + annotations: + summary: Pas de route BGP importée depuis {{ $labels.name }} + ####### # UPS # ####### # Check UPS - - alert: UpsOutputSourceChanged - expr: upsOutputSource != 3 - for: 5m + - alert: UpsTooHighPower + expr: sum(rPDUIdentDevicePowerWatts) > 2000 + for: 3m labels: severity: 
warning annotations: - summary: La source d'alimentation de {{ $labels.instance }} a changé ! - - - alert: UpsBatteryStatusChanged - expr: upsBatteryStatus != 2 - for: 5m - labels: - severity: warning - annotations: - summary: L'état de la batterie de {{ $labels.instance }} a changé ! - - - alert: UpsTemperatureWarning - expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26) - for: 5m - labels: - severity: warning - annotations: - summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C - - - alert: UpsTemperatureCritical - expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30) - for: 5m - labels: - severity: critical - annotations: - summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C - - - alert: UpsHighHumidity - expr: xupsEnvRemoteHumidity > 65 - for: 5m - labels: - severity: warning - annotations: - summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}% - - - alert: UpsVeryHighHumidity - expr: xupsEnvRemoteHumidity > 85 - for: 5m - labels: - severity: critical - annotations: - summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}% - - - alert: UpsHighLoad - expr: upsOutputPercentLoad > 70 - for: 5m - labels: - severity: critical - annotations: - summary: La charge de {{ $labels.instance }} est de {{ $value }}% - - - alert: UpsWrongInputVoltage - expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 5m - labels: - severity: warning - annotations: - summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V - - - alert: UpsWrongOutputVoltage - expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245) - for: 5m - labels: - severity: warning - annotations: - summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V + summary: La puissance totale tirée est trop grande ({{ $value }} W) ####### # iLO # @@ -360,7 +394,7 @@ groups: - alert: PrinterWarning expr: deviceAlertDescription >= 1 - for: 3m + 
for: 15m labels: severity: warning annotations: @@ -369,7 +403,7 @@ groups: - alert: PrinterDoorOpen expr: prtCoverStatus{prtCoverStatus="coverClosed"} != 1 - for: 3m + for: 15m labels: severity: warning annotations: @@ -378,7 +412,7 @@ groups: - alert: PrinterTonerLow expr: prtMarkerSuppliesLevel < 1000 - for: 3m + for: 15m labels: severity: warning annotations: @@ -387,7 +421,7 @@ groups: - alert: PrinterTonerMissing expr: prtMarkerSuppliesLevel == 0 - for: 3m + for: 15m labels: severity: critical annotations: @@ -396,7 +430,7 @@ groups: - alert: PrinterPaperJam expr: prtgenStatusPaperJam != 1 - for: 3m + for: 15m labels: severity: critical annotations: @@ -405,7 +439,7 @@ groups: - alert: PrinterPaperEmpty expr: prtgenStatusInputEmpty != 1 - for: 3m + for: 15m labels: severity: critical annotations: @@ -414,7 +448,7 @@ groups: - alert: PrinterCoverOpen expr: prtgenStatusCoverOpen != 1 - for: 3m + for: 15m labels: severity: warning annotations: @@ -423,7 +457,7 @@ groups: - alert: PrinterPaperTonerError expr: prtgenStatusTonerError != 1 - for: 3m + for: 15m labels: severity: critical annotations: @@ -432,7 +466,7 @@ groups: - alert: PrinterSystemError expr: prtgenStatusSrvcReqd != 1 - for: 3m + for: 15m labels: severity: critical annotations: @@ -461,7 +495,7 @@ groups: summary: "{{ $value }} paquet(s) APT sont orphelins sur {{ $labels.instance }}" - alert: MailqNotEmpty - expr: postfix_mailq_length > 25 + expr: sum by(instance) (postfix_showq_message_size_bytes_count) > 30 for: 1m labels: severity: warning