[prometheus] Synchronizing configuration

main
_shirenn 2022-11-23 16:35:30 +01:00
parent a78146e506
commit 0bd8604717
3 changed files with 198 additions and 92 deletions

View File

@ -13,7 +13,7 @@ glob_service_prometheus_target:
options: ""
config:
ldap:
server: "ldaps://{{ query('ldap', 'ip4', 'ldap-adh', 'adm') }}"
server: "ldaps://{{ query('ldap', 'ip4', 'ldap-adm', 'adm') }}"
glob_ninjabot:
config:

View File

@ -9,7 +9,7 @@ loc_prometheus:
- job_name: servers
file_sd_configs:
- files:
- '/etc/prometheus/targets_node.json'
- '/etc/prometheus/targets/node.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
@ -24,7 +24,19 @@ loc_prometheus:
- job_name: nginx
file_sd_configs:
- files:
- '/etc/prometheus/targets_nginx.json'
- '/etc/prometheus/targets/nginx.json'
relabel_configs:
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
apache:
config:
- job_name: apache
file_sd_configs:
- files: ['/etc/prometheus/targets/apache.json']
relabel_configs:
- source_labels: [__address__]
target_label: instance
@ -33,7 +45,7 @@ loc_prometheus:
replacement: '$1:9117'
blackbox:
file: targets_blackbox.json
file: targets/blackbox.json
targets:
- https://crans.org/
- https://www.crans.org/
@ -60,7 +72,7 @@ loc_prometheus:
- job_name: blackbox
file_sd_configs:
- files:
- '/etc/prometheus/targets_blackbox.json'
- '/etc/prometheus/targets/blackbox.json'
metrics_path: /probe
params:
module: [http_2xx] # Look for an HTTP 200 response.
@ -77,7 +89,7 @@ loc_prometheus:
- job_name: blackbox_icmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_icmp.json'
- '/etc/prometheus/targets/icmp.json'
metrics_path: /probe
params:
module: [icmp] # Look for an ICMP ping
@ -94,7 +106,7 @@ loc_prometheus:
- job_name: bird
file_sd_configs:
- files:
- '/etc/prometheus/targets_bird.json'
- '/etc/prometheus/targets/bird.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
@ -104,12 +116,72 @@ loc_prometheus:
target_label: __address__
replacement: '$1:9324'
bind:
config:
- job_name: bind
file_sd_configs:
- files:
- '/etc/prometheus/targets/bind.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9119'
postfix:
config:
- job_name: postfix
file_sd_configs:
- files:
- '/etc/prometheus/targets/postfix.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9154'
postgres:
config:
- job_name: postgres
file_sd_configs:
- files:
- '/etc/prometheus/targets/postgres.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9187'
mysql:
config:
- job_name: mysql
file_sd_configs:
- files:
- '/etc/prometheus/targets/mysql.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9104'
mtail:
config:
- job_name: mtail
file_sd_configs:
- files:
- '/etc/prometheus/targets_mtail.json'
- '/etc/prometheus/targets/mtail.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
@ -124,7 +196,7 @@ loc_prometheus:
- job_name: ilo_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ilo_snmp.json'
- '/etc/prometheus/targets/ilo_snmp.json'
metrics_path: '/snmp'
params:
module:
@ -142,11 +214,11 @@ loc_prometheus:
- job_name: ups_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ups_snmp.json'
- '/etc/prometheus/targets/ups_snmp.json'
metrics_path: '/snmp'
params:
module:
- eatonups
- apc
relabel_configs:
- source_labels:
- __address__

View File

@ -1,9 +1,9 @@
{{ ansible_header | comment }}
{# As this is also using brackets it will conflict without a raw block #}
{% raw %}
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
# We remove descriptions as we only send summary on IRC.
# UPS, APT and RADIUS configuration is made by Crans.
# UPS, APT and printer configuration are made by Crans.
groups:
- name: alert.rules
@ -151,7 +151,7 @@ groups:
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 0m
labels:
severity: warning
@ -174,6 +174,14 @@ groups:
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
- alert: HostNetworkBondDegraded
expr: (node_bonding_active - node_bonding_slaves) != 0
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
# This happened in June 2021 at Crans
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
@ -183,6 +191,30 @@ groups:
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
- alert: HostClockNotSynchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
- alert: HostRequiresReboot
expr: node_reboot_required > 0
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
############
# Blackbox #
############
@ -203,82 +235,84 @@ groups:
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
##############
# PostgreSQL #
##############
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
- alert: PostgresqlTableNotAutoVacuumed
expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
- alert: PostgresqlTableNotAutoAnalyzed
expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
- alert: PostgresqlTooManyConnections
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
- alert: PostgresqlDeadLocks
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
- alert: PostgresqlHighRollbackRate
expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.2
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (database {{ $labels.datname }}, instance {{ $labels.instance }})
########
# Bird #
########
# Check BGP routes
- alert: BGPRoutesMissing
expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
for: 5m
labels:
severity: warning
annotations:
summary: Pas de route BGP importée depuis {{ $labels.name }}
#######
# UPS #
#######
# Check UPS
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 5m
- alert: UpsTooHighPower
expr: sum(rPDUIdentDevicePowerWatts) > 2000
for: 3m
labels:
severity: warning
annotations:
summary: La source d'alimentation de {{ $labels.instance }} a changé !
- alert: UpsBatteryStatusChanged
expr: upsBatteryStatus != 2
for: 5m
labels:
severity: warning
annotations:
summary: L'état de la batterie de {{ $labels.instance }} a changé !
- alert: UpsTemperatureWarning
expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
for: 5m
labels:
severity: warning
annotations:
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
- alert: UpsTemperatureCritical
expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
for: 5m
labels:
severity: critical
annotations:
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
- alert: UpsHighHumidity
expr: xupsEnvRemoteHumidity > 65
for: 5m
labels:
severity: warning
annotations:
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
- alert: UpsVeryHighHumidity
expr: xupsEnvRemoteHumidity > 85
for: 5m
labels:
severity: critical
annotations:
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 5m
labels:
severity: critical
annotations:
summary: La charge de {{ $labels.instance }} est de {{ $value }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
- alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
for: 5m
labels:
severity: warning
annotations:
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
summary: La puissance totale tirée est trop grande ({{ $labels.rPDUIdentDevicePowerWatts }} W)
#######
# iLO #
@ -360,7 +394,7 @@ groups:
- alert: PrinterWarning
expr: deviceAlertDescription >= 1
for: 3m
for: 15m
labels:
severity: warning
annotations:
@ -369,7 +403,7 @@ groups:
- alert: PrinterDoorOpen
expr: prtCoverStatus{prtCoverStatus="coverClosed"} != 1
for: 3m
for: 15m
labels:
severity: warning
annotations:
@ -378,7 +412,7 @@ groups:
- alert: PrinterTonerLow
expr: prtMarkerSuppliesLevel < 1000
for: 3m
for: 15m
labels:
severity: warning
annotations:
@ -387,7 +421,7 @@ groups:
- alert: PrinterTonerMissing
expr: prtMarkerSuppliesLevel == 0
for: 3m
for: 15m
labels:
severity: critical
annotations:
@ -396,7 +430,7 @@ groups:
- alert: PrinterPaperJam
expr: prtgenStatusPaperJam != 1
for: 3m
for: 15m
labels:
severity: critical
annotations:
@ -405,7 +439,7 @@ groups:
- alert: PrinterPaperEmpty
expr: prtgenStatusInputEmpty != 1
for: 3m
for: 15m
labels:
severity: critical
annotations:
@ -414,7 +448,7 @@ groups:
- alert: PrinterCoverOpen
expr: prtgenStatusCoverOpen != 1
for: 3m
for: 15m
labels:
severity: warning
annotations:
@ -423,7 +457,7 @@ groups:
- alert: PrinterPaperTonerError
expr: prtgenStatusTonerError != 1
for: 3m
for: 15m
labels:
severity: critical
annotations:
@ -432,7 +466,7 @@ groups:
- alert: PrinterSystemError
expr: prtgenStatusSrvcReqd != 1
for: 3m
for: 15m
labels:
severity: critical
annotations:
@ -461,7 +495,7 @@ groups:
summary: "{{ $value }} paquet(s) APT sont orphelins sur {{ $labels.instance }}"
- alert: MailqNotEmpty
expr: postfix_mailq_length > 25
expr: sum by(instance) (postfix_showq_message_size_bytes_count) > 30
for: 1m
labels:
severity: warning