[prometheus] Synchronizing configuration
parent
a78146e506
commit
0bd8604717
|
@ -13,7 +13,7 @@ glob_service_prometheus_target:
|
|||
options: ""
|
||||
config:
|
||||
ldap:
|
||||
server: "ldaps://{{ query('ldap', 'ip4', 'ldap-adh', 'adm') }}"
|
||||
server: "ldaps://{{ query('ldap', 'ip4', 'ldap-adm', 'adm') }}"
|
||||
|
||||
glob_ninjabot:
|
||||
config:
|
||||
|
|
|
@ -9,7 +9,7 @@ loc_prometheus:
|
|||
- job_name: servers
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_node.json'
|
||||
- '/etc/prometheus/targets/node.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
|
@ -24,7 +24,19 @@ loc_prometheus:
|
|||
- job_name: nginx
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_nginx.json'
|
||||
- '/etc/prometheus/targets/nginx.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
- source_labels: [instance]
|
||||
target_label: __address__
|
||||
replacement: '$1:9117'
|
||||
|
||||
apache:
|
||||
config:
|
||||
- job_name: apache
|
||||
file_sd_configs:
|
||||
- files: ['/etc/prometheus/targets/apache.json']
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
|
@ -33,7 +45,7 @@ loc_prometheus:
|
|||
replacement: '$1:9117'
|
||||
|
||||
blackbox:
|
||||
file: targets_blackbox.json
|
||||
file: targets/blackbox.json
|
||||
targets:
|
||||
- https://crans.org/
|
||||
- https://www.crans.org/
|
||||
|
@ -60,7 +72,7 @@ loc_prometheus:
|
|||
- job_name: blackbox
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_blackbox.json'
|
||||
- '/etc/prometheus/targets/blackbox.json'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx] # Look for an HTTP 200 response.
|
||||
|
@ -77,7 +89,7 @@ loc_prometheus:
|
|||
- job_name: blackbox_icmp
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_icmp.json'
|
||||
- '/etc/prometheus/targets/icmp.json'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [icmp] # Look for an ICMP ping
|
||||
|
@ -94,7 +106,7 @@ loc_prometheus:
|
|||
- job_name: bird
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_bird.json'
|
||||
- '/etc/prometheus/targets/bird.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
|
@ -104,12 +116,72 @@ loc_prometheus:
|
|||
target_label: __address__
|
||||
replacement: '$1:9324'
|
||||
|
||||
bind:
|
||||
config:
|
||||
- job_name: bind
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/bind.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- source_labels: [__param_target]
|
||||
target_label: __address__
|
||||
replacement: '$1:9119'
|
||||
|
||||
postfix:
|
||||
config:
|
||||
- job_name: postfix
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/postfix.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- source_labels: [__param_target]
|
||||
target_label: __address__
|
||||
replacement: '$1:9154'
|
||||
|
||||
postgres:
|
||||
config:
|
||||
- job_name: postgres
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/postgres.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- source_labels: [__param_target]
|
||||
target_label: __address__
|
||||
replacement: '$1:9187'
|
||||
|
||||
mysql:
|
||||
config:
|
||||
- job_name: mysql
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/mysql.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- source_labels: [__param_target]
|
||||
target_label: __address__
|
||||
replacement: '$1:9104'
|
||||
|
||||
mtail:
|
||||
config:
|
||||
- job_name: mtail
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_mtail.json'
|
||||
- '/etc/prometheus/targets/mtail.json'
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
|
@ -124,7 +196,7 @@ loc_prometheus:
|
|||
- job_name: ilo_snmp
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_ilo_snmp.json'
|
||||
- '/etc/prometheus/targets/ilo_snmp.json'
|
||||
metrics_path: '/snmp'
|
||||
params:
|
||||
module:
|
||||
|
@ -142,11 +214,11 @@ loc_prometheus:
|
|||
- job_name: ups_snmp
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_ups_snmp.json'
|
||||
- '/etc/prometheus/targets/ups_snmp.json'
|
||||
metrics_path: '/snmp'
|
||||
params:
|
||||
module:
|
||||
- eatonups
|
||||
- apc
|
||||
relabel_configs:
|
||||
- source_labels:
|
||||
- __address__
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
{{ ansible_header | comment }}
|
||||
{# As this is also using brackets it will conflict without a raw block #}
|
||||
{% raw %}
|
||||
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
|
||||
# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2022-08-09
|
||||
# We remove descriptions as we only send summary on IRC.
|
||||
# UPS, APT and RADIUS configuration is made by Crans.
|
||||
# UPS, APT and printer configuration are made by Crans.
|
||||
|
||||
groups:
|
||||
- name: alert.rules
|
||||
|
@ -151,7 +151,7 @@ groups:
|
|||
summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||
expr: increase(node_vmstat_oom_kill[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -174,6 +174,14 @@ groups:
|
|||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: (node_bonding_active - node_bonding_slaves) != 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
|
||||
# This happened in June 2021 at Crans
|
||||
- alert: HostConntrackLimit
|
||||
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
|
||||
|
@ -183,6 +191,30 @@ groups:
|
|||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: node_reboot_required > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
|
||||
############
|
||||
# Blackbox #
|
||||
############
|
||||
|
@ -203,82 +235,84 @@ groups:
|
|||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
|
||||
##############
|
||||
# PostgreSQL #
|
||||
##############
|
||||
|
||||
- alert: PostgresqlDown
|
||||
expr: pg_up == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql down (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlTableNotAutoVacuumed
|
||||
expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlTableNotAutoAnalyzed
|
||||
expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlTooManyConnections
|
||||
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql too many connections (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
|
||||
- alert: PostgresqlHighRollbackRate
|
||||
expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.2
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql high rollback rate (database {{ $labels.datname }}, instance {{ $labels.instance }})
|
||||
|
||||
|
||||
########
|
||||
# Bird #
|
||||
########
|
||||
|
||||
# Check BGP routes
|
||||
- alert: BGPRoutesMissing
|
||||
expr: bird_protocol_prefix_import_count{proto="BGP", import_filter="ACCEPT"} < 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Pas de route BGP importée depuis {{ $labels.name }}
|
||||
|
||||
#######
|
||||
# UPS #
|
||||
#######
|
||||
|
||||
# Check UPS
|
||||
- alert: UpsOutputSourceChanged
|
||||
expr: upsOutputSource != 3
|
||||
for: 5m
|
||||
- alert: UpsTooHighPower
|
||||
expr: sum(rPDUIdentDevicePowerWatts) > 2000
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La source d'alimentation de {{ $labels.instance }} a changé !
|
||||
|
||||
- alert: UpsBatteryStatusChanged
|
||||
expr: upsBatteryStatus != 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: L'état de la batterie de {{ $labels.instance }} a changé !
|
||||
|
||||
- alert: UpsTemperatureWarning
|
||||
expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
|
||||
|
||||
- alert: UpsTemperatureCritical
|
||||
expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
|
||||
|
||||
- alert: UpsHighHumidity
|
||||
expr: xupsEnvRemoteHumidity > 65
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
|
||||
|
||||
- alert: UpsVeryHighHumidity
|
||||
expr: xupsEnvRemoteHumidity > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
|
||||
|
||||
- alert: UpsHighLoad
|
||||
expr: upsOutputPercentLoad > 70
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: La charge de {{ $labels.instance }} est de {{ $value }}%
|
||||
|
||||
- alert: UpsWrongInputVoltage
|
||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
|
||||
|
||||
- alert: UpsWrongOutputVoltage
|
||||
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
|
||||
summary: La puissance totale tirée est trop grande ({{ $labels.rPDUIdentDevicePowerWatts }} W)
|
||||
|
||||
#######
|
||||
# iLO #
|
||||
|
@ -360,7 +394,7 @@ groups:
|
|||
|
||||
- alert: PrinterWarning
|
||||
expr: deviceAlertDescription >= 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -369,7 +403,7 @@ groups:
|
|||
|
||||
- alert: PrinterDoorOpen
|
||||
expr: prtCoverStatus{prtCoverStatus="coverClosed"} != 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -378,7 +412,7 @@ groups:
|
|||
|
||||
- alert: PrinterTonerLow
|
||||
expr: prtMarkerSuppliesLevel < 1000
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -387,7 +421,7 @@ groups:
|
|||
|
||||
- alert: PrinterTonerMissing
|
||||
expr: prtMarkerSuppliesLevel == 0
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -396,7 +430,7 @@ groups:
|
|||
|
||||
- alert: PrinterPaperJam
|
||||
expr: prtgenStatusPaperJam != 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -405,7 +439,7 @@ groups:
|
|||
|
||||
- alert: PrinterPaperEmpty
|
||||
expr: prtgenStatusInputEmpty != 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -414,7 +448,7 @@ groups:
|
|||
|
||||
- alert: PrinterCoverOpen
|
||||
expr: prtgenStatusCoverOpen != 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -423,7 +457,7 @@ groups:
|
|||
|
||||
- alert: PrinterPaperTonerError
|
||||
expr: prtgenStatusTonerError != 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -432,7 +466,7 @@ groups:
|
|||
|
||||
- alert: PrinterSystemError
|
||||
expr: prtgenStatusSrvcReqd != 1
|
||||
for: 3m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -461,7 +495,7 @@ groups:
|
|||
summary: "{{ $value }} paquet(s) APT sont orphelins sur {{ $labels.instance }}"
|
||||
|
||||
- alert: MailqNotEmpty
|
||||
expr: postfix_mailq_length > 25
|
||||
expr: sum by(instance) (postfix_showq_message_size_bytes_count) > 30
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
Loading…
Reference in New Issue