[prometheus] Add iLO SNMP monitoring and tune alert rules
parent
e9bf2702c5
commit
c2eab645bd
|
@ -7,7 +7,7 @@ glob_snmp_exporter:
|
||||||
|
|
||||||
glob_ninjabot:
|
glob_ninjabot:
|
||||||
config:
|
config:
|
||||||
nick: Prometheus
|
nick: monitoring
|
||||||
server: irc.adm.crans.org
|
server: irc.adm.crans.org
|
||||||
port: 6667
|
port: 6667
|
||||||
channel: "#monitoring"
|
channel: "#monitoring"
|
||||||
|
|
|
@ -90,3 +90,23 @@ loc_prometheus:
|
||||||
- source_labels: [instance]
|
- source_labels: [instance]
|
||||||
target_label: __address__
|
target_label: __address__
|
||||||
replacement: '$1:3903'
|
replacement: '$1:3903'
|
||||||
|
|
||||||
|
ilo_snmp:
|
||||||
|
file: targets_ilo_snmp.json
|
||||||
|
targets: "{{ groups['ilo_snmp'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}"
|
||||||
|
config:
|
||||||
|
- job_name: ilo_snmp
|
||||||
|
file_sd_configs:
|
||||||
|
- files:
|
||||||
|
- '/etc/prometheus/targets_ilo_snmp.json'
|
||||||
|
metrics_path: '/snmp'
|
||||||
|
params:
|
||||||
|
module:
|
||||||
|
- ilo
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- replacement: '127.0.0.1:9116'
|
||||||
|
target_label: __address__
|
||||||
|
|
9
hosts
9
hosts
|
@ -480,6 +480,15 @@ marquis.infra.crans.org # manoir
|
||||||
mercure.infra.crans.org # 3m
|
mercure.infra.crans.org # 3m
|
||||||
#5m-5.infra.crans.org Déplacée au 2b
|
#5m-5.infra.crans.org Déplacée au 2b
|
||||||
|
|
||||||
|
[ilo_snmp]
|
||||||
|
ilo-daniel.adm.crans.org
|
||||||
|
ilo-jack.adm.crans.org
|
||||||
|
ilo-odlyd.adm.crans.org
|
||||||
|
ilo-sam.adm.crans.org
|
||||||
|
ilo-stitch.adm.crans.org
|
||||||
|
ilo-thot.adm.crans.org
|
||||||
|
ilo-zamok.adm.crans.org
|
||||||
|
|
||||||
# everything at crans
|
# everything at crans
|
||||||
[crans:children]
|
[crans:children]
|
||||||
crans_server
|
crans_server
|
||||||
|
|
|
@ -34,9 +34,14 @@
|
||||||
roles:
|
roles:
|
||||||
- rsyslog-client
|
- rsyslog-client
|
||||||
|
|
||||||
|
- hosts: server
|
||||||
|
vars:
|
||||||
|
prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
|
||||||
|
roles:
|
||||||
|
- prometheus-node-exporter
|
||||||
|
|
||||||
- import_playbook: scripts.yml
|
- import_playbook: scripts.yml
|
||||||
- import_playbook: vm_setup.yml
|
- import_playbook: vm_setup.yml
|
||||||
- import_playbook: borgbackup_client.yml
|
- import_playbook: borgbackup_client.yml
|
||||||
- import_playbook: monitoring.yml
|
|
||||||
- import_playbook: network_interfaces.yml
|
- import_playbook: network_interfaces.yml
|
||||||
- import_playbook: nullmailer.yml
|
- import_playbook: nullmailer.yml
|
||||||
|
|
|
@ -56,5 +56,3 @@ receivers:
|
||||||
webhook_configs:
|
webhook_configs:
|
||||||
- url: 'http://localhost:5000/'
|
- url: 'http://localhost:5000/'
|
||||||
send_resolved: true
|
send_resolved: true
|
||||||
- url: 'http://localhost:8000/'
|
|
||||||
send_resolved: true
|
|
||||||
|
|
|
@ -479,3 +479,226 @@ ubiquiti_unifi:
|
||||||
auth_protocol: SHA
|
auth_protocol: SHA
|
||||||
priv_protocol: AES
|
priv_protocol: AES
|
||||||
priv_password: {{ snmp_exporter.unifi_password }}
|
priv_password: {{ snmp_exporter.unifi_password }}
|
||||||
|
|
||||||
|
ilo:
|
||||||
|
walk:
|
||||||
|
- 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
|
||||||
|
- 1.3.6.1.4.1.232.6.2.15.3 # Power meter
|
||||||
|
- 1.3.6.1.4.1.232.6.2.16.1 # POST tests
|
||||||
|
- 1.3.6.1.4.1.232.6.2.17.1 # Battery
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
|
||||||
|
- 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
|
||||||
|
- 1.3.6.1.4.1.232.9.2.2 # iLO
|
||||||
|
metrics:
|
||||||
|
- name: cpqHeResilientMemCondition
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.14.4
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: ok
|
||||||
|
3: degraded
|
||||||
|
4: failed
|
||||||
|
- name: cpqHePowerMeterCurrReading
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.15.3
|
||||||
|
type: gauge
|
||||||
|
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
|
||||||
|
- name: cpqHeHWBiosCondition
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.16.1
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: This value indicates an error has been detected during Pre-OS Test (POST)
|
||||||
|
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: ok
|
||||||
|
3: degraded
|
||||||
|
4: failed
|
||||||
|
- name: cpqHeSysBatteryCondition
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.17.1
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeSysBatteryChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeSysBatteryIndex
|
||||||
|
type: gauge
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: ok
|
||||||
|
3: degraded
|
||||||
|
4: failed
|
||||||
|
- name: cpqHeTemperatureLocale
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
|
||||||
|
type: EnumAsInfo
|
||||||
|
help: This specifies the location of the temperature sensor present in the system.
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.8.1.3
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeTemperatureChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeTemperatureIndex
|
||||||
|
type: gauge
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: unknown
|
||||||
|
3: system
|
||||||
|
4: systemBoard
|
||||||
|
5: ioBoard
|
||||||
|
6: cpu
|
||||||
|
7: memory
|
||||||
|
8: storage
|
||||||
|
9: removableMedia
|
||||||
|
10: powerSupply
|
||||||
|
11: ambient
|
||||||
|
12: chassis
|
||||||
|
13: bridgeCard
|
||||||
|
- name: cpqHeTemperatureCelsius
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
|
||||||
|
type: gauge
|
||||||
|
help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeTemperatureChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeTemperatureIndex
|
||||||
|
type: gauge
|
||||||
|
- name: cpqHeTemperatureThreshold
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
|
||||||
|
type: gauge
|
||||||
|
help: This is the shutdown threshold temperature sensor setting in degrees celsius
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.8.1.5
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeTemperatureChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeTemperatureIndex
|
||||||
|
type: gauge
|
||||||
|
- name: cpqHeTemperatureCondition
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeTemperatureChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeTemperatureIndex
|
||||||
|
type: gauge
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: ok
|
||||||
|
3: degraded
|
||||||
|
4: failed
|
||||||
|
- name: cpqHeFltTolFanLocale
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
|
||||||
|
type: EnumAsInfo
|
||||||
|
help: This specifies the location of the fan present in the system.
|
||||||
|
- 1.3.6.1.4.1.232.6.2.6.7.1.3
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeFltTolFanChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeFltTolFanIndex
|
||||||
|
type: gauge
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: unknown
|
||||||
|
3: system
|
||||||
|
4: systemBoard
|
||||||
|
5: ioBoard
|
||||||
|
6: cpu
|
||||||
|
7: memory
|
||||||
|
8: storage
|
||||||
|
9: removableMedia
|
||||||
|
10: powerSupply
|
||||||
|
11: ambient
|
||||||
|
12: chassis
|
||||||
|
13: bridgeCard
|
||||||
|
- name: cpqHeFltTolFanCondition
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeFltTolFanChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeFltTolFanIndex
|
||||||
|
type: gauge
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: ok
|
||||||
|
3: degraded
|
||||||
|
4: failed
|
||||||
|
- name: cpqHeFltTolPowerSupplyStatus
|
||||||
|
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
|
||||||
|
indexes:
|
||||||
|
- labelname: cpqHeFltTolPowerSupplyChassis
|
||||||
|
type: gauge
|
||||||
|
- labelname: cpqHeFltTolPowerSupplyBay
|
||||||
|
type: gauge
|
||||||
|
enum_values:
|
||||||
|
1: noError
|
||||||
|
2: generalFailure
|
||||||
|
3: bistFailure
|
||||||
|
4: fanFailure
|
||||||
|
5: tempFailure
|
||||||
|
6: interlockOpen
|
||||||
|
7: epromFailed
|
||||||
|
8: vrefFailed
|
||||||
|
9: dacFailed
|
||||||
|
10: ramTestFailed
|
||||||
|
11: voltageChannelFailed
|
||||||
|
12: orringdiodeFailed
|
||||||
|
13: brownOut
|
||||||
|
14: giveupOnStartup
|
||||||
|
15: nvramInvalid
|
||||||
|
16: calibrationTableInvalid
|
||||||
|
17: noPowerInput
|
||||||
|
- name: cpqSm2CntlrInterfaceStatus
|
||||||
|
oid: 1.3.6.1.4.1.232.9.2.2.17
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
|
||||||
|
enum_values:
|
||||||
|
1: other
|
||||||
|
2: ok
|
||||||
|
3: notResponding
|
||||||
|
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
|
||||||
|
oid: 1.3.6.1.4.1.232.9.2.2.27
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
|
||||||
|
enum_values:
|
||||||
|
1: notSupported
|
||||||
|
2: set
|
||||||
|
3: notSet
|
||||||
|
- name: cpqSm2CntlrLicenseActive
|
||||||
|
oid: 1.3.6.1.4.1.232.9.2.2.30
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
|
||||||
|
enum_values:
|
||||||
|
1: none
|
||||||
|
2: iloAdvanced
|
||||||
|
3: iloLight
|
||||||
|
4: iloAdvancedBlade
|
||||||
|
5: iloStandard
|
||||||
|
6: iloEssentials
|
||||||
|
7: iloScaleOut
|
||||||
|
8: iloAdvancedPremiumSecurity
|
||||||
|
- name: cpqSm2CntlrServerPowerState
|
||||||
|
oid: 1.3.6.1.4.1.232.9.2.2.32
|
||||||
|
type: EnumAsStateSet
|
||||||
|
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
|
||||||
|
enum_values:
|
||||||
|
1: unknown
|
||||||
|
2: poweredOff
|
||||||
|
3: poweredOn
|
||||||
|
4: insufficientPowerOrPowerOnDenied
|
||||||
|
version: 3
|
||||||
|
# Reduce timeout to retry faster
|
||||||
|
timeout: 1s
|
||||||
|
auth:
|
||||||
|
security_level: authPriv
|
||||||
|
username: crans
|
||||||
|
# SNMPv3 authentication passphrase for the iLO module. Read from the vaulted
# snmp_exporter variables, like the ubiquiti_unifi secret in this template.
# NOTE(review): snmp_exporter.ilo_password must be defined in the vault, and
# the literal value previously committed on this line should be rotated.
password: {{ snmp_exporter.ilo_password }}
|
||||||
|
auth_protocol: SHA
|
||||||
|
priv_protocol: AES
|
||||||
|
# SNMPv3 privacy (encryption) passphrase for the iLO module. Read from the
# vaulted snmp_exporter variables, like the ubiquiti_unifi priv_password.
# NOTE(review): snmp_exporter.ilo_priv_password must be defined in the vault,
# and the literal value previously committed on this line should be rotated.
priv_password: {{ snmp_exporter.ilo_priv_password }}
|
||||||
|
|
|
@ -23,7 +23,7 @@ groups:
|
||||||
|
|
||||||
- alert: PrometheusTargetMissing
|
- alert: PrometheusTargetMissing
|
||||||
expr: up == 0
|
expr: up == 0
|
||||||
for: 0m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -84,22 +84,6 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
|
summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
|
||||||
|
|
||||||
- alert: HostUnusualDiskReadRate
|
|
||||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
|
||||||
|
|
||||||
- alert: HostUnusualDiskWriteRate
|
|
||||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
|
||||||
|
|
||||||
- alert: HostOutOfDiskSpace
|
- alert: HostOutOfDiskSpace
|
||||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||||
for: 2m
|
for: 2m
|
||||||
|
@ -143,7 +127,7 @@ groups:
|
||||||
# 0B is so hot
|
# 0B is so hot
|
||||||
# En pratique c'est mauvais de tourner des disques trop chauds
|
# En pratique c'est mauvais de tourner des disques trop chauds
|
||||||
- alert: HostPhysicalComponentTooHot
|
- alert: HostPhysicalComponentTooHot
|
||||||
expr: node_hwmon_temp_celsius > 75
|
expr: node_hwmon_temp_celsius > 85
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -205,20 +189,12 @@ groups:
|
||||||
|
|
||||||
- alert: BlackboxProbeFailed
|
- alert: BlackboxProbeFailed
|
||||||
expr: probe_success == 0
|
expr: probe_success == 0
|
||||||
for: 0m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Blackbox probe failed (instance {{ $labels.instance }})
|
summary: Blackbox probe failed (instance {{ $labels.instance }})
|
||||||
|
|
||||||
- alert: BlackboxSlowProbe
|
|
||||||
expr: avg_over_time(probe_duration_seconds[1m]) > 1
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Blackbox slow probe (instance {{ $labels.instance }})
|
|
||||||
|
|
||||||
- alert: BlackboxSslCertificateWillExpireSoon
|
- alert: BlackboxSslCertificateWillExpireSoon
|
||||||
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
|
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
|
||||||
for: 0m
|
for: 0m
|
||||||
|
@ -304,6 +280,80 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
|
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
|
||||||
|
|
||||||
|
#######
|
||||||
|
# iLO #
|
||||||
|
#######
|
||||||
|
|
||||||
|
- alert: IloResilientMemoryDegraded
|
||||||
|
expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La mémoire vive n'est plus résiliente
|
||||||
|
({{ $labels.cpqHeResilientMemCondition }}) sur {{ $labels.instance }}
|
||||||
|
|
||||||
|
- alert: IloBiosSelfTestDegraded
|
||||||
|
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Une erreur a été détectée lors du POST du serveur
|
||||||
|
({{ $labels.cpqHeHWBiosCondition }}) sur {{ $labels.instance }}
|
||||||
|
|
||||||
|
- alert: IloBatteryDegraded
|
||||||
|
expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La batterie est dégradée
|
||||||
|
({{ $labels.cpqHeSysBatteryCondition }}) sur {{ $labels.instance }}
|
||||||
|
|
||||||
|
- alert: IloTemperatureSensorDegraded
|
||||||
|
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le capteur de température est dégradé
|
||||||
|
({{ $labels.cpqHeTemperatureCondition }}) sur {{ $labels.instance }}
|
||||||
|
|
||||||
|
- alert: IloFanDegraded
|
||||||
|
expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le ventilateur est dégradé
|
||||||
|
({{ $labels.cpqHeFltTolFanCondition }}) sur {{ $labels.instance }}
|
||||||
|
|
||||||
|
- alert: IloPowerSupplyDegraded
|
||||||
|
expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
L'alimentation est dégradée
|
||||||
|
({{ $labels.cpqHeFltTolPowerSupplyStatus }}) sur {{ $labels.instance }}
|
||||||
|
|
||||||
|
- alert: IloOverrideSwitchState
|
||||||
|
expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le switch de réinitialisation n'est pas à l'état d'origine,
|
||||||
|
l'authentification est bypassée sur {{ $labels.instance }}
|
||||||
|
|
||||||
#########
|
#########
|
||||||
# Other #
|
# Other #
|
||||||
#########
|
#########
|
||||||
|
@ -316,8 +366,8 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
|
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
|
||||||
|
|
||||||
- alert: AptOrphans
|
- alert: AptObsolete
|
||||||
expr: apt_orphans > 10
|
expr: apt_obsolete > 10
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -347,13 +397,4 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
|
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
|
||||||
|
|
||||||
- alert: TooManyUDPErrors
|
|
||||||
expr: irate(node_netstat_Udp_InErrors[5m]) > 100
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "{{ $labels.instance }} a plus de {{ $value }} connexions UDP en erreur. Quelque chose spam!"
|
|
||||||
|
|
||||||
{% endraw %}
|
{% endraw %}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
global:
|
global:
|
||||||
# scrape_interval is set to the global default (60s)
|
# scrape_interval is set to the global default (60s)
|
||||||
# evaluation_interval is set to the global default (60s)
|
# evaluation_interval is set to the global default (60s)
|
||||||
# scrape_timeout is set to the global default (10s).
|
scrape_timeout: 30s # was 10s by default
|
||||||
|
|
||||||
# Attach these labels to any time series or alerts when communicating with
|
# Attach these labels to any time series or alerts when communicating with
|
||||||
# external systems (federation, remote storage, Alertmanager).
|
# external systems (federation, remote storage, Alertmanager).
|
||||||
|
|
Loading…
Reference in New Issue