[prometheus] Add iLO SNMP monitoring and tune alert rules
parent
e9bf2702c5
commit
c2eab645bd
|
@ -7,7 +7,7 @@ glob_snmp_exporter:
|
|||
|
||||
glob_ninjabot:
|
||||
config:
|
||||
nick: Prometheus
|
||||
nick: monitoring
|
||||
server: irc.adm.crans.org
|
||||
port: 6667
|
||||
channel: "#monitoring"
|
||||
|
|
|
@ -90,3 +90,23 @@ loc_prometheus:
|
|||
- source_labels: [instance]
|
||||
target_label: __address__
|
||||
replacement: '$1:3903'
|
||||
|
||||
ilo_snmp:
  # Target file name and the hosts written into it: every member of the
  # `ilo_snmp` inventory group whose name ends in .adm.crans.org, sorted.
  file: targets_ilo_snmp.json
  targets: "{{ groups['ilo_snmp'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}"
  config:
    - job_name: ilo_snmp
      metrics_path: '/snmp'
      params:
        # SNMP module requested for every scrape of this job.
        module: [ilo]
      file_sd_configs:
        - files: ['/etc/prometheus/targets_ilo_snmp.json']
      relabel_configs:
        # 1. Pass the discovered host as the `target` URL parameter.
        - target_label: __param_target
          source_labels:
            - __address__
        # 2. Keep that host as the `instance` label of the resulting series.
        - target_label: instance
          source_labels:
            - __param_target
        # 3. Send the HTTP scrape itself to 127.0.0.1:9116
        #    (presumably the local snmp_exporter - confirm).
        - target_label: __address__
          replacement: '127.0.0.1:9116'
|
||||
|
|
9
hosts
9
hosts
|
@ -480,6 +480,15 @@ marquis.infra.crans.org # manoir
|
|||
mercure.infra.crans.org # 3m
|
||||
#5m-5.infra.crans.org Déplacée au 2b
|
||||
|
||||
# Hosts monitored over SNMP with the `ilo` module; groups['ilo_snmp'] is the
# source of the ilo_snmp Prometheus target list.
[ilo_snmp]
ilo-daniel.adm.crans.org
ilo-jack.adm.crans.org
ilo-odlyd.adm.crans.org
ilo-sam.adm.crans.org
ilo-stitch.adm.crans.org
ilo-thot.adm.crans.org
ilo-zamok.adm.crans.org
|
||||
|
||||
# everything at crans
|
||||
[crans:children]
|
||||
crans_server
|
||||
|
|
|
@ -34,9 +34,14 @@
|
|||
roles:
|
||||
- rsyslog-client
|
||||
|
||||
# Apply the prometheus-node-exporter role to every host of the `server` group.
- hosts: server
  vars:
    # Start from the global settings (empty dict when undefined) and overlay
    # the local ones; keys present in loc_* override those from glob_*.
    prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
  roles:
    - prometheus-node-exporter
|
||||
|
||||
- import_playbook: scripts.yml
|
||||
- import_playbook: vm_setup.yml
|
||||
- import_playbook: borgbackup_client.yml
|
||||
- import_playbook: monitoring.yml
|
||||
- import_playbook: network_interfaces.yml
|
||||
- import_playbook: nullmailer.yml
|
||||
|
|
|
@ -56,5 +56,3 @@ receivers:
|
|||
webhook_configs:
|
||||
- url: 'http://localhost:5000/'
|
||||
send_resolved: true
|
||||
- url: 'http://localhost:8000/'
|
||||
send_resolved: true
|
||||
|
|
|
@ -479,3 +479,226 @@ ubiquiti_unifi:
|
|||
auth_protocol: SHA
|
||||
priv_protocol: AES
|
||||
priv_password: {{ snmp_exporter.unifi_password }}
|
||||
|
||||
# snmp_exporter module for iLO management controllers.
# Walks the health sub-trees listed below and exposes them as enum/gauge
# metrics; the OID each metric comes from is repeated in its help text.
ilo:
  walk:
    - 1.3.6.1.4.1.232.6.2.14.4  # Resilient memory
    - 1.3.6.1.4.1.232.6.2.15.3  # Power meter
    - 1.3.6.1.4.1.232.6.2.16.1  # POST tests
    - 1.3.6.1.4.1.232.6.2.17.1  # Battery
    - 1.3.6.1.4.1.232.6.2.6.8.1.3  # Temperature sensors location
    - 1.3.6.1.4.1.232.6.2.6.8.1.4  # Temperature sensors value
    - 1.3.6.1.4.1.232.6.2.6.8.1.5  # Temperature sensors limit
    - 1.3.6.1.4.1.232.6.2.6.8.1.6  # Temperature sensors condition
    - 1.3.6.1.4.1.232.6.2.6.7.1.3  # Fans location
    - 1.3.6.1.4.1.232.6.2.6.7.1.9  # Fans condition
    - 1.3.6.1.4.1.232.6.2.9.3.1.5  # Power supply
    - 1.3.6.1.4.1.232.9.2.2  # iLO
  metrics:
    - name: cpqHeResilientMemCondition
      oid: 1.3.6.1.4.1.232.6.2.14.4
      type: EnumAsStateSet
      help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
      enum_values:
        1: other
        2: ok
        3: degraded
        4: failed
    - name: cpqHePowerMeterCurrReading
      oid: 1.3.6.1.4.1.232.6.2.15.3
      type: gauge
      help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
    - name: cpqHeHWBiosCondition
      oid: 1.3.6.1.4.1.232.6.2.16.1
      type: EnumAsStateSet
      help: This value indicates an error has been detected during Pre-OS Test (POST)
        or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
      enum_values:
        1: other
        2: ok
        3: degraded
        4: failed
    - name: cpqHeSysBatteryCondition
      oid: 1.3.6.1.4.1.232.6.2.17.1
      type: EnumAsStateSet
      help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
      indexes:
        - labelname: cpqHeSysBatteryChassis
          type: gauge
        - labelname: cpqHeSysBatteryIndex
          type: gauge
      enum_values:
        1: other
        2: ok
        3: degraded
        4: failed
    - name: cpqHeTemperatureLocale
      oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
      type: EnumAsInfo
      help: This specifies the location of the temperature sensor present in the system.
        - 1.3.6.1.4.1.232.6.2.6.8.1.3
      indexes:
        - labelname: cpqHeTemperatureChassis
          type: gauge
        - labelname: cpqHeTemperatureIndex
          type: gauge
      enum_values:
        1: other
        2: unknown
        3: system
        4: systemBoard
        5: ioBoard
        6: cpu
        7: memory
        8: storage
        9: removableMedia
        10: powerSupply
        11: ambient
        12: chassis
        13: bridgeCard
    - name: cpqHeTemperatureCelsius
      oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
      type: gauge
      help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
      indexes:
        - labelname: cpqHeTemperatureChassis
          type: gauge
        - labelname: cpqHeTemperatureIndex
          type: gauge
    - name: cpqHeTemperatureThreshold
      oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
      type: gauge
      help: This is the shutdown threshold temperature sensor setting in degrees celsius
        - 1.3.6.1.4.1.232.6.2.6.8.1.5
      indexes:
        - labelname: cpqHeTemperatureChassis
          type: gauge
        - labelname: cpqHeTemperatureIndex
          type: gauge
    - name: cpqHeTemperatureCondition
      oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
      type: EnumAsStateSet
      help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
      indexes:
        - labelname: cpqHeTemperatureChassis
          type: gauge
        - labelname: cpqHeTemperatureIndex
          type: gauge
      enum_values:
        1: other
        2: ok
        3: degraded
        4: failed
    - name: cpqHeFltTolFanLocale
      oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
      type: EnumAsInfo
      help: This specifies the location of the fan present in the system.
        - 1.3.6.1.4.1.232.6.2.6.7.1.3
      indexes:
        - labelname: cpqHeFltTolFanChassis
          type: gauge
        - labelname: cpqHeFltTolFanIndex
          type: gauge
      enum_values:
        1: other
        2: unknown
        3: system
        4: systemBoard
        5: ioBoard
        6: cpu
        7: memory
        8: storage
        9: removableMedia
        10: powerSupply
        11: ambient
        12: chassis
        13: bridgeCard
    - name: cpqHeFltTolFanCondition
      oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
      type: EnumAsStateSet
      help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
      indexes:
        - labelname: cpqHeFltTolFanChassis
          type: gauge
        - labelname: cpqHeFltTolFanIndex
          type: gauge
      enum_values:
        1: other
        2: ok
        3: degraded
        4: failed
    - name: cpqHeFltTolPowerSupplyStatus
      oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
      type: EnumAsStateSet
      help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
      indexes:
        - labelname: cpqHeFltTolPowerSupplyChassis
          type: gauge
        - labelname: cpqHeFltTolPowerSupplyBay
          type: gauge
      enum_values:
        1: noError
        2: generalFailure
        3: bistFailure
        4: fanFailure
        5: tempFailure
        6: interlockOpen
        7: epromFailed
        8: vrefFailed
        9: dacFailed
        10: ramTestFailed
        11: voltageChannelFailed
        12: orringdiodeFailed
        13: brownOut
        14: giveupOnStartup
        15: nvramInvalid
        16: calibrationTableInvalid
        17: noPowerInput
    - name: cpqSm2CntlrInterfaceStatus
      oid: 1.3.6.1.4.1.232.9.2.2.17
      type: EnumAsStateSet
      help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
      enum_values:
        1: other
        2: ok
        3: notResponding
    - name: cpqSm2CntlriLOSecurityOverrideSwitchState
      oid: 1.3.6.1.4.1.232.9.2.2.27
      type: EnumAsStateSet
      help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
      enum_values:
        1: notSupported
        2: set
        3: notSet
    - name: cpqSm2CntlrLicenseActive
      oid: 1.3.6.1.4.1.232.9.2.2.30
      type: EnumAsStateSet
      help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
      enum_values:
        1: none
        2: iloAdvanced
        3: iloLight
        4: iloAdvancedBlade
        5: iloStandard
        6: iloEssentials
        7: iloScaleOut
        8: iloAdvancedPremiumSecurity
    - name: cpqSm2CntlrServerPowerState
      oid: 1.3.6.1.4.1.232.9.2.2.32
      type: EnumAsStateSet
      help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
      enum_values:
        1: unknown
        2: poweredOff
        3: poweredOn
        4: insufficientPowerOrPowerOnDenied
  version: 3
  # Reduce timeout to retry faster
  timeout: 1s
  auth:
    security_level: authPriv
    username: crans
    # Secrets are injected at template time instead of being committed in
    # clear text. NOTE(review): snmp_exporter.ilo_password and
    # snmp_exporter.ilo_priv_password must be defined in the secret store
    # before deploying, and the previously committed values should be rotated
    # on the iLO side since they are in the repository history.
    password: '{{ snmp_exporter.ilo_password }}'
    auth_protocol: SHA
    priv_protocol: AES
    priv_password: '{{ snmp_exporter.ilo_priv_password }}'
|
||||
|
|
|
@ -23,7 +23,7 @@ groups:
|
|||
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: up == 0
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -84,22 +84,6 @@ groups:
|
|||
annotations:
|
||||
summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||
for: 2m
|
||||
|
@ -143,7 +127,7 @@ groups:
|
|||
# 0B is so hot
|
||||
# En pratique c'est mauvais de tourner des disques trop chauds
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 75
|
||||
expr: node_hwmon_temp_celsius > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -205,20 +189,12 @@ groups:
|
|||
|
||||
- alert: BlackboxProbeFailed
|
||||
expr: probe_success == 0
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox probe failed (instance {{ $labels.instance }})
|
||||
|
||||
- alert: BlackboxSlowProbe
|
||||
expr: avg_over_time(probe_duration_seconds[1m]) > 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox slow probe (instance {{ $labels.instance }})
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
|
||||
for: 0m
|
||||
|
@ -304,6 +280,80 @@ groups:
|
|||
annotations:
|
||||
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
|
||||
|
||||
#######
|
||||
# iLO #
|
||||
#######
|
||||
|
||||
# Resilient-memory state reported as anything other than `ok`/`other`
# for 3 minutes.
- alert: IloResilientMemoryDegraded
  expr: 'cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: >-
      La mémoire vive n'est plus résiliente ({{ $labels.cpqHeResilientMemCondition }})
      sur {{ $labels.instance }}
|
||||
|
||||
# POST / hardware-initialisation condition reported as anything other than
# `ok`/`other` for 3 minutes.
- alert: IloBiosSelfTestDegraded
  expr: 'cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      Une erreur a été détectée lors du POST du serveur ({{ $labels.cpqHeHWBiosCondition }})
      sur {{ $labels.instance }}
|
||||
|
||||
# Battery condition reported as anything other than `ok`/`other`
# for 3 minutes.
- alert: IloBatteryDegraded
  expr: 'cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: >-
      La batterie est dégradée ({{ $labels.cpqHeSysBatteryCondition }})
      sur {{ $labels.instance }}
|
||||
|
||||
# Temperature-sensor condition reported as anything other than `ok`/`other`
# for 3 minutes.
- alert: IloTemperatureSensorDegraded
  expr: 'cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      Le capteur de température est dégradé ({{ $labels.cpqHeTemperatureCondition }})
      sur {{ $labels.instance }}
|
||||
|
||||
# Fan condition reported as anything other than `ok`/`other` for 3 minutes.
- alert: IloFanDegraded
  expr: 'cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      Le ventilateur est dégradé ({{ $labels.cpqHeFltTolFanCondition }})
      sur {{ $labels.instance }}
|
||||
|
||||
# Power-supply status reported as anything other than `noError`
# for 3 minutes.
- alert: IloPowerSupplyDegraded
  expr: 'cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      L'alimentation est dégradée ({{ $labels.cpqHeFltTolPowerSupplyStatus }})
      sur {{ $labels.instance }}
|
||||
|
||||
# Security override switch reported in the `set` state for 3 minutes.
- alert: IloOverrideSwitchState
  expr: 'cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      Le switch de réinitialisation n'est pas à l'état d'origine, l'authentification
      est bypassée sur {{ $labels.instance }}
|
||||
|
||||
#########
|
||||
# Other #
|
||||
#########
|
||||
|
@ -316,8 +366,8 @@ groups:
|
|||
annotations:
|
||||
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
|
||||
|
||||
- alert: AptOrphans
|
||||
expr: apt_orphans > 10
|
||||
- alert: AptObsolete
|
||||
expr: apt_obsolete > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -347,13 +397,4 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
|
||||
|
||||
- alert: TooManyUDPErrors
|
||||
expr: irate(node_netstat_Udp_InErrors[5m]) > 100
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.instance }} a plus de {{ $value }} connexions UDP en erreur. Quelque chose spam!"
|
||||
|
||||
{% endraw %}
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
global:
|
||||
# scrape_interval is set to the global default (60s)
|
||||
# evaluation_interval is set to the global default (60s)
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
scrape_timeout: 30s # was 10s by default
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
# external systems (federation, remote storage, Alertmanager).
|
||||
|
|
Loading…
Reference in New Issue