From c2eab645bd9ba28bb42fe0c0eafc65907cb60f17 Mon Sep 17 00:00:00 2001
From: shirenn <shirenn@crans.org>
Date: Sat, 15 Jan 2022 17:37:50 +0100
Subject: [PATCH] [prometheus] Add iLO SNMP monitoring and clean up alert rules

---
 group_vars/prometheus.yml                     |   2 +-
 host_vars/monitoring.adm.crans.org.yml        |  20 ++
 hosts                                         |   9 +
 plays/root.yml                                |   7 +-
 .../templates/prometheus/alertmanager.yml.j2  |   2 -
 .../templates/prometheus/snmp.yml.j2          | 223 ++++++++++++++++++
 .../templates/prometheus/alert.rules.yml.j2   | 117 ++++++---
 .../templates/prometheus/prometheus.yml.j2    |   2 +-
 8 files changed, 339 insertions(+), 43 deletions(-)

diff --git a/group_vars/prometheus.yml b/group_vars/prometheus.yml
index 64642c8a..60449558 100644
--- a/group_vars/prometheus.yml
+++ b/group_vars/prometheus.yml
@@ -7,7 +7,7 @@ glob_snmp_exporter:
 
 glob_ninjabot:
   config:
-    nick: Prometheus
+    nick: monitoring
     server: irc.adm.crans.org
     port: 6667
     channel: "#monitoring"
diff --git a/host_vars/monitoring.adm.crans.org.yml b/host_vars/monitoring.adm.crans.org.yml
index 1a888fc6..6ad8ad0f 100644
--- a/host_vars/monitoring.adm.crans.org.yml
+++ b/host_vars/monitoring.adm.crans.org.yml
@@ -90,3 +90,23 @@ loc_prometheus:
           - source_labels: [instance]
             target_label: __address__
             replacement: '$1:3903'
+
+  ilo_snmp:  # scrape job for the hosts of the 'ilo_snmp' inventory group
+    file: targets_ilo_snmp.json
+    targets: "{{ groups['ilo_snmp'] | select('match', '^.*\\.adm\\.crans\\.org$')  | list | sort }}"  # group members ending in .adm.crans.org, sorted
+    config:
+      - job_name: ilo_snmp
+        file_sd_configs:
+          - files:
+            - '/etc/prometheus/targets_ilo_snmp.json'  # presumably rendered from 'file' and 'targets' above - confirm in the role
+        metrics_path: '/snmp'
+        params:
+          module:
+            - ilo  # sent as the 'module' URL parameter; an 'ilo' module is added to snmp.yml.j2 by this patch
+        relabel_configs:
+          - source_labels: [__address__]  # step 1: pass the discovered host as the 'target' URL parameter
+            target_label: __param_target
+          - source_labels: [__param_target]  # step 2: keep that host name as the 'instance' label
+            target_label: instance
+          - replacement: '127.0.0.1:9116'  # step 3: send the scrape itself to 127.0.0.1:9116 (presumably the local snmp_exporter)
+            target_label: __address__
diff --git a/hosts b/hosts
index 9432a4a7..b41b969b 100644
--- a/hosts
+++ b/hosts
@@ -480,6 +480,15 @@ marquis.infra.crans.org # manoir
 mercure.infra.crans.org # 3m
 #5m-5.infra.crans.org Déplacée au 2b
 
+[ilo_snmp]
+ilo-daniel.adm.crans.org
+ilo-jack.adm.crans.org
+ilo-odlyd.adm.crans.org
+ilo-sam.adm.crans.org
+ilo-stitch.adm.crans.org
+ilo-thot.adm.crans.org
+ilo-zamok.adm.crans.org
+
 # everything at crans
 [crans:children]
 crans_server
diff --git a/plays/root.yml b/plays/root.yml
index 4539ea53..58940ad2 100755
--- a/plays/root.yml
+++ b/plays/root.yml
@@ -34,9 +34,14 @@
   roles:
     - rsyslog-client
 
+- hosts: server
+  vars:
+    prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
+  roles:
+    - prometheus-node-exporter
+
 - import_playbook: scripts.yml
 - import_playbook: vm_setup.yml
 - import_playbook: borgbackup_client.yml
-- import_playbook: monitoring.yml
 - import_playbook: network_interfaces.yml
 - import_playbook: nullmailer.yml
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
index 4c10974b..620ddee9 100644
--- a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -56,5 +56,3 @@ receivers:
   webhook_configs:
   - url: 'http://localhost:5000/'
     send_resolved: true
-  - url: 'http://localhost:8000/'
-    send_resolved: true
diff --git a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2
index fb946b74..5455ae6f 100644
--- a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2
+++ b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2
@@ -479,3 +479,226 @@ ubiquiti_unifi:
     auth_protocol: SHA
     priv_protocol: AES
     priv_password: {{ snmp_exporter.unifi_password }}
+
+ilo:
+  walk:
+  - 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
+  - 1.3.6.1.4.1.232.6.2.15.3 # Power meter
+  - 1.3.6.1.4.1.232.6.2.16.1 # POST tests
+  - 1.3.6.1.4.1.232.6.2.17.1 # Battery
+  - 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
+  - 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
+  - 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
+  - 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
+  - 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
+  - 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
+  - 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
+  - 1.3.6.1.4.1.232.9.2.2 # iLO
+  metrics:
+  - name: cpqHeResilientMemCondition
+    oid: 1.3.6.1.4.1.232.6.2.14.4
+    type: EnumAsStateSet
+    help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHePowerMeterCurrReading
+    oid: 1.3.6.1.4.1.232.6.2.15.3
+    type: gauge
+    help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
+  - name: cpqHeHWBiosCondition
+    oid: 1.3.6.1.4.1.232.6.2.16.1
+    type: EnumAsStateSet
+    help: This value indicates an error has been detected during Pre-OS Test (POST)
+      or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeSysBatteryCondition
+    oid: 1.3.6.1.4.1.232.6.2.17.1
+    type: EnumAsStateSet
+    help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
+    indexes:
+    - labelname: cpqHeSysBatteryChassis
+      type: gauge
+    - labelname: cpqHeSysBatteryIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeTemperatureLocale
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
+    type: EnumAsInfo
+    help: This specifies the location of the temperature sensor present in the system.
+      - 1.3.6.1.4.1.232.6.2.6.8.1.3
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: unknown
+      3: system
+      4: systemBoard
+      5: ioBoard
+      6: cpu
+      7: memory
+      8: storage
+      9: removableMedia
+      10: powerSupply
+      11: ambient
+      12: chassis
+      13: bridgeCard
+  - name: cpqHeTemperatureCelsius
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
+    type: gauge
+    help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureThreshold
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
+    type: gauge
+    help: This is the shutdown threshold temperature sensor setting in degrees celsius
+      - 1.3.6.1.4.1.232.6.2.6.8.1.5
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureCondition
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
+    type: EnumAsStateSet
+    help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeFltTolFanLocale
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
+    type: EnumAsInfo
+    help: This specifies the location of the fan present in the system.
+      - 1.3.6.1.4.1.232.6.2.6.7.1.3
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: unknown
+      3: system
+      4: systemBoard
+      5: ioBoard
+      6: cpu
+      7: memory
+      8: storage
+      9: removableMedia
+      10: powerSupply
+      11: ambient
+      12: chassis
+      13: bridgeCard
+  - name: cpqHeFltTolFanCondition
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
+    type: EnumAsStateSet
+    help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeFltTolPowerSupplyStatus
+    oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
+    type: EnumAsStateSet
+    help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
+    indexes:
+    - labelname: cpqHeFltTolPowerSupplyChassis
+      type: gauge
+    - labelname: cpqHeFltTolPowerSupplyBay
+      type: gauge
+    enum_values:
+      1: noError
+      2: generalFailure
+      3: bistFailure
+      4: fanFailure
+      5: tempFailure
+      6: interlockOpen
+      7: epromFailed
+      8: vrefFailed
+      9: dacFailed
+      10: ramTestFailed
+      11: voltageChannelFailed
+      12: orringdiodeFailed
+      13: brownOut
+      14: giveupOnStartup
+      15: nvramInvalid
+      16: calibrationTableInvalid
+      17: noPowerInput
+  - name: cpqSm2CntlrInterfaceStatus
+    oid: 1.3.6.1.4.1.232.9.2.2.17
+    type: EnumAsStateSet
+    help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
+    enum_values:
+      1: other
+      2: ok
+      3: notResponding
+  - name: cpqSm2CntlriLOSecurityOverrideSwitchState
+    oid: 1.3.6.1.4.1.232.9.2.2.27
+    type: EnumAsStateSet
+    help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
+    enum_values:
+      1: notSupported
+      2: set
+      3: notSet
+  - name: cpqSm2CntlrLicenseActive
+    oid: 1.3.6.1.4.1.232.9.2.2.30
+    type: EnumAsStateSet
+    help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
+    enum_values:
+      1: none
+      2: iloAdvanced
+      3: iloLight
+      4: iloAdvancedBlade
+      5: iloStandard
+      6: iloEssentials
+      7: iloScaleOut
+      8: iloAdvancedPremiumSecurity
+  - name: cpqSm2CntlrServerPowerState
+    oid: 1.3.6.1.4.1.232.9.2.2.32
+    type: EnumAsStateSet
+    help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
+    enum_values:
+      1: unknown
+      2: poweredOff
+      3: poweredOn
+      4: insufficientPowerOrPowerOnDenied
+  version: 3
+  # Reduce timeout to retry faster
+  timeout: 1s
+  auth:  # SNMPv3 credentials for the iLO boards; secrets are injected at render time
+    security_level: authPriv
+    username: crans
+    password: {{ snmp_exporter.ilo_password }}  # secret: define it next to snmp_exporter.unifi_password
+    auth_protocol: SHA
+    priv_protocol: AES
+    priv_password: {{ snmp_exporter.ilo_priv_password }}  # secret: define it next to snmp_exporter.unifi_password
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index 9c307ff8..3ed69bf7 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -23,7 +23,7 @@ groups:
 
   - alert: PrometheusTargetMissing
     expr: up == 0
-    for: 0m
+    for: 1m
     labels:
       severity: critical
     annotations:
@@ -84,22 +84,6 @@ groups:
     annotations:
       summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
 
-  - alert: HostUnusualDiskReadRate
-    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk read rate (instance {{ $labels.instance }})
-
-  - alert: HostUnusualDiskWriteRate
-    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk write rate (instance {{ $labels.instance }})
-
   - alert: HostOutOfDiskSpace
     expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
     for: 2m
@@ -143,7 +127,7 @@ groups:
   # 0B is so hot
   # En pratique c'est mauvais de tourner des disques trop chauds
   - alert: HostPhysicalComponentTooHot
-    expr: node_hwmon_temp_celsius > 75
+    expr: node_hwmon_temp_celsius > 85
     for: 5m
     labels:
       severity: warning
@@ -205,20 +189,12 @@ groups:
 
   - alert: BlackboxProbeFailed
     expr: probe_success == 0
-    for: 0m
+    for: 1m
     labels:
       severity: critical
     annotations:
       summary: Blackbox probe failed (instance {{ $labels.instance }})
 
-  - alert: BlackboxSlowProbe
-    expr: avg_over_time(probe_duration_seconds[1m]) > 1
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: Blackbox slow probe (instance {{ $labels.instance }})
-
   - alert: BlackboxSslCertificateWillExpireSoon
     expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
     for: 0m
@@ -304,6 +280,80 @@ groups:
     annotations:
       summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
 
+  #######
+  # iLO #
+  #######
+
+  - alert: IloResilientMemoryDegraded
+    expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: >-
+        La mémoire vive n'est plus résiliente
+        ({{ $labels.cpqHeResilientMemCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloBiosSelfTestDegraded
+    expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Une erreur a été détectée lors du POST du serveur
+        ({{ $labels.cpqHeHWBiosCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloBatteryDegraded
+    expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: >-
+        La batterie est dégradée
+        ({{ $labels.cpqHeSysBatteryCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloTemperatureSensorDegraded
+    expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Le capteur de température est dégradé
+        ({{ $labels.cpqHeTemperatureCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloFanDegraded
+    expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Le ventilateur est dégradé
+        ({{ $labels.cpqHeFltTolFanCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloPowerSupplyDegraded
+    expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        L'alimentation est dégradée
+        ({{ $labels.cpqHeFltTolPowerSupplyStatus }}) sur {{ $labels.instance }}
+
+  - alert: IloOverrideSwitchState
+    expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Le switch de réinitialisation n'est pas à l'état d'origine,
+        l'authentification est bypassée sur {{ $labels.instance }}
+
   #########
   # Other #
   #########
@@ -316,8 +366,8 @@ groups:
     annotations:
       summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
 
-  - alert: AptOrphans
-    expr: apt_orphans > 10
+  - alert: AptObsolete
+    expr: apt_obsolete > 10
     for: 5m
     labels:
       severity: warning
@@ -347,13 +397,4 @@ groups:
       severity: warning
     annotations:
       summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
-
-  - alert: TooManyUDPErrors
-    expr: irate(node_netstat_Udp_InErrors[5m]) > 100
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "{{ $labels.instance }} a plus de {{ $value }} connexions UDP en erreur. Quelque chose spam!"
-
 {% endraw %}
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
index daa136c4..8c024bba 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -3,7 +3,7 @@
 global:
   # scrape_interval is set to the global default (60s)
   # evaluation_interval is set to the global default (60s)
-  # scrape_timeout is set to the global default (10s).
+  scrape_timeout: 30s  # raised from the 10s default; must stay <= scrape_interval (60s)
 
   # Attach these labels to any time series or alerts when communicating with
   # external systems (federation, remote storage, Alertmanager).