diff --git a/group_vars/prometheus.yml b/group_vars/prometheus.yml
index 64642c8a..60449558 100644
--- a/group_vars/prometheus.yml
+++ b/group_vars/prometheus.yml
@@ -7,7 +7,7 @@ glob_snmp_exporter:
 
 glob_ninjabot:
   config:
-    nick: Prometheus
+    nick: monitoring
     server: irc.adm.crans.org
     port: 6667
     channel: "#monitoring"
diff --git a/host_vars/monitoring.adm.crans.org.yml b/host_vars/monitoring.adm.crans.org.yml
index 1a888fc6..6ad8ad0f 100644
--- a/host_vars/monitoring.adm.crans.org.yml
+++ b/host_vars/monitoring.adm.crans.org.yml
@@ -90,3 +90,23 @@ loc_prometheus:
           - source_labels: [instance]
             target_label: __address__
             replacement: '$1:3903'
+
+  ilo_snmp:
+    file: targets_ilo_snmp.json
+    targets: "{{ groups['ilo_snmp'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}"
+    config:
+      - job_name: ilo_snmp
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_ilo_snmp.json'
+        metrics_path: '/snmp'
+        params:
+          module:
+            - ilo
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - replacement: '127.0.0.1:9116'
+            target_label: __address__
diff --git a/hosts b/hosts
index 9432a4a7..b41b969b 100644
--- a/hosts
+++ b/hosts
@@ -480,6 +480,15 @@ marquis.infra.crans.org # manoir
 mercure.infra.crans.org # 3m
 #5m-5.infra.crans.org Déplacée au 2b
 
+[ilo_snmp]
+ilo-daniel.adm.crans.org
+ilo-jack.adm.crans.org
+ilo-odlyd.adm.crans.org
+ilo-sam.adm.crans.org
+ilo-stitch.adm.crans.org
+ilo-thot.adm.crans.org
+ilo-zamok.adm.crans.org
+
 # everything at crans
 [crans:children]
 crans_server
diff --git a/plays/root.yml b/plays/root.yml
index 4539ea53..58940ad2 100755
--- a/plays/root.yml
+++ b/plays/root.yml
@@ -34,9 +34,14 @@
   roles:
     - rsyslog-client
 
+- hosts: server
+  vars:
+    prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
+  roles:
+    - prometheus-node-exporter
+
 - import_playbook: scripts.yml
 - import_playbook: vm_setup.yml
 - import_playbook: borgbackup_client.yml
-- import_playbook: monitoring.yml
 - import_playbook: network_interfaces.yml
 - import_playbook: nullmailer.yml
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
index 4c10974b..620ddee9 100644
--- a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -56,5 +56,3 @@ receivers:
     webhook_configs:
       - url: 'http://localhost:5000/'
         send_resolved: true
-      - url: 'http://localhost:8000/'
-        send_resolved: true
diff --git a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2
index fb946b74..5455ae6f 100644
--- a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2
+++ b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2
@@ -479,3 +479,228 @@ ubiquiti_unifi:
     auth_protocol: SHA
     priv_protocol: AES
     priv_password: {{ snmp_exporter.unifi_password }}
+
+ilo:
+  walk:
+  - 1.3.6.1.4.1.232.6.2.14.4  # Resilient memory
+  - 1.3.6.1.4.1.232.6.2.15.3  # Power meter
+  - 1.3.6.1.4.1.232.6.2.16.1  # POST tests
+  - 1.3.6.1.4.1.232.6.2.17.1  # Battery
+  - 1.3.6.1.4.1.232.6.2.6.8.1.3  # Temperature sensors location
+  - 1.3.6.1.4.1.232.6.2.6.8.1.4  # Temperature sensors value
+  - 1.3.6.1.4.1.232.6.2.6.8.1.5  # Temperature sensors limit
+  - 1.3.6.1.4.1.232.6.2.6.8.1.6  # Temperature sensors condition
+  - 1.3.6.1.4.1.232.6.2.6.7.1.3  # Fans location
+  - 1.3.6.1.4.1.232.6.2.6.7.1.9  # Fans condition
+  - 1.3.6.1.4.1.232.6.2.9.3.1.5  # Power supply
+  - 1.3.6.1.4.1.232.9.2.2  # iLO
+  metrics:
+  - name: cpqHeResilientMemCondition
+    oid: 1.3.6.1.4.1.232.6.2.14.4
+    type: EnumAsStateSet
+    help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHePowerMeterCurrReading
+    oid: 1.3.6.1.4.1.232.6.2.15.3
+    type: gauge
+    help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
+  - name: cpqHeHWBiosCondition
+    oid: 1.3.6.1.4.1.232.6.2.16.1
+    type: EnumAsStateSet
+    help: This value indicates an error has been detected during Pre-OS Test (POST)
+      or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeSysBatteryCondition
+    oid: 1.3.6.1.4.1.232.6.2.17.1
+    type: EnumAsStateSet
+    help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
+    indexes:
+    - labelname: cpqHeSysBatteryChassis
+      type: gauge
+    - labelname: cpqHeSysBatteryIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeTemperatureLocale
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
+    type: EnumAsInfo
+    help: This specifies the location of the temperature sensor present in the system.
+      - 1.3.6.1.4.1.232.6.2.6.8.1.3
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: unknown
+      3: system
+      4: systemBoard
+      5: ioBoard
+      6: cpu
+      7: memory
+      8: storage
+      9: removableMedia
+      10: powerSupply
+      11: ambient
+      12: chassis
+      13: bridgeCard
+  - name: cpqHeTemperatureCelsius
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
+    type: gauge
+    help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureThreshold
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
+    type: gauge
+    help: This is the shutdown threshold temperature sensor setting in degrees celsius
+      - 1.3.6.1.4.1.232.6.2.6.8.1.5
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureCondition
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
+    type: EnumAsStateSet
+    help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeFltTolFanLocale
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
+    type: EnumAsInfo
+    help: This specifies the location of the fan present in the system.
+      - 1.3.6.1.4.1.232.6.2.6.7.1.3
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: unknown
+      3: system
+      4: systemBoard
+      5: ioBoard
+      6: cpu
+      7: memory
+      8: storage
+      9: removableMedia
+      10: powerSupply
+      11: ambient
+      12: chassis
+      13: bridgeCard
+  - name: cpqHeFltTolFanCondition
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
+    type: EnumAsStateSet
+    help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeFltTolPowerSupplyStatus
+    oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
+    type: EnumAsStateSet
+    help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
+    indexes:
+    - labelname: cpqHeFltTolPowerSupplyChassis
+      type: gauge
+    - labelname: cpqHeFltTolPowerSupplyBay
+      type: gauge
+    enum_values:
+      1: noError
+      2: generalFailure
+      3: bistFailure
+      4: fanFailure
+      5: tempFailure
+      6: interlockOpen
+      7: epromFailed
+      8: vrefFailed
+      9: dacFailed
+      10: ramTestFailed
+      11: voltageChannelFailed
+      12: orringdiodeFailed
+      13: brownOut
+      14: giveupOnStartup
+      15: nvramInvalid
+      16: calibrationTableInvalid
+      17: noPowerInput
+  - name: cpqSm2CntlrInterfaceStatus
+    oid: 1.3.6.1.4.1.232.9.2.2.17
+    type: EnumAsStateSet
+    help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
+    enum_values:
+      1: other
+      2: ok
+      3: notResponding
+  - name: cpqSm2CntlriLOSecurityOverrideSwitchState
+    oid: 1.3.6.1.4.1.232.9.2.2.27
+    type: EnumAsStateSet
+    help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
+    enum_values:
+      1: notSupported
+      2: set
+      3: notSet
+  - name: cpqSm2CntlrLicenseActive
+    oid: 1.3.6.1.4.1.232.9.2.2.30
+    type: EnumAsStateSet
+    help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
+    enum_values:
+      1: none
+      2: iloAdvanced
+      3: iloLight
+      4: iloAdvancedBlade
+      5: iloStandard
+      6: iloEssentials
+      7: iloScaleOut
+      8: iloAdvancedPremiumSecurity
+  - name: cpqSm2CntlrServerPowerState
+    oid: 1.3.6.1.4.1.232.9.2.2.32
+    type: EnumAsStateSet
+    help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
+    enum_values:
+      1: unknown
+      2: poweredOff
+      3: poweredOn
+      4: insufficientPowerOrPowerOnDenied
+  version: 3
+  # Reduce timeout to retry faster
+  timeout: 1s
+  auth:
+    # Secrets are read from the snmp_exporter variable (keep it vaulted), never
+    # hard-coded here: ilo_password and ilo_priv_password must be defined there.
+    security_level: authPriv
+    username: crans
+    password: {{ snmp_exporter.ilo_password }}
+    auth_protocol: SHA
+    priv_protocol: AES
+    priv_password: {{ snmp_exporter.ilo_priv_password }}
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index 9c307ff8..3ed69bf7 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -23,7 +23,7 @@ groups:
 
   - alert: PrometheusTargetMissing
     expr: up == 0
-    for: 0m
+    for: 1m
     labels:
       severity: critical
     annotations:
@@ -84,22 +84,6 @@ groups:
     annotations:
       summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
 
-  - alert: HostUnusualDiskReadRate
-    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk read rate (instance {{ $labels.instance }})
-
-  - alert: HostUnusualDiskWriteRate
-    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk write rate (instance {{ $labels.instance }})
-
   - alert: HostOutOfDiskSpace
     expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
     for: 2m
@@ -143,7 +127,7 @@ groups:
   # 0B is so hot
   # En pratique c'est mauvais de tourner des disques trop chauds
   - alert: HostPhysicalComponentTooHot
-    expr: node_hwmon_temp_celsius > 75
+    expr: node_hwmon_temp_celsius > 85
     for: 5m
     labels:
       severity: warning
@@ -205,20 +189,12 @@ groups:
 
   - alert: BlackboxProbeFailed
     expr: probe_success == 0
-    for: 0m
+    for: 1m
     labels:
       severity: critical
     annotations:
       summary: Blackbox probe failed (instance {{ $labels.instance }})
 
-  - alert: BlackboxSlowProbe
-    expr: avg_over_time(probe_duration_seconds[1m]) > 1
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: Blackbox slow probe (instance {{ $labels.instance }})
-
   - alert: BlackboxSslCertificateWillExpireSoon
     expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
     for: 0m
@@ -304,6 +280,80 @@ groups:
     annotations:
       summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
 
+  #######
+  # iLO #
+  #######
+
+  - alert: IloResilientMemoryDegraded
+    expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: >-
+        La mémoire vive n'est plus résiliente
+        ({{ $labels.cpqHeResilientMemCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloBiosSelfTestDegraded
+    expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Une erreur a été détectée lors du POST du serveur
+        ({{ $labels.cpqHeHWBiosCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloBatteryDegraded
+    expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: >-
+        La batterie est dégradée
+        ({{ $labels.cpqHeSysBatteryCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloTemperatureSensorDegraded
+    expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Le capteur de température est dégradé
+        ({{ $labels.cpqHeTemperatureCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloFanDegraded
+    expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Le ventilateur est dégradé
+        ({{ $labels.cpqHeFltTolFanCondition }}) sur {{ $labels.instance }}
+
+  - alert: IloPowerSupplyDegraded
+    expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        L'alimentation est dégradée
+        ({{ $labels.cpqHeFltTolPowerSupplyStatus }}) sur {{ $labels.instance }}
+
+  - alert: IloOverrideSwitchState
+    expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: >-
+        Le switch de réinitialisation n'est pas à l'état d'origine,
+        l'authentification est bypassée sur {{ $labels.instance }}
+
   #########
   # Other #
   #########
@@ -316,8 +366,8 @@ groups:
     annotations:
       summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
 
-  - alert: AptOrphans
-    expr: apt_orphans > 10
+  - alert: AptObsolete
+    expr: apt_obsolete > 10
     for: 5m
     labels:
       severity: warning
@@ -347,13 +397,4 @@ groups:
       severity: warning
     annotations:
       summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
-
-  - alert: TooManyUDPErrors
-    expr: irate(node_netstat_Udp_InErrors[5m]) > 100
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "{{ $labels.instance }} a plus de {{ $value }} connexions UDP en erreur. Quelque chose spam!"
-
 {% endraw %}
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
index daa136c4..8c024bba 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -3,7 +3,7 @@
 global:
   # scrape_interval is set to the global default (60s)
   # evaluation_interval is set to the global default (60s)
-  # scrape_timeout is set to the global default (10s).
+  scrape_timeout: 30s  # was 10s by default
 
   # Attach these labels to any time series or alerts when communicating with
   # external systems (federation, remote storage, Alertmanager).