[monitoring] PEPCRANS + monitoring of cachan infra

certbot_on_virtu
_shirenn 2021-05-23 18:18:35 +02:00 committed by Yohann D'ANELLO
parent 3d528a1891
commit f7347e41d2
Signed by: _ynerant
GPG Key ID: 3A75C55819C8CF85
28 changed files with 380 additions and 401 deletions

View File

@ -0,0 +1,3 @@
---
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"

View File

@ -0,0 +1,7 @@
---
glob_grafana:
root_url: https://grafana.crans.org
icon: crans_icon_white.svg
ldap_base: "{{ glob_ldap.base }}"
ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"

View File

@ -30,3 +30,6 @@ glob_nginx:
- "172.16.0.0/16" - "172.16.0.0/16"
- "fd00:0:0:10::/64" - "fd00:0:0:10::/64"
deploy_robots_file: false deploy_robots_file: false
glob_prometheus_nginx_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"

View File

@ -0,0 +1,13 @@
---
glob_prometheus: {}
glob_snmp_exporter:
procurve_password: "{{ vault.snmp_procurve_password }}"
unifi_password: "{{ vault.snmp_unifi_password }}"
glob_ninjabot:
config:
nick: Prometheus
server: irc.adm.crans.org
port: 6667
channel: "#monitoring"

View File

@ -18,3 +18,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -0,0 +1,116 @@
---
interfaces:
adm: ens18
loc_home_nounou:
ip: 172.17.10.9
mountpoint: /rpool/home
loc_ldap:
servers:
- 172.17.10.9
base: 'dc=crans,dc=org'
loc_ntp_client:
servers:
- terenez.cachan-adm.crans.org
debian_mirror: http://172.17.10.202/debian
loc_mirror:
name: mirror.cachan-adm.crans.org
ip: "{{ query('ldap','ip','terenez','cachan-adm') | ipv4 | first }}"
loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"
glob_snmp_exporter:
procurve_password: "{{ vault.snmp_procurve_password }}"
unifi_password: "{{ vault.snmp_unifi_password }}"
loc_ninjabot:
config:
nick: fyre
server: irc.adm.crans.org
port: 6667
channel: "#monitoring"
loc_prometheus:
node:
file: targets_node.json
targets: "{{ groups['server'] | select('match', '^.*\\.cachan-adm\\.crans\\.org$') | list | sort }}"
config:
- job_name: servers
file_sd_configs:
- files:
- '/etc/prometheus/targets_node.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9100'
ups_snmp:
file: targets_ups_snmp.json
targets:
- pulsar.cachan-adm.crans.org # 0B
- quasar.cachan-adm.crans.org # 4J
config:
- job_name: ups_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ups_snmp.json'
metrics_path: /snmp
params:
module: [eatonups]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
unifi_snmp:
file: targets_unifi_snmp.json
targets: "{{ groups['crans_unifi'] | list | sort }}"
config:
- job_name: unifi_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_unifi_snmp.json'
metrics_path: /snmp
params:
module: [ubiquiti_unifi]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
nginx:
file: targets_nginx.json
targets:
- rodauh.cachan-adm.crans.org
- terenez.cachan-adm.crans.org
config:
- job_name: nginx
file_sd_configs:
- files:
- '/etc/prometheus/targets_nginx.json'
relabel_configs:
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
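Note: each loc_prometheus entry pairs a file_sd target file with the scrape config that will consume it; the prometheus role writes the target list into that JSON file, which Prometheus reloads on the fly. A minimal sketch of the generated /etc/prometheus/targets_node.json, assuming the server group resolves to the two cachan-adm hosts named elsewhere in this commit (the real list comes from the inventory):

    [
      {
        "targets": [
          "fyre.cachan-adm.crans.org",
          "gulp.cachan-adm.crans.org"
        ]
      }
    ]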

View File

@ -25,3 +25,6 @@ loc_borg:
to_exclude:
- /var/lib/lxcfs
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -1,4 +1,113 @@
interfaces:
adm: eth0
srv_nat: eth1
infra: eth2
loc_prometheus:
node:
file: targets_node.json
targets: "{{ groups['server'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}"
config:
- job_name: servers
file_sd_configs:
- files:
- '/etc/prometheus/targets_node.json'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9100'
nginx:
file: targets_nginx.json
targets:
- hodaur.adm.crans.org
- charybde.adm.crans.org
config:
- job_name: nginx
file_sd_configs:
- files:
- '/etc/prometheus/targets_nginx.json'
relabel_configs:
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
blackbox:
file: targets_blackbox.json
targets:
- https://crans.org/
- https://www.crans.org/
- https://webirc.crans.org/
- https://jitsi.crans.org/
- https://ftps.crans.org/
- http://ftp.crans.org/
- https://grafana.crans.org/
- https://roundcube.crans.org/
- https://zero.crans.org/
- https://wiki.crans.org/PageAccueil
- https://framadate.crans.org/
- https://pad.crans.org/
- https://lists.crans.org/
- https://cas.crans.org/
- https://ethercalc.crans.org/
- https://phabricator.crans.org/
- https://webmail.crans.org/horde/login.php
- https://gitlab.crans.org/
- https://perso.crans.org/crans/
- https://install-party.crans.org/
- https://intranet.crans.org/
- https://owncloud.crans.org/
config:
- job_name: blackbox
file_sd_configs:
- files:
- '/etc/prometheus/targets_blackbox.json'
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115
mtail:
file: targets_mtail.json
targets:
- tealc.adm.crans.org
config:
- job_name: mtail
static_configs:
- targets: ["tealc.adm.crans.org"]
relabel_configs:
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:3903'
# apache:
# targets:
# config:
# - job_name: apache
# file_sd_configs:
# - files:
# - '/etc/prometheus/targets_apache.json'
# relabel_configs:
# - source_labels: [__address__]
# target_label: instance
# - source_labels: [instance]
# target_label: __address__
# replacement: '$1:9117'
# bird_targets:
# - routeur-sam.adm.crans.org
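Note: in the blackbox job above, the relabel_configs implement the standard blackbox-exporter indirection: the URL read from the targets file is passed as the target parameter and kept as the instance label, while the scrape itself is sent to the exporter listening on 127.0.0.1:9115. A sketch for one target:

    # Before relabeling (entry from targets_blackbox.json):
    #   __address__    = https://crans.org/
    # After relabeling:
    #   __param_target = https://crans.org/      (sent as ?target=...)
    #   instance       = https://crans.org/      (label kept on the stored series)
    #   __address__    = 127.0.0.1:9115          (the blackbox exporter actually scraped)
    # Effective probe: http://127.0.0.1:9115/probe?module=http_2xx&target=https://crans.org/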

View File

@ -25,3 +25,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -18,3 +18,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -14,6 +14,9 @@ glob_ntp_client:
debian_mirror: http://172.17.10.202/debian
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"
loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}

View File

@ -37,3 +37,9 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"
glob_prometheus_nginx_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -22,3 +22,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -18,3 +18,9 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"
glob_prometheus_nginx_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -23,3 +23,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

View File

@ -27,3 +27,6 @@ loc_borg:
ssh_options: "" ssh_options: ""
to_exclude: to_exclude:
- /var/lib/backuppc - /var/lib/backuppc
glob_prometheus_node_exporter:
listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}"

hosts (14 changed lines)
View File

@ -18,6 +18,9 @@ tealc.adm.crans.org
tealc.adm.crans.org
gulp.cachan-adm.crans.org
+[blackbox]
+monitoring.adm.crans.org
[bdd:children]
virtu
@ -81,6 +84,9 @@ neree.adm.crans.org
[gitlab]
gitzly.adm.crans.org
+[grafana]
+monitoring.adm.crans.org
[horde]
horde.adm.crans.org
@ -99,17 +105,16 @@ linx.adm.crans.org
[mailman]
mailman.adm.crans.org
-[monitoring]
+[prometheus]
monitoring.adm.crans.org
+fyre.cachan-adm.crans.org
[nginx]
charybde.adm.crans.org
[nginx:children]
django_cas
galene
jitsi
mailman
ntp_server
re2o_front
reverseproxy
roundcube
@ -212,6 +217,7 @@ cas.adm.crans.org
codichotomie.adm.crans.org
ethercalc.adm.crans.org
fluxx.adm.crans.org
+fyre.cachan-adm.crans.org
gitlab-ci.adm.crans.org
gitzly.adm.crans.org
hodaur.adm.crans.org

View File

@ -1,81 +1,43 @@
#!/usr/bin/env ansible-playbook
---
-# Deploy Prometheus and Grafana on monitoring server
-- hosts: monitoring
+# Deploy Prometheus on monitoring server
+- hosts: prometheus
vars:
-# Prometheus targets.json
-prometheus:
-node_targets: "{{ groups['server'] | list | sort }}"
-ups_snmp_targets:
-- pulsar.adm.crans.org # 0B
-- quasar.adm.crans.org # 4J
-procurve_snmp_targets:
-- batg-9.infra.crans.org
-unifi_snmp_targets: "{{ groups['crans_unifi'] | list | sort }}"
-blackbox_targets:
-- https://crans.org/
-- https://www.crans.org/
-- https://webirc.crans.org/
-- https://jitsi.crans.org/
-- https://ftps.crans.org/
-- http://ftp.crans.org/
-- https://grafana.crans.org/
-- https://roundcube.crans.org/
-- https://zero.crans.org/
-- https://wiki.crans.org/PageAccueil
-- https://framadate.crans.org/
-- https://pad.crans.org/
-- https://lists.crans.org/
-- https://cas.crans.org/
-- https://ethercalc.crans.org/
-- https://phabricator.crans.org/
-- https://webmail.crans.org/horde/login.php
-- https://gitlab.crans.org/
-- https://perso.crans.org/crans/
-- https://install-party.crans.org/
-- https://intranet.crans.org/
-- https://owncloud.crans.org/
-nginx_targets:
-- hodaur.adm.crans.org
-- charybde.adm.crans.org
-apache_targets: [] # [zamok.adm.crans.org]
-bird_targets:
-- routeur-sam.adm.crans.org
-snmp_procurve_password: "{{ vault.snmp_procurve_password }}"
-snmp_unifi_password: "{{ vault.snmp_unifi_password }}"
-grafana:
-root_url: https://grafana.crans.org
-icon: crans_icon_white.svg
-ldap_base: "{{ glob_ldap.base }}"
-ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
-ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"
+prometheus: "{{ glob_prometheus | default({}) | combine(loc_prometheus | default({})) }}"
+alertmanager: "{{ glob_alertmanager | default({}) | combine(loc_alertmanager | default({})) }}"
+snmp_exporter: "{{ glob_snmp_exporter | default({}) | combine(loc_snmp_exporter | default({})) }}"
+ninjabot: "{{ glob_ninjabot | default({}) | combine(loc_ninjabot | default({})) }}"
roles:
- prometheus
- prometheus-alertmanager
- prometheus-snmp-exporter
-- prometheus-blackbox-exporter
- ninjabot
-- grafana
+# # Deploy Grafana on monitoring server
+# - hosts: grafana
+# vars:
+# grafana: "{{ glob_grafana | default({}) | combine(loc_grafana | default({})) }}"
+# roles:
+# - grafana
+- hosts: blackbox
+roles:
+- prometheus-blackbox-exporter
# Monitor all hosts
- hosts: server
vars:
-adm_ipv4: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
-roles: ["prometheus-node-exporter"]
+prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
+roles:
+- prometheus-node-exporter
# Export nginx metrics
- hosts: nginx
vars:
-adm_ipv4: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
-roles: ["prometheus-nginx-exporter"]
+prometheus_nginx_exporter: "{{ glob_prometheus_nginx_exporter | default({}) | combine(loc_prometheus_nginx_exporter | default({})) }}"
+roles:
+- prometheus-nginx-exporter
-# Export apache metrics
-#- hosts: zamok.adm.crans.org
-# vars:
-# adm_ipv4: "{{ ansible_all_ipv4_addresses | ipaddr(adm_subnet) | first }}"
-# roles: ["prometheus-apache-exporter"]
# Monitor mailq with a special text exporter
#- hosts: redisdead.adm.crans.org
@ -85,7 +47,9 @@
- hosts: tealc.adm.crans.org
vars:
mirror: '{{ glob_mirror | default({}) | combine(loc_mirror | default({})) }}'
-roles: ["mtail"]
+roles:
+- mtail
- hosts: sputnik.adm.crans.org
-roles: ["statping"]
+roles:
+- statping

View File

@ -17,6 +17,11 @@
dest: /var/local/ninjabot
version: master
- name: Deploy NinjaBot configuration
template:
src: ninjabot/ninjabot.json.j2
dest: /var/local/ninjabot/ninjabot.json
- name: Deploy NinjaBot systemd unit
template:
src: systemd/system/ninjabot.service.j2

View File

@ -0,0 +1 @@
{{ ninjabot.config | to_nice_json(indent=2) }}
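Note: fed with the glob_ninjabot config defined in group_vars above, this template renders /var/local/ninjabot/ninjabot.json roughly as follows (a sketch; to_nice_json controls the exact formatting and key order):

    {
      "channel": "#monitoring",
      "nick": "Prometheus",
      "port": 6667,
      "server": "irc.adm.crans.org"
    }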

View File

@ -8,7 +8,7 @@ Type=simple
WorkingDirectory=/var/local/ninjabot
User=nobody
Group=nogroup
-ExecStart=/usr/bin/python3 /var/local/ninjabot/main.py
+ExecStart=/usr/bin/python3 /var/local/ninjabot/ninjabot.py
Restart=always
[Install]

View File

@ -14,7 +14,7 @@
path: /etc/default/prometheus-nginx-exporter
regexp: '^ARGS='
line: |
-ARGS="-web.listen-address={{ adm_ipv4 }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
+ARGS="-web.listen-address={{ prometheus_nginx_exporter.listen_addr }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
notify:
- Restart nginx
- Restart prometheus-nginx-exporter

View File

@ -7,21 +7,6 @@
register: apt_result
retries: 3
until: apt_result is succeeded
-when:
-- ansible_lsb.codename != 'stretch'
-# Prometheus 2 node is in stretch-backports
-- name: Install Prometheus node-exporter (stretch-backports)
-apt:
-update_cache: true
-name: prometheus-node-exporter
-install_recommends: false
-default_release: stretch-backports
-register: apt_result
-retries: 3
-until: apt_result is succeeded
-when:
-- ansible_lsb.codename == 'stretch'
- name: Install Prometheus node-exporter-collectors (bullseye)
apt:
@ -45,7 +30,7 @@
path: /etc/default/prometheus-node-exporter
regexp: '^ARGS='
line: |
-ARGS="--web.listen-address={{ adm_ipv4 }}:9100"
+ARGS="--web.listen-address={{ prometheus_node_exporter.listen_addr }}:9100"
tags: restart-node-exporter
# Install new APT textfile collector, it might be upstreamed one day
@ -57,15 +42,4 @@
owner: root
group: root
mode: 0755
-when: ansible_lsb.id == 'Debian' and ansible_distribution_release != "bullseye"
+when: ansible_distribution_release != "bullseye"
-# Install new APT textfile collector, it might be upstreamed one day
-# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35
-- name: Patch APT textfile collector
-copy:
-src: apt.sh
-dest: /usr/share/prometheus-node-exporter-collectors/apt.sh
-owner: root
-group: root
-mode: 0755
-when: ansible_lsb.id == 'Ubuntu' or ansible_distribution_release == "bullseye"
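Note: on a host whose adm (or cachan-adm) LDAP record resolves to, say, 172.17.10.42 (a hypothetical address), the ARGS lineinfile task above ends up writing the following into /etc/default/prometheus-node-exporter:

    ARGS="--web.listen-address=172.17.10.42:9100"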

View File

@ -113,14 +113,14 @@ procurve_switch:
version: 3
auth:
# To create SNMPv3 user on HP procurve, execute:
-# snmpv3 user snmp_prometheus auth sha {{ snmp_procurve_password }} priv aes {{ snmp_procurve_password }}
+# snmpv3 user snmp_prometheus auth sha {{ snmp_exporter.procurve_password }} priv aes {{ snmp_exporter.procurve_password }}
# snmpv3 group managerpriv user snmp_prometheus sec-model ver3
security_level: authPriv
username: snmp_prometheus
-password: {{ snmp_procurve_password }}
+password: {{ snmp_exporter.procurve_password }}
auth_protocol: SHA
priv_protocol: AES
-priv_password: {{ snmp_procurve_password }}
+priv_password: {{ snmp_exporter.procurve_password }}
ubiquiti_unifi:
walk:
@ -475,7 +475,7 @@ ubiquiti_unifi:
auth:
security_level: authPriv
username: snmp_prometheus
-password: {{ snmp_unifi_password }}
+password: {{ snmp_exporter.unifi_password }}
auth_protocol: SHA
priv_protocol: AES
-priv_password: {{ snmp_unifi_password }}
+priv_password: {{ snmp_exporter.unifi_password }}

View File

@ -16,35 +16,18 @@
- name: Configure Prometheus alert rules
template:
-src: "prometheus/{{ item }}.j2"
-dest: "/etc/prometheus/{{ item }}"
+src: prometheus/alert.rules.yml.j2
+dest: /etc/prometheus/alert.rules.yml
mode: 0644
notify: Restart Prometheus
-loop:
-- alert.rules.yml
-- django.rules.yml
# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus targets
copy:
-content: "{{ [{'targets': item.targets}] | to_nice_json }}\n"
-dest: "/etc/prometheus/{{ item.file }}.json"
+content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
+dest: "/etc/prometheus/{{ item.value.file }}"
mode: 0644
-loop:
-- file: targets
-targets: "{{ prometheus.node_targets }}"
-- file: targets_ups_snmp
-targets: "{{ prometheus.ups_snmp_targets }}"
-- file: targets_procurve_snmp
-targets: "{{ prometheus.procurve_snmp_targets }}"
-- file: targets_unifi_snmp
-targets: "{{ prometheus.unifi_snmp_targets }}"
-- file: targets_nginx
-targets: "{{ prometheus.nginx_targets }}"
-- file: targets_apache
-targets: "{{ prometheus.apache_targets }}"
-- file: targets_blackbox
-targets: "{{ prometheus.blackbox_targets }}"
+loop: "{{ prometheus | dict2items }}"
- name: Activate prometheus service
systemd:
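Note: dict2items turns the merged prometheus dict into a list of {key, value} pairs, so the Configure Prometheus targets task above now writes one targets file per entry instead of relying on a hard-coded list. A sketch of one loop item for the node entry defined in host_vars:

    item:
      key: node
      value:
        file: targets_node.json
        targets:
          - fyre.cachan-adm.crans.org      # whatever the host_vars expression resolved to
        config:
          - job_name: servers              # consumed by prometheus.yml.j2, not by this task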

View File

@ -144,7 +144,7 @@ groups:
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation" description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsWrongOutputVoltage - alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 225) or (upsOutputVoltage > 235) expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -161,29 +161,27 @@ groups:
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}." summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."
- alert: MailqNotEmpty - alert: MailqNotEmpty
expr: postfix_mailq_length > 5 expr: postfix_mailq_length > 25
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}." summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."
# NTP (need NTP plugin in node) - alert: NoRadiusLogin
# - alert: ntp_drifting expr: rate(radiusd_access_ok[3m]) == 0
# expr: node_ntp_drift_seconds > 0.05 for: 2m
# for: 3m labels:
# labels: severity: warning
# severity: critical annotations:
# annotations: summary: "Personne ne vient taper le RADIUS."
# summary: "Décalage NTP trop élevé sur {{ $labels.instance }}"
# description: "Le décalage NTP est trop élevé ({{ $value }} > 0.05)" - alert: TooManyReallocatedSectors
expr: smartmon_reallocated_sector_ct_raw_value > 1e3
for: 5m
labels:
severity: warning
annotations:
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."
# - alert: ntp_drifting
# expr: node_ntp_drift_seconds > 0.01
# for: 1m
# labels:
# severity: warning
# annotations:
# summary: "Décalage NTP élevé sur {{ $labels.instance }}"
# description: "Le décalage NTP est élevé ({{ $value }} > 0.01)"
{% endraw %} {% endraw %}

View File

@ -1,106 +0,0 @@
{{ ansible_header | comment }}
{# As this is also Jinja2 it will conflict without a raw block #}
{% raw %}
groups:
- name: django.rules
rules:
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
- record: job:django_http_ajax_requests_total:sum_rate30s
expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
BY (job)
- record: job:django_http_requests_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
- record: job:django_http_responses_streaming_total:sum_rate30s
expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
- record: job:django_http_responses_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
- record: job:django_http_requests_total:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
- record: job:django_http_requests_total_by_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
- record: job:django_http_requests_total_by_transport:sum_rate30s
expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
- record: job:django_http_requests_total_by_view:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
view)
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
view, transport, method)
- record: job:django_http_responses_total_by_templatename:sum_rate30s
expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
- record: job:django_http_responses_total_by_status:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
- record: job:django_http_responses_total_by_charset:sum_rate30s
expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
- record: job:django_http_exceptions_total_by_type:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
- record: job:django_http_exceptions_total_by_view:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99.9"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99.9"
- record: job:django_model_inserts_total:sum_rate1m
expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
- record: job:django_model_updates_total:sum_rate1m
expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
- record: job:django_model_deletes_total:sum_rate1m
expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
- record: job:django_db_new_connections_total:sum_rate30s
expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
- record: job:django_db_new_connection_errors_total:sum_rate30s
expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
- record: job:django_db_execute_total:sum_rate30s
expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
- record: job:django_db_execute_many_total:sum_rate30s
expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
- record: job:django_db_errors_total:sum_rate30s
expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
- record: job:django_migrations_applied_total:max
expr: max(django_migrations_applied_total) BY (job, connection)
- record: job:django_migrations_unapplied_total:max
expr: max(django_migrations_unapplied_total) BY (job, connection)
{% endraw %}

View File

@ -20,156 +20,23 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
-- "django.rules.yml" # Custom rules specific for Django project monitoring
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
-scrape_configs:
-# The .json in file_sd_configs is dynamically reloaded
-- job_name: prometheus
-static_configs:
-- targets:
-- localhost:9090
-- job_name: servers
-file_sd_configs:
-- files:
-- '/etc/prometheus/targets.json'
-relabel_configs:
-# Do not put :9100 in instance name, rather here
-- source_labels: [__address__]
-target_label: __param_target
-- source_labels: [__param_target]
-target_label: instance
-- source_labels: [__param_target]
-target_label: __address__
-replacement: '$1:9100'
+{{
+{
+"scrape_configs":
+[
+{
+"job_name": "prometheus",
+"static_configs" : [
+{
+"targets": [
+"localhost:9090"
+]
+}
+]
+}
+] + (prometheus | json_query("*.config[0]"))
+} | to_nice_yaml(indent=2)
+}}
{% if prometheus.ups_snmp_targets is defined %}
- job_name: ups_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ups_snmp.json'
metrics_path: /snmp
params:
module: [eatonups]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
{% if prometheus.procurve_snmp_targets is defined %}
- job_name: procurve_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_procurve_snmp.json'
metrics_path: /snmp
params:
module: [procurve_switch]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
{% if prometheus.unifi_snmp_targets is defined %}
- job_name: unifi_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_unifi_snmp.json'
metrics_path: /snmp
params:
module: [ubiquiti_unifi]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
{% if prometheus.nginx_targets is defined %}
- job_name: nginx
file_sd_configs:
- files:
- '/etc/prometheus/targets_nginx.json'
relabel_configs:
# Do not put :9117 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
{% endif %}
{% if prometheus.apache_targets is defined %}
- job_name: apache
file_sd_configs:
- files:
- '/etc/prometheus/targets_apache.json'
relabel_configs:
# Do not put :9117 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
{% endif %}
{% if prometheus.blackbox_targets is defined %}
- job_name: blackbox
file_sd_configs:
- files:
- '/etc/prometheus/targets_blackbox.json'
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115
{% endif %}
- job_name: mtail
static_configs:
- targets: ["tealc.adm.crans.org"]
relabel_configs:
# Do not put :3903 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:3903'
{% if prometheus.bird_targets is defined %}
- job_name: bird
file_sd_configs:
- files:
- '/etc/prometheus/targets_bird.json'
relabel_configs:
# Do not put :3903 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9324'
{% endif %}
- job_name: django
scheme: https
static_configs:
- targets: []
# Activate this line when the captive portal need monitoring
# - targets: ["portail-captif.crans.org:443"]