[monitoring] PEPCRANS + monitoring of cachan infra

certbot_on_virtu
_shirenn 2021-05-23 18:18:35 +02:00 committed by Yohann D'ANELLO
parent 3d528a1891
commit f7347e41d2
Signed by: _ynerant
GPG Key ID: 3A75C55819C8CF85
28 changed files with 380 additions and 401 deletions

View File

@ -0,0 +1,3 @@
---
# Bind address of the Prometheus node exporter (the role serves it on port
# 9100): first IPv4 the `ldap` lookup returns for this host on the `adm`
# network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}

View File

@ -0,0 +1,7 @@
---
# Global Grafana settings (the Grafana play merges loc_grafana over these).
glob_grafana:
  root_url: 'https://grafana.crans.org'
  icon: 'crans_icon_white.svg'
  # LDAP settings are all derived from the global LDAP definition (glob_ldap).
  ldap_base: "{{ glob_ldap.base }}"
  ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
  ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"

View File

@ -30,3 +30,6 @@ glob_nginx:
- "172.16.0.0/16"
- "fd00:0:0:10::/64"
deploy_robots_file: false
# Bind address of the Prometheus nginx exporter (the role serves it on port
# 9117): first IPv4 the `ldap` lookup returns for this host on the `adm`
# network.
glob_prometheus_nginx_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}

View File

@ -0,0 +1,13 @@
---
# No global scrape jobs: the playbook merges each host's loc_prometheus over
# this empty mapping.
glob_prometheus: {}
# SNMPv3 credentials used by the SNMP exporter configuration template; the
# values come from the vault.
glob_snmp_exporter:
  procurve_password: "{{ vault.snmp_procurve_password }}"
  unifi_password: "{{ vault.snmp_unifi_password }}"
# NinjaBot settings; `config` is dumped as JSON into ninjabot.json by the
# ninjabot role.
glob_ninjabot:
  config:
    nick: 'Prometheus'
    server: 'irc.adm.crans.org'
    port: 6667
    channel: '#monitoring'

View File

@ -18,3 +18,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -0,0 +1,116 @@
---
# Interface name used for each network on this host.
interfaces:
  adm: ens18

# Host-local (loc_) overrides of the global defaults.
loc_home_nounou:
  ip: 172.17.10.9
  mountpoint: /rpool/home

loc_ldap:
  servers:
    - 172.17.10.9
  base: "dc=crans,dc=org"

loc_ntp_client:
  servers:
    - terenez.cachan-adm.crans.org

debian_mirror: 'http://172.17.10.202/debian'

loc_mirror:
  name: mirror.cachan-adm.crans.org
  ip: "{{ query('ldap','ip','terenez','cachan-adm') | ipv4 | first }}"

loc_borg:
  remote:
    - "borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}"
  ssh_options: ''

# NOTE(review): the next two keys keep the glob_ prefix although this file
# otherwise uses loc_ overrides (the playbook merges loc_* over glob_*) -
# confirm this is intended.
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

# SNMPv3 credentials for the SNMP exporter, read from the vault.
glob_snmp_exporter:
  procurve_password: "{{ vault.snmp_procurve_password }}"
  unifi_password: "{{ vault.snmp_unifi_password }}"

# NinjaBot settings for this host; `config` is dumped as JSON by the role.
loc_ninjabot:
  config:
    nick: 'fyre'
    server: 'irc.adm.crans.org'
    port: 6667
    channel: '#monitoring'
# Scrape jobs of this Prometheus instance. Every entry provides:
#   file:    name of the target list written under /etc/prometheus
#   targets: content of that list
#   config:  scrape job(s) appended to prometheus.yml
loc_prometheus:
  # Node exporter on every server whose name ends in .cachan-adm.crans.org
  node:
    file: targets_node.json
    targets: "{{ groups['server'] | select('match', '^.*\\.cachan-adm\\.crans\\.org$') | list | sort }}"
    config:
      - job_name: servers
        file_sd_configs:
          - files:
              - "/etc/prometheus/targets_node.json"
        relabel_configs:
          # Keep the bare host name as `instance`; the port is only appended
          # to the address actually scraped.
          - source_labels:
              - __address__
            target_label: __param_target
          - source_labels:
              - __param_target
            target_label: instance
          - source_labels:
              - __param_target
            target_label: __address__
            replacement: "$1:9100"

  # UPS devices, scraped through the SNMP exporter on 127.0.0.1:9116 with
  # the device passed as the `target` parameter.
  ups_snmp:
    file: targets_ups_snmp.json
    targets:
      - pulsar.cachan-adm.crans.org  # 0B
      - quasar.cachan-adm.crans.org  # 4J
    config:
      - job_name: ups_snmp
        file_sd_configs:
          - files:
              - "/etc/prometheus/targets_ups_snmp.json"
        metrics_path: /snmp
        params:
          module:
            - eatonups
        relabel_configs:
          - source_labels:
              - __address__
            target_label: __param_target
          - source_labels:
              - __param_target
            target_label: instance
          - target_label: __address__
            replacement: "127.0.0.1:9116"

  # UniFi access points (inventory group crans_unifi), also scraped through
  # the SNMP exporter.
  unifi_snmp:
    file: targets_unifi_snmp.json
    targets: "{{ groups['crans_unifi'] | list | sort }}"
    config:
      - job_name: unifi_snmp
        file_sd_configs:
          - files:
              - "/etc/prometheus/targets_unifi_snmp.json"
        metrics_path: /snmp
        params:
          module:
            - ubiquiti_unifi
        relabel_configs:
          - source_labels:
              - __address__
            target_label: __param_target
          - source_labels:
              - __param_target
            target_label: instance
          - target_label: __address__
            replacement: "127.0.0.1:9116"

  # nginx exporter, port 9117 appended to the scraped address only.
  nginx:
    file: targets_nginx.json
    targets:
      - rodauh.cachan-adm.crans.org
      - terenez.cachan-adm.crans.org
    config:
      - job_name: nginx
        file_sd_configs:
          - files:
              - "/etc/prometheus/targets_nginx.json"
        relabel_configs:
          - source_labels:
              - __address__
            target_label: instance
          - source_labels:
              - instance
            target_label: __address__
            replacement: "$1:9117"

View File

@ -25,3 +25,6 @@ loc_borg:
to_exclude:
- /var/lib/lxcfs
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -1,4 +1,113 @@
# Interface name used for each network on this host.
interfaces:
  adm: 'eth0'
  srv_nat: 'eth1'
  infra: 'eth2'
# Scrape jobs of this Prometheus instance. Every entry provides:
#   file:    name of the target list written under /etc/prometheus
#   targets: content of that list
#   config:  scrape job(s) appended to prometheus.yml
loc_prometheus:
  # Node exporter on every server whose name ends in .adm.crans.org
  node:
    file: targets_node.json
    targets: "{{ groups['server'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}"
    config:
      - job_name: servers
        file_sd_configs:
          - files:
              - '/etc/prometheus/targets_node.json'
        relabel_configs:
          # Do not put :9100 in the instance name, only in the scraped address
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - source_labels: [__param_target]
            target_label: __address__
            replacement: '$1:9100'
  nginx:
    file: targets_nginx.json
    targets:
      - hodaur.adm.crans.org
      - charybde.adm.crans.org
    config:
      - job_name: nginx
        file_sd_configs:
          - files:
              - '/etc/prometheus/targets_nginx.json'
        relabel_configs:
          # Do not put :9117 in the instance name, only in the scraped address
          - source_labels: [__address__]
            target_label: instance
          - source_labels: [instance]
            target_label: __address__
            replacement: '$1:9117'
  # HTTP probes, performed by the blackbox exporter on 127.0.0.1:9115 with
  # the URL passed as the `target` parameter.
  blackbox:
    file: targets_blackbox.json
    targets:
      - https://crans.org/
      - https://www.crans.org/
      - https://webirc.crans.org/
      - https://jitsi.crans.org/
      - https://ftps.crans.org/
      - http://ftp.crans.org/
      - https://grafana.crans.org/
      - https://roundcube.crans.org/
      - https://zero.crans.org/
      - https://wiki.crans.org/PageAccueil
      - https://framadate.crans.org/
      - https://pad.crans.org/
      - https://lists.crans.org/
      - https://cas.crans.org/
      - https://ethercalc.crans.org/
      - https://phabricator.crans.org/
      - https://webmail.crans.org/horde/login.php
      - https://gitlab.crans.org/
      - https://perso.crans.org/crans/
      - https://install-party.crans.org/
      - https://intranet.crans.org/
      - https://owncloud.crans.org/
    config:
      - job_name: blackbox
        file_sd_configs:
          - files:
              - '/etc/prometheus/targets_blackbox.json'
        metrics_path: /probe
        params:
          module: [http_2xx]  # Look for a HTTP 200 response.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: 127.0.0.1:9115
  mtail:
    file: targets_mtail.json
    targets:
      - tealc.adm.crans.org
    config:
      - job_name: mtail
        # Read the generated target file like every other job, so the
        # `targets` list above is the single place where hosts are declared.
        file_sd_configs:
          - files:
              - '/etc/prometheus/targets_mtail.json'
        relabel_configs:
          # Do not put :3903 in the instance name, only in the scraped address
          - source_labels: [__address__]
            target_label: instance
          - source_labels: [instance]
            target_label: __address__
            replacement: '$1:3903'
#  apache:
#    targets:
#    config:
#      - job_name: apache
#        file_sd_configs:
#          - files:
#              - '/etc/prometheus/targets_apache.json'
#        relabel_configs:
#          - source_labels: [__address__]
#            target_label: instance
#          - source_labels: [instance]
#            target_label: __address__
#            replacement: '$1:9117'
#  bird_targets:
#    - routeur-sam.adm.crans.org

View File

@ -25,3 +25,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -18,3 +18,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -14,6 +14,9 @@ glob_ntp_client:
debian_mirror: http://172.17.10.202/debian
# Node exporter bind address: first IPv4 the `ldap` lookup returns for each
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}
loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}

View File

@ -37,3 +37,9 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}
# nginx exporter bind address (served on port 9117 by the role): first IPv4
# the `ldap` lookup returns for this host on the `cachan-adm` network.
glob_prometheus_nginx_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -22,3 +22,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -18,3 +18,9 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}
# nginx exporter bind address (served on port 9117 by the role): first IPv4
# the `ldap` lookup returns for this host on the `cachan-adm` network.
glob_prometheus_nginx_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -23,3 +23,6 @@ loc_borg:
remote:
- borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }}
ssh_options: ""
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

View File

@ -27,3 +27,6 @@ loc_borg:
ssh_options: ""
to_exclude:
- /var/lib/backuppc
# Node exporter bind address: first IPv4 the `ldap` lookup returns for this
# host on the `cachan-adm` network.
glob_prometheus_node_exporter:
  listen_addr: >-
    {{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}

14
hosts
View File

@ -18,6 +18,9 @@ tealc.adm.crans.org
tealc.adm.crans.org
gulp.cachan-adm.crans.org
# Hosts targeted by the `hosts: blackbox` play, which applies the
# prometheus-blackbox-exporter role.
[blackbox]
monitoring.adm.crans.org
[bdd:children]
virtu
@ -81,6 +84,9 @@ neree.adm.crans.org
[gitlab]
gitzly.adm.crans.org
# Hosts for the `hosts: grafana` play (that play is currently commented out
# in the monitoring playbook).
[grafana]
monitoring.adm.crans.org
[horde]
horde.adm.crans.org
@ -99,17 +105,16 @@ linx.adm.crans.org
[mailman]
mailman.adm.crans.org
[monitoring]
[prometheus]
monitoring.adm.crans.org
[nginx]
charybde.adm.crans.org
fyre.cachan-adm.crans.org
[nginx:children]
django_cas
galene
jitsi
mailman
ntp_server
re2o_front
reverseproxy
roundcube
@ -212,6 +217,7 @@ cas.adm.crans.org
codichotomie.adm.crans.org
ethercalc.adm.crans.org
fluxx.adm.crans.org
fyre.cachan-adm.crans.org
gitlab-ci.adm.crans.org
gitzly.adm.crans.org
hodaur.adm.crans.org

View File

@ -1,81 +1,43 @@
#!/usr/bin/env ansible-playbook
---
# Deploy Prometheus and Grafana on monitoring server
- hosts: monitoring
# Deploy Prometheus on monitoring server
- hosts: prometheus
vars:
# Prometheus targets.json
prometheus:
node_targets: "{{ groups['server'] | list | sort }}"
ups_snmp_targets:
- pulsar.adm.crans.org # 0B
- quasar.adm.crans.org # 4J
procurve_snmp_targets:
- batg-9.infra.crans.org
unifi_snmp_targets: "{{ groups['crans_unifi'] | list | sort }}"
blackbox_targets:
- https://crans.org/
- https://www.crans.org/
- https://webirc.crans.org/
- https://jitsi.crans.org/
- https://ftps.crans.org/
- http://ftp.crans.org/
- https://grafana.crans.org/
- https://roundcube.crans.org/
- https://zero.crans.org/
- https://wiki.crans.org/PageAccueil
- https://framadate.crans.org/
- https://pad.crans.org/
- https://lists.crans.org/
- https://cas.crans.org/
- https://ethercalc.crans.org/
- https://phabricator.crans.org/
- https://webmail.crans.org/horde/login.php
- https://gitlab.crans.org/
- https://perso.crans.org/crans/
- https://install-party.crans.org/
- https://intranet.crans.org/
- https://owncloud.crans.org/
nginx_targets:
- hodaur.adm.crans.org
- charybde.adm.crans.org
apache_targets: [] # [zamok.adm.crans.org]
bird_targets:
- routeur-sam.adm.crans.org
snmp_procurve_password: "{{ vault.snmp_procurve_password }}"
snmp_unifi_password: "{{ vault.snmp_unifi_password }}"
grafana:
root_url: https://grafana.crans.org
icon: crans_icon_white.svg
ldap_base: "{{ glob_ldap.base }}"
ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"
prometheus: "{{ glob_prometheus | default({}) | combine(loc_prometheus | default({})) }}"
alertmanager: "{{ glob_alertmanager | default({}) | combine(loc_alertmanager | default({})) }}"
snmp_exporter: "{{ glob_snmp_exporter | default({}) | combine(loc_snmp_exporter | default({})) }}"
ninjabot: "{{ glob_ninjabot | default({}) | combine(loc_ninjabot | default({})) }}"
roles:
- prometheus
- prometheus-alertmanager
- prometheus-snmp-exporter
- prometheus-blackbox-exporter
- ninjabot
- grafana
# # Deploy Grafana on monitoring server
# - hosts: grafana
# vars:
# grafana: "{{ glob_grafana | default({}) | combine(loc_grafana | default({})) }}"
# roles:
# - grafana
# Deploy the Prometheus blackbox exporter on the hosts of the blackbox group
- hosts: blackbox
  roles:
    - prometheus-blackbox-exporter
# Monitor all hosts
- hosts: server
vars:
adm_ipv4: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
roles: ["prometheus-node-exporter"]
prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
roles:
- prometheus-node-exporter
# Export nginx metrics
- hosts: nginx
vars:
adm_ipv4: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
roles: ["prometheus-nginx-exporter"]
# Export apache metrics
#- hosts: zamok.adm.crans.org
# vars:
# adm_ipv4: "{{ ansible_all_ipv4_addresses | ipaddr(adm_subnet) | first }}"
# roles: ["prometheus-apache-exporter"]
prometheus_nginx_exporter: "{{ glob_prometheus_nginx_exporter | default({}) | combine(loc_prometheus_nginx_exporter | default({})) }}"
roles:
- prometheus-nginx-exporter
# Monitor mailq with a special text exporter
#- hosts: redisdead.adm.crans.org
@ -85,7 +47,9 @@
# Run the mtail role on tealc; `mirror` is the host-local loc_mirror merged
# over the global glob_mirror.
- hosts: tealc.adm.crans.org
  vars:
    mirror: '{{ glob_mirror | default({}) | combine(loc_mirror | default({})) }}'
  # Single `roles` key in block form (a duplicate key would be silently
  # resolved as last-one-wins by most YAML parsers).
  roles:
    - mtail
# Run the statping role on sputnik.
- hosts: sputnik.adm.crans.org
  # Single `roles` key in block form (a duplicate key would be silently
  # resolved as last-one-wins by most YAML parsers).
  roles:
    - statping

View File

@ -17,6 +17,11 @@
dest: /var/local/ninjabot
version: master
# Render ninjabot.config as JSON next to the bot's code.
- name: Deploy NinjaBot configuration
  template:
    src: ninjabot/ninjabot.json.j2
    dest: /var/local/ninjabot/ninjabot.json
    # Explicit mode: the service runs as `nobody`, so the file has to be
    # world-readable whatever the umask of the deploying user is.
    mode: "0644"
- name: Deploy NinjaBot systemd unit
template:
src: systemd/system/ninjabot.service.j2

View File

@ -0,0 +1 @@
{# Dump the `ninjabot.config` mapping as pretty-printed JSON (2-space indent). -#}
{{ ninjabot.config | to_nice_json(indent=2) }}

View File

@ -8,7 +8,7 @@ Type=simple
WorkingDirectory=/var/local/ninjabot
User=nobody
Group=nogroup
ExecStart=/usr/bin/python3 /var/local/ninjabot/main.py
ExecStart=/usr/bin/python3 /var/local/ninjabot/ninjabot.py
Restart=always
[Install]

View File

@ -14,7 +14,7 @@
path: /etc/default/prometheus-nginx-exporter
regexp: '^ARGS='
line: |
ARGS="-web.listen-address={{ adm_ipv4 }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
ARGS="-web.listen-address={{ prometheus_nginx_exporter.listen_addr }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
notify:
- Restart nginx
- Restart prometheus-nginx-exporter

View File

@ -7,21 +7,6 @@
register: apt_result
retries: 3
until: apt_result is succeeded
when:
- ansible_lsb.codename != 'stretch'
# Prometheus 2 node is in stretch-backports
- name: Install Prometheus node-exporter (stretch-backports)
apt:
update_cache: true
name: prometheus-node-exporter
install_recommends: false
default_release: stretch-backports
register: apt_result
retries: 3
until: apt_result is succeeded
when:
- ansible_lsb.codename == 'stretch'
- name: Install Prometheus node-exporter-collectors (bullseye)
apt:
@ -45,7 +30,7 @@
path: /etc/default/prometheus-node-exporter
regexp: '^ARGS='
line: |
ARGS="--web.listen-address={{ adm_ipv4 }}:9100"
ARGS="--web.listen-address={{ prometheus_node_exporter.listen_addr }}:9100"
tags: restart-node-exporter
# Install new APT textfile collector, it might be upstreamed one day
@ -57,15 +42,4 @@
owner: root
group: root
mode: 0755
when: ansible_lsb.id == 'Debian' and ansible_distribution_release != "bullseye"
# Install new APT textfile collector, it might be upstreamed one day
# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35
- name: Patch APT textfile collector
copy:
src: apt.sh
dest: /usr/share/prometheus-node-exporter-collectors/apt.sh
owner: root
group: root
mode: 0755
when: ansible_lsb.id == 'Ubuntu' or ansible_distribution_release == "bullseye"
when: ansible_distribution_release != "bullseye"

View File

@ -113,14 +113,14 @@ procurve_switch:
version: 3
auth:
# To create SNMPv3 user on HP procurve, execute:
# snmpv3 user snmp_prometheus auth sha {{ snmp_procurve_password }} priv aes {{ snmp_procurve_password }}
# snmpv3 user snmp_prometheus auth sha {{ snmp_exporter.procurve_password }} priv aes {{ snmp_exporter.procurve_password }}
# snmpv3 group managerpriv user snmp_prometheus sec-model ver3
security_level: authPriv
username: snmp_prometheus
password: {{ snmp_procurve_password }}
password: {{ snmp_exporter.procurve_password }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_procurve_password }}
priv_password: {{ snmp_exporter.procurve_password }}
ubiquiti_unifi:
walk:
@ -475,7 +475,7 @@ ubiquiti_unifi:
auth:
security_level: authPriv
username: snmp_prometheus
password: {{ snmp_unifi_password }}
password: {{ snmp_exporter.unifi_password }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_unifi_password }}
priv_password: {{ snmp_exporter.unifi_password }}

View File

@ -16,35 +16,18 @@
# Deploy the single alert rule file; a change triggers a Prometheus restart.
# One `src`/`dest` pair and no loop: alert.rules.yml is the only rule
# template left.
- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    # Quoted so the mode is passed as an octal string, not a YAML integer
    mode: "0644"
  notify: Restart Prometheus
# We don't need to restart Prometheus when updating nodes
# One target file is written per entry of the `prometheus` mapping:
# item.value.file is the file name, item.value.targets its content.
- name: Configure Prometheus targets
  copy:
    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
    dest: "/etc/prometheus/{{ item.value.file }}"
    # Quoted so the mode is passed as an octal string, not a YAML integer
    mode: "0644"
  loop: "{{ prometheus | dict2items }}"
  loop_control:
    # Show only the entry name in the task output instead of the whole item
    label: "{{ item.key }}"
- name: Activate prometheus service
systemd:

View File

@ -144,7 +144,7 @@ groups:
description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
- alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 225) or (upsOutputVoltage > 235)
expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
for: 5m
labels:
severity: warning
@ -161,29 +161,27 @@ groups:
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."
# Warn when postfix_mailq_length stays above 25 for one minute.
# Single `expr` key: a duplicate key would be silently resolved as
# last-one-wins by most YAML parsers.
- alert: MailqNotEmpty
  expr: postfix_mailq_length > 25
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."
# NTP (need NTP plugin in node)
# - alert: ntp_drifting
# expr: node_ntp_drift_seconds > 0.05
# for: 3m
# labels:
# severity: critical
# annotations:
# summary: "Décalage NTP trop élevé sur {{ $labels.instance }}"
# description: "Le décalage NTP est trop élevé ({{ $value }} > 0.05)"
- alert: NoRadiusLogin
expr: rate(radiusd_access_ok[3m]) == 0
for: 2m
labels:
severity: warning
annotations:
summary: "Personne ne vient taper le RADIUS."
- alert: TooManyReallocatedSectors
expr: smartmon_reallocated_sector_ct_raw_value > 1e3
for: 5m
labels:
severity: warning
annotations:
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."
# - alert: ntp_drifting
# expr: node_ntp_drift_seconds > 0.01
# for: 1m
# labels:
# severity: warning
# annotations:
# summary: "Décalage NTP élevé sur {{ $labels.instance }}"
# description: "Le décalage NTP est élevé ({{ $value }} > 0.01)"
{% endraw %}

View File

@ -1,106 +0,0 @@
{{ ansible_header | comment }}
{# As this is also Jinja2 it will conflict without a raw block #}
{% raw %}
groups:
- name: django.rules
rules:
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
- record: job:django_http_ajax_requests_total:sum_rate30s
expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
BY (job)
- record: job:django_http_requests_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
- record: job:django_http_responses_streaming_total:sum_rate30s
expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
- record: job:django_http_responses_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
- record: job:django_http_requests_total:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
- record: job:django_http_requests_total_by_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
- record: job:django_http_requests_total_by_transport:sum_rate30s
expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
- record: job:django_http_requests_total_by_view:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
view)
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
view, transport, method)
- record: job:django_http_responses_total_by_templatename:sum_rate30s
expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
- record: job:django_http_responses_total_by_status:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
- record: job:django_http_responses_total_by_charset:sum_rate30s
expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
- record: job:django_http_exceptions_total_by_type:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
- record: job:django_http_exceptions_total_by_view:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99.9"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99.9"
- record: job:django_model_inserts_total:sum_rate1m
expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
- record: job:django_model_updates_total:sum_rate1m
expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
- record: job:django_model_deletes_total:sum_rate1m
expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
- record: job:django_db_new_connections_total:sum_rate30s
expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
- record: job:django_db_new_connection_errors_total:sum_rate30s
expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
- record: job:django_db_execute_total:sum_rate30s
expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
- record: job:django_db_execute_many_total:sum_rate30s
expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
- record: job:django_db_errors_total:sum_rate30s
expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
- record: job:django_migrations_applied_total:max
expr: max(django_migrations_applied_total) BY (job, connection)
- record: job:django_migrations_unapplied_total:max
expr: max(django_migrations_unapplied_total) BY (job, connection)
{% endraw %}

View File

@ -20,156 +20,23 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The .json in file_sd_configs is dynamically reloaded
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- job_name: servers
file_sd_configs:
- files:
- '/etc/prometheus/targets.json'
relabel_configs:
# Do not put :9100 in instance name, rather here
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9100'
{% if prometheus.ups_snmp_targets is defined %}
- job_name: ups_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ups_snmp.json'
metrics_path: /snmp
params:
module: [eatonups]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
{% if prometheus.procurve_snmp_targets is defined %}
- job_name: procurve_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_procurve_snmp.json'
metrics_path: /snmp
params:
module: [procurve_switch]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
{% if prometheus.unifi_snmp_targets is defined %}
- job_name: unifi_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_unifi_snmp.json'
metrics_path: /snmp
params:
module: [ubiquiti_unifi]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
{% if prometheus.nginx_targets is defined %}
- job_name: nginx
file_sd_configs:
- files:
- '/etc/prometheus/targets_nginx.json'
relabel_configs:
# Do not put :9117 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
{% endif %}
{% if prometheus.apache_targets is defined %}
- job_name: apache
file_sd_configs:
- files:
- '/etc/prometheus/targets_apache.json'
relabel_configs:
# Do not put :9117 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9117'
{% endif %}
{% if prometheus.blackbox_targets is defined %}
- job_name: blackbox
file_sd_configs:
- files:
- '/etc/prometheus/targets_blackbox.json'
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115
{% endif %}
- job_name: mtail
static_configs:
- targets: ["tealc.adm.crans.org"]
relabel_configs:
# Do not put :3903 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:3903'
{% if prometheus.bird_targets is defined %}
- job_name: bird
file_sd_configs:
- files:
- '/etc/prometheus/targets_bird.json'
relabel_configs:
# Do not put :3903 in instance name, rather here
- source_labels: [__address__]
target_label: instance
- source_labels: [instance]
target_label: __address__
replacement: '$1:9324'
{% endif %}
- job_name: django
scheme: https
static_configs:
- targets: []
# Activate this line when the captive portal need monitoring
# - targets: ["portail-captif.crans.org:443"]
{#
  Build the scrape_configs section: Prometheus' own job first, followed by
  every job listed under `prometheus.<entry>.config`.
  `*.config[]` flattens the per-entry lists, so an entry may declare several
  jobs and an entry without `config` contributes nothing; the default()
  keeps the list concatenation valid when the query yields no result.
-#}
{{
  {
    "scrape_configs":
      [
        {
          "job_name": "prometheus",
          "static_configs": [
            {
              "targets": [
                "localhost:9090"
              ]
            }
          ]
        }
      ] + (prometheus | json_query("*.config[]") | default([], true))
  } | to_nice_yaml(indent=2)
}}