[Prometheus] Update to 2.x
parent
528e443d67
commit
f30e5c9f21
|
@ -15,8 +15,8 @@
|
||||||
|
|
||||||
- name: Configure Prometheus alert rules
|
- name: Configure Prometheus alert rules
|
||||||
template:
|
template:
|
||||||
src: prometheus/alert.rules.j2
|
src: prometheus/alert.rules.yml.j2
|
||||||
dest: /etc/prometheus/alert.rules
|
dest: /etc/prometheus/alert.rules.yml
|
||||||
notify: Restart Prometheus
|
notify: Restart Prometheus
|
||||||
|
|
||||||
# We don't need to restart Prometheus when updating nodes
|
# We don't need to restart Prometheus when updating nodes
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
# {{ ansible_managed }}
|
|
||||||
|
|
||||||
{# As this is also Jinja2 it will conflict without a raw block #}
|
|
||||||
{% raw %}
|
|
||||||
ALERT InstanceDown
|
|
||||||
IF up == 0
|
|
||||||
FOR 5m
|
|
||||||
LABELS { severity = "page" }
|
|
||||||
ANNOTATIONS {
|
|
||||||
summary = "Instance {{ $labels.instance }} down",
|
|
||||||
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
|
|
||||||
}
|
|
||||||
{% endraw %}
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
# {{ ansible_managed }}
|
||||||
|
{# As this is also Jinja2 it will conflict without a raw block #}
|
||||||
|
{% raw %}
|
||||||
|
groups:
|
||||||
|
- name: example
|
||||||
|
rules:
|
||||||
|
|
||||||
|
# Alert for any instance that is unreachable for >5 minutes.
|
||||||
|
- alert: InstanceDown
|
||||||
|
expr: up == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: "Instance {{ $labels.instance }} down"
|
||||||
|
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||||
|
|
||||||
|
# Alert for any instance that has a median request latency >1s.
|
||||||
|
- alert: APIHighRequestLatency
|
||||||
|
expr: api_http_request_latencies_second{quantile="0.5"} > 1
|
||||||
|
for: 10m
|
||||||
|
annotations:
|
||||||
|
summary: "High request latency on {{ $labels.instance }}"
|
||||||
|
description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
|
||||||
|
{% endraw %}
|
|
@ -1,8 +1,8 @@
|
||||||
# {{ ansible_managed }}
|
# {{ ansible_managed }}
|
||||||
|
|
||||||
global:
|
global:
|
||||||
scrape_interval: 15s # By default, scrape targets every 15 seconds.
|
# scrape_interval is set to the global default (60s)
|
||||||
evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
|
# evaluation_interval is set to the global default (60s)
|
||||||
# scrape_timeout is set to the global default (10s).
|
# scrape_timeout is set to the global default (10s).
|
||||||
|
|
||||||
# Attach these labels to any time series or alerts when communicating with
|
# Attach these labels to any time series or alerts when communicating with
|
||||||
|
@ -10,16 +10,15 @@ global:
|
||||||
external_labels:
|
external_labels:
|
||||||
monitor: 'example'
|
monitor: 'example'
|
||||||
|
|
||||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
|
# Alertmanager configuration
|
||||||
rule_files:
|
|
||||||
- "alert.rules"
|
|
||||||
|
|
||||||
# Route alerts to Prometheus Alertmanager
|
|
||||||
alerting:
|
alerting:
|
||||||
alertmanagers:
|
alertmanagers:
|
||||||
- static_configs:
|
- static_configs:
|
||||||
- targets:
|
- targets: ['localhost:9093']
|
||||||
- 'localhost:9093'
|
|
||||||
|
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||||
|
rule_files:
|
||||||
|
- "alert.rules.yml"
|
||||||
|
|
||||||
# A scrape configuration containing exactly one endpoint to scrape:
|
# A scrape configuration containing exactly one endpoint to scrape:
|
||||||
# Here it's Prometheus itself.
|
# Here it's Prometheus itself.
|
||||||
|
|
Loading…
Reference in New Issue