[Prometheus] Update to 2.x
parent
528e443d67
commit
f30e5c9f21
|
@ -15,8 +15,8 @@
|
|||
|
||||
- name: Configure Prometheus alert rules
|
||||
template:
|
||||
src: prometheus/alert.rules.j2
|
||||
dest: /etc/prometheus/alert.rules
|
||||
src: prometheus/alert.rules.yml.j2
|
||||
dest: /etc/prometheus/alert.rules.yml
|
||||
notify: Restart Prometheus
|
||||
|
||||
# We don't need to restart Prometheus when updating nodes
|
||||
|
|
|
@ -1,13 +0,0 @@
|
|||
# {{ ansible_managed }}
|
||||
|
||||
{# Prometheus alert templates use the same double-curly-brace delimiters as Jinja2, so the rules are wrapped in a raw block to keep Jinja2 from rendering them #}
|
||||
{% raw %}
|
||||
ALERT InstanceDown
|
||||
IF up == 0
|
||||
FOR 5m
|
||||
LABELS { severity = "page" }
|
||||
ANNOTATIONS {
|
||||
summary = "Instance {{ $labels.instance }} down",
|
||||
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
|
||||
}
|
||||
{% endraw %}
|
|
@ -0,0 +1,25 @@
|
|||
# {{ ansible_managed }}
|
||||
{# Prometheus alert templates use the same double-curly-brace delimiters as Jinja2, so the rules are wrapped in a raw block to keep Jinja2 from rendering them #}
|
||||
{% raw %}
|
||||
groups:
|
||||
- name: example
|
||||
rules:
|
||||
|
||||
# Alert for any instance that is unreachable for >5 minutes.
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
|
||||
# Alert for any instance that has a median request latency >1s.
|
||||
- alert: APIHighRequestLatency
|
||||
expr: api_http_request_latencies_second{quantile="0.5"} > 1
|
||||
for: 10m
|
||||
annotations:
|
||||
summary: "High request latency on {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
|
||||
{% endraw %}
|
|
@ -1,8 +1,8 @@
|
|||
# {{ ansible_managed }}
|
||||
|
||||
global:
|
||||
scrape_interval: 15s # By default, scrape targets every 15 seconds.
|
||||
evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
|
||||
# scrape_interval is set to the global default (60s)
|
||||
# evaluation_interval is set to the global default (60s)
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
|
@ -10,16 +10,15 @@ global:
|
|||
external_labels:
|
||||
monitor: 'example'
|
||||
|
||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
|
||||
rule_files:
|
||||
- "alert.rules"
|
||||
|
||||
# Route alerts to Prometheus Alertmanager
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- 'localhost:9093'
|
||||
- targets: ['localhost:9093']
|
||||
|
||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||
rule_files:
|
||||
- "alert.rules.yml"
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
# Here it's Prometheus itself.
|
||||
|
|
Loading…
Reference in New Issue