[Prometheus] Update to 2.x

2019-05-12 18:34:51 +02:00 · 2019-05-12 18:34:51 +02:00 · f30e5c9f21
parent 528e443d67
commit f30e5c9f21
4 changed files with 35 additions and 24 deletions
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -15,8 +15,8 @@

 - name: Configure Prometheus alert rules
  template:
-    src: prometheus/alert.rules.j2
-    dest: /etc/prometheus/alert.rules
+    src: prometheus/alert.rules.yml.j2
+    dest: /etc/prometheus/alert.rules.yml
  notify: Restart Prometheus

 # We don't need to restart Prometheus when updating nodes
--- a/roles/prometheus/templates/prometheus/alert.rules.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.j2
@@ -1,13 +0,0 @@
-# {{ ansible_managed }}
-
-{# As this is also Jinja2 it will conflict without a raw block #}
-{% raw %}
-ALERT InstanceDown
-  IF up == 0
-  FOR 5m
-  LABELS { severity = "page" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }} down",
-    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
-  }
-{% endraw %}
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -0,0 +1,25 @@
+# {{ ansible_managed }}
+{# Prometheus alert templates use the same double-curly-brace syntax as Jinja2, so the rules are wrapped in a raw block to keep Ansible from rendering them. #}
+{% raw %}
+groups:
+- name: example
+  rules:
+
+  # Alert for any instance that is unreachable for >5 minutes.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance {{ $labels.instance }} down"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+
+  # Alert for any instance that has a median request latency >1s.
+  - alert: APIHighRequestLatency
+    expr: api_http_request_latencies_second{quantile="0.5"} > 1
+    for: 10m
+    annotations:
+      summary: "High request latency on {{ $labels.instance }}"
+      description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
+{% endraw %}
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -1,8 +1,8 @@
 # {{ ansible_managed }}

 global:
-  scrape_interval:     15s # By default, scrape targets every 15 seconds.
-  evaluation_interval: 15s # By default, scrape targets every 15 seconds.
+  # scrape_interval is set to the global default (60s).
+  # evaluation_interval is set to the global default (60s).
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
@@ -10,16 +10,15 @@ global:
  external_labels:
      monitor: 'example'

-# Load and evaluate rules in this file every 'evaluation_interval' seconds.
-rule_files:
-  - "alert.rules"
-
-# Route alerts to Prometheus Alertmanager
+# Alertmanager configuration
 alerting:
  alertmanagers:
  - static_configs:
-    - targets:
-      - 'localhost:9093'
+    - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+  - "alert.rules.yml"

 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.