[Prometheus] Update to 2.x

2019-05-12 18:34:51 +02:00 · 2019-05-12 18:34:51 +02:00 · f30e5c9f21
parent 528e443d67
commit f30e5c9f21
4 changed files with 35 additions and 24 deletions
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@ -15,8 +15,8 @@
 # Render the alerting rules template into /etc/prometheus. The dest file name
 # must match the rule_files entry in prometheus.yml.j2 ("alert.rules.yml"
 # after this change); a change to the file notifies the restart handler.
 - name: Configure Prometheus alert rules
  template:
-    src: prometheus/alert.rules.j2
+    src: prometheus/alert.rules.yml.j2
-    dest: /etc/prometheus/alert.rules
+    dest: /etc/prometheus/alert.rules.yml
  notify: Restart Prometheus
 # We don't need to restart Prometheus when updating nodes
--- a/roles/prometheus/templates/prometheus/alert.rules.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.j2
@ -1,13 +0,0 @@
 # {{ ansible_managed }}
 {# As this is also Jinja2 it will conflict without a raw block #}
 {% raw %}
 ALERT InstanceDown
  IF up == 0
  FOR 5m
  LABELS { severity = "page" }
  ANNOTATIONS {
    summary = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  }
 {% endraw %}
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@ -0,0 +1,25 @@
 # {{ ansible_managed }}
 {# Prometheus annotations are Go templates whose double-brace delimiters clash with Jinja2; the raw block keeps Jinja2 from expanding them. #}
 {% raw %}
 # Alerting rule groups in the Prometheus 2.x YAML rule-file format.
 groups:
 - name: example
  rules:
  # Alert for any instance that is unreachable for >5 minutes.
  - alert: InstanceDown
    expr: up == 0
    for: 5m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  # Alert for any instance that has a median request latency >1s.
  # NOTE(review): unlike InstanceDown, this rule sets no labels (no severity) -
  # confirm that is intended.
  # NOTE(review): confirm that a scraped target actually exports
  # api_http_request_latencies_second; otherwise this rule can never fire.
  - alert: APIHighRequestLatency
    expr: api_http_request_latencies_second{quantile="0.5"} > 1
    for: 10m
    annotations:
      summary: "High request latency on {{ $labels.instance }}"
      description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
 {% endraw %}
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@ -1,8 +1,8 @@
 # {{ ansible_managed }}
 global:
-  scrape_interval:     15s # By default, scrape targets every 15 seconds.
+  # scrape_interval is set to the global default (60s)
-  evaluation_interval: 15s # By default, scrape targets every 15 seconds.
+  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).
  # Attach these labels to any time series or alerts when communicating with
@ -10,16 +10,15 @@ global:
  external_labels:
      monitor: 'example'
-# Load and evaluate rules in this file every 'evaluation_interval' seconds.
+# Alertmanager configuration
 # NOTE(review): a second rule_files stanza pointing at "alert.rules.yml"
 # appears further down in this hunk. Confirm this older stanza is really
 # removed: duplicate top-level keys are invalid YAML, and the rules template
 # is now rendered to alert.rules.yml, not alert.rules.
 rule_files:
  - "alert.rules"
 # Route alerts to Prometheus Alertmanager
 alerting:
  alertmanagers:
  - static_configs:
-    - targets:
+    - targets: ['localhost:9093']
-      - 'localhost:9093'
+
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
  - "alert.rules.yml"
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.