summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com>2018-01-29 03:25:44 -0800
committerGitHub <noreply@github.com>2018-01-29 03:25:44 -0800
commit2ec70a36f50f670887f2f257d348db6332de5d26 (patch)
tree4e9eebca6c1979b57275d0a60ac30cb6dfcb1457
parent312692222f11c19df680867b4e497947e343d017 (diff)
parent3de29f6d5a3017b57c553c5e2fb63a50994df840 (diff)
downloadopenshift-2ec70a36f50f670887f2f257d348db6332de5d26.tar.gz
openshift-2ec70a36f50f670887f2f257d348db6332de5d26.tar.bz2
openshift-2ec70a36f50f670887f2f257d348db6332de5d26.tar.xz
openshift-2ec70a36f50f670887f2f257d348db6332de5d26.zip
Merge pull request #6811 from mjudeikis/prometheus-new-template
Automatic merge from submit-queue. Prometheus new template rebase Updating Prometheus for new templates/example. 1. New scraping rules, fixes 2. exposed alert manager 3. clean ansible 4. add a custom annotation for routes (in the example when AVI router in use we need to be able to add custom annotations) 5. Externalise some of the configs Still work in progress... FIY: @zgalor
-rw-r--r--playbooks/openshift-prometheus/private/uninstall.yml8
-rw-r--r--playbooks/openshift-prometheus/uninstall.yml2
-rw-r--r--roles/openshift_prometheus/defaults/main.yaml15
-rw-r--r--roles/openshift_prometheus/tasks/facts.yaml10
-rw-r--r--roles/openshift_prometheus/tasks/install_prometheus.yaml119
-rw-r--r--roles/openshift_prometheus/tasks/main.yaml4
-rw-r--r--roles/openshift_prometheus/tasks/uninstall.yaml (renamed from roles/openshift_prometheus/tasks/uninstall_prometheus.yaml)0
-rw-r--r--roles/openshift_prometheus/templates/prometheus.j292
-rw-r--r--roles/openshift_prometheus/templates/prometheus.yml.j2175
9 files changed, 283 insertions, 142 deletions
diff --git a/playbooks/openshift-prometheus/private/uninstall.yml b/playbooks/openshift-prometheus/private/uninstall.yml
new file mode 100644
index 000000000..2df39c2a8
--- /dev/null
+++ b/playbooks/openshift-prometheus/private/uninstall.yml
@@ -0,0 +1,8 @@
+---
+- name: Uninstall Prometheus
+ hosts: masters[0]
+ tasks:
+ - name: Run the Prometheus Uninstall Role Tasks
+ include_role:
+ name: openshift_prometheus
+ tasks_from: uninstall
diff --git a/playbooks/openshift-prometheus/uninstall.yml b/playbooks/openshift-prometheus/uninstall.yml
new file mode 100644
index 000000000..c92ade786
--- /dev/null
+++ b/playbooks/openshift-prometheus/uninstall.yml
@@ -0,0 +1,2 @@
+---
+- import_playbook: private/uninstall.yml
diff --git a/roles/openshift_prometheus/defaults/main.yaml b/roles/openshift_prometheus/defaults/main.yaml
index 1b21c4739..37a05f3f0 100644
--- a/roles/openshift_prometheus/defaults/main.yaml
+++ b/roles/openshift_prometheus/defaults/main.yaml
@@ -7,9 +7,24 @@ openshift_prometheus_namespace: openshift-metrics
# defaults hosts for routes
openshift_prometheus_hostname: prometheus-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
openshift_prometheus_alerts_hostname: alerts-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
+openshift_prometheus_alertmanager_hostname: alertmanager-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
+
openshift_prometheus_node_selector: {"region":"infra"}
+openshift_prometheus_service_port: 443
+openshift_prometheus_service_targetport: 8443
+openshift_prometheus_service_name: prometheus
+openshift_prometheus_alerts_service_targetport: 9443
+openshift_prometheus_alerts_service_name: alerts
+openshift_prometheus_alertmanager_service_targetport: 10443
+openshift_prometheus_alertmanager_service_name: alertmanager
+openshift_prometheus_serviceaccount_annotations: []
+l_openshift_prometheus_serviceaccount_annotations:
+ - serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+ - serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+ - serviceaccounts.openshift.io/oauth-redirectreference.alertmanager='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alertmanager"}}'
+
# additional prometheus rules file
openshift_prometheus_additional_rules_file: null
diff --git a/roles/openshift_prometheus/tasks/facts.yaml b/roles/openshift_prometheus/tasks/facts.yaml
new file mode 100644
index 000000000..214089732
--- /dev/null
+++ b/roles/openshift_prometheus/tasks/facts.yaml
@@ -0,0 +1,10 @@
+---
+# The kubernetes version impacts the prometheus scraping endpoint
+# so gathering it before constructing the configmap
+- name: get oc version
+ oc_version:
+ register: oc_version
+
+- set_fact:
+ kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
+ openshift_prometheus_serviceaccount_annotations: "{{ l_openshift_prometheus_serviceaccount_annotations + openshift_prometheus_serviceaccount_annotations|list }}"
diff --git a/roles/openshift_prometheus/tasks/install_prometheus.yaml b/roles/openshift_prometheus/tasks/install_prometheus.yaml
index 749df5152..0b565502f 100644
--- a/roles/openshift_prometheus/tasks/install_prometheus.yaml
+++ b/roles/openshift_prometheus/tasks/install_prometheus.yaml
@@ -1,4 +1,6 @@
---
+# set facts
+- include_tasks: facts.yaml
# namespace
- name: Add prometheus project
@@ -9,7 +11,7 @@
description: Prometheus
# secrets
-- name: Set alert and prometheus secrets
+- name: Set alert, alertmanager and prometheus secrets
oc_secret:
state: present
name: "{{ item }}-proxy"
@@ -20,30 +22,24 @@
with_items:
- prometheus
- alerts
+ - alertmanager
# serviceaccount
- name: create prometheus serviceaccount
oc_serviceaccount:
state: present
- name: prometheus
+ name: "{{ openshift_prometheus_service_name }}"
namespace: "{{ openshift_prometheus_namespace }}"
- # TODO add annotations when supproted
- # annotations:
- # serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
- # serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
-
- secrets:
- - prometheus-secrets
changed_when: no
+
# TODO remove this when annotations are supported by oc_serviceaccount
- name: annotate serviceaccount
command: >
{{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
- serviceaccount prometheus
- serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
- serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
-
+ serviceaccount {{ openshift_prometheus_service_name }} {{ item }}
+ with_items:
+ "{{ openshift_prometheus_serviceaccount_annotations }}"
# create clusterrolebinding for prometheus serviceaccount
- name: Set cluster-reader permissions for prometheus
@@ -52,63 +48,61 @@
namespace: "{{ openshift_prometheus_namespace }}"
resource_kind: cluster-role
resource_name: cluster-reader
- user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus"
+ user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:{{ openshift_prometheus_service_name }}"
+
-# create prometheus and alerts services
-# TODO join into 1 task with loop
-- name: Create prometheus service
+- name: create services for prometheus
oc_service:
- state: present
- name: "{{ item.name }}"
+ name: "{{ openshift_prometheus_service_name }}"
namespace: "{{ openshift_prometheus_namespace }}"
- selector:
- app: prometheus
labels:
- name: "{{ item.name }}"
- # TODO add annotations when supported
- # annotations:
- # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+ name: prometheus
+ annotations:
+ oprometheus.io/scrape: 'true'
+ oprometheus.io/scheme: https
+ service.alpha.openshift.io/serving-cert-secret-name: prometheus-tls
ports:
- - port: 443
- targetPort: 8443
- with_items:
- - name: prometheus
+ - name: prometheus
+ port: "{{ openshift_prometheus_service_port }}"
+ targetPort: "{{ openshift_prometheus_service_targetport }}"
+ protocol: TCP
+ selector:
+ app: prometheus
-- name: Create alerts service
+- name: create services for alert buffer
oc_service:
- state: present
- name: "{{ item.name }}"
+ name: "{{ openshift_prometheus_alerts_service_name }}"
namespace: "{{ openshift_prometheus_namespace }}"
+ labels:
+ name: prometheus
+ annotations:
+ service.alpha.openshift.io/serving-cert-secret-name: alerts-tls
+ ports:
+ - name: prometheus
+ port: "{{ openshift_prometheus_service_port }}"
+ targetPort: "{{ openshift_prometheus_alerts_service_targetport }}"
+ protocol: TCP
selector:
app: prometheus
+
+- name: create services for alertmanager
+ oc_service:
+ name: "{{ openshift_prometheus_alertmanager_service_name }}"
+ namespace: "{{ openshift_prometheus_namespace }}"
labels:
- name: "{{ item.name }}"
- # TODO add annotations when supported
- # annotations:
- # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+ name: prometheus
+ annotations:
+ service.alpha.openshift.io/serving-cert-secret-name: alertmanager-tls
ports:
- - port: 443
- targetPort: 9443
- with_items:
- - name: alerts
-
-
-# Annotate services with secret name
-# TODO remove this when annotations are supported by oc_service
-- name: annotate prometheus service
- command: >
- {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
- service prometheus
- prometheus.io/scrape='true'
- prometheus.io/scheme=https
- service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls
-
-- name: annotate alerts service
- command: >
- {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
- service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls'
+ - name: prometheus
+ port: "{{ openshift_prometheus_service_port }}"
+ targetPort: "{{ openshift_prometheus_alertmanager_service_targetport }}"
+ protocol: TCP
+ selector:
+ app: prometheus
# create prometheus and alerts routes
+# TODO: oc_route module should support insecureEdgeTerminationPolicy: Redirect
- name: create prometheus and alerts routes
oc_route:
state: present
@@ -122,6 +116,8 @@
host: "{{ openshift_prometheus_hostname }}"
- name: alerts
host: "{{ openshift_prometheus_alerts_hostname }}"
+ - name: alertmanager
+ host: "{{ openshift_prometheus_alertmanager_hostname }}"
# Storage
- name: create prometheus pvc
@@ -169,15 +165,6 @@
path: "{{ tempdir }}/prometheus.additional.rules"
register: additional_rules_stat
-# The kubernetes version impacts the prometheus scraping endpoint
-# so gathering it before constructing the configmap
-- name: get oc version
- oc_version:
- register: oc_version
-
-- set_fact:
- kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
-
- template:
src: prometheus.yml.j2
dest: "{{ tempdir }}/prometheus.yml"
@@ -219,7 +206,7 @@
- name: Set alertmanager configmap
oc_configmap:
state: present
- name: "prometheus-alerts"
+ name: "alertmanager"
namespace: "{{ openshift_prometheus_namespace }}"
from_file:
alertmanager.yml: "{{ tempdir }}/alertmanager.yml"
diff --git a/roles/openshift_prometheus/tasks/main.yaml b/roles/openshift_prometheus/tasks/main.yaml
index b859eb111..66d65a3f2 100644
--- a/roles/openshift_prometheus/tasks/main.yaml
+++ b/roles/openshift_prometheus/tasks/main.yaml
@@ -16,9 +16,11 @@
- name: Create templates subdirectory
file:
state: directory
- path: "{{ tempdir }}/templates"
+ path: "{{ tempdir }}/{{ item }}"
mode: 0755
changed_when: False
+ with_items:
+ - templates
- include_tasks: install_prometheus.yaml
when: openshift_prometheus_state == 'present'
diff --git a/roles/openshift_prometheus/tasks/uninstall_prometheus.yaml b/roles/openshift_prometheus/tasks/uninstall.yaml
index d746402db..d746402db 100644
--- a/roles/openshift_prometheus/tasks/uninstall_prometheus.yaml
+++ b/roles/openshift_prometheus/tasks/uninstall.yaml
diff --git a/roles/openshift_prometheus/templates/prometheus.j2 b/roles/openshift_prometheus/templates/prometheus.j2
index d780550b8..c0abd483b 100644
--- a/roles/openshift_prometheus/templates/prometheus.j2
+++ b/roles/openshift_prometheus/templates/prometheus.j2
@@ -19,7 +19,7 @@ spec:
labels:
app: prometheus
spec:
- serviceAccountName: prometheus
+ serviceAccountName: "{{ openshift_prometheus_service_name }}"
{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
nodeSelector:
{% for key, value in openshift_prometheus_node_selector.items() %}
@@ -47,15 +47,15 @@ spec:
cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
{% endif %}
ports:
- - containerPort: 8443
+ - containerPort: {{ openshift_prometheus_service_targetport }}
name: web
args:
- -provider=openshift
- - -https-address=:8443
+ - -https-address=:{{ openshift_prometheus_service_targetport }}
- -http-address=
- -email-domain=*
- -upstream=http://localhost:9090
- - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
- '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
- '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
- -tls-cert=/etc/tls/private/tls.crt
@@ -67,9 +67,9 @@ spec:
- -skip-auth-regex=^/metrics
volumeMounts:
- mountPath: /etc/tls/private
- name: prometheus-tls
+ name: prometheus-tls-secret
- mountPath: /etc/proxy/secrets
- name: prometheus-secrets
+ name: prometheus-proxy-secret
- mountPath: /prometheus
name: prometheus-data
@@ -104,7 +104,7 @@ spec:
- mountPath: /prometheus
name: prometheus-data
- # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+ # Deploy alert-buffer behind oauth alerts-proxy
- name: alerts-proxy
image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
imagePullPolicy: IfNotPresent
@@ -124,15 +124,15 @@ spec:
cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
{% endif %}
ports:
- - containerPort: 9443
+ - containerPort: {{ openshift_prometheus_alerts_service_targetport }}
name: web
args:
- -provider=openshift
- - -https-address=:9443
+ - -https-address=:{{ openshift_prometheus_alerts_service_targetport }}
- -http-address=
- -email-domain=*
- -upstream=http://localhost:9099
- - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
- '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
- '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
- -tls-cert=/etc/tls/private/tls.crt
@@ -143,9 +143,9 @@ spec:
- -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
volumeMounts:
- mountPath: /etc/tls/private
- name: alerts-tls
+ name: alerts-tls-secret
- mountPath: /etc/proxy/secrets
- name: alerts-secrets
+ name: alerts-proxy-secret
- name: alert-buffer
args:
@@ -169,11 +169,54 @@ spec:
{% endif %}
volumeMounts:
- mountPath: /alert-buffer
- name: alert-buffer-data
+ name: alerts-data
ports:
- containerPort: 9099
name: alert-buf
+ # Deploy alertmanager behind oauth alertmanager-proxy
+ - name: alertmanager-proxy
+ image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
+ imagePullPolicy: IfNotPresent
+ requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+ memory: "{{ openshift_prometheus_oauth_proxy_memory_requests }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+ cpu: "{{ openshift_prometheus_oauth_proxy_cpu_requests }}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+ memory: "{{ openshift_prometheus_oauth_proxy_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+ cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
+{% endif %}
+ ports:
+ - containerPort: {{ openshift_prometheus_alertmanager_service_targetport }}
+ name: web
+ args:
+ - -provider=openshift
+ - -https-address=:{{ openshift_prometheus_alertmanager_service_targetport }}
+ - -http-address=
+ - -email-domain=*
+ - -upstream=http://localhost:9093
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
+ - -openshift-ca=/etc/pki/tls/cert.pem
+ - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+ - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+ - -tls-cert=/etc/tls/private/tls.crt
+ - -tls-key=/etc/tls/private/tls.key
+ - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+ - -cookie-secret-file=/etc/proxy/secrets/session_secret
+ - -skip-auth-regex=^/metrics
+ volumeMounts:
+ - mountPath: /etc/tls/private
+ name: alertmanager-tls-secret
+ - mountPath: /etc/proxy/secrets
+ name: alertmanager-proxy-secret
+
- name: alertmanager
args:
- -config.file=/etc/alertmanager/alertmanager.yml
@@ -205,14 +248,15 @@ spec:
restartPolicy: Always
volumes:
+
- name: prometheus-config
configMap:
defaultMode: 420
name: prometheus
- - name: prometheus-secrets
+ - name: prometheus-proxy-secret
secret:
secretName: prometheus-proxy
- - name: prometheus-tls
+ - name: prometheus-tls-secret
secret:
secretName: prometheus-tls
- name: prometheus-data
@@ -225,13 +269,19 @@ spec:
- name: alertmanager-config
configMap:
defaultMode: 420
- name: prometheus-alerts
- - name: alerts-secrets
+ name: alertmanager
+ - name: alertmanager-proxy-secret
secret:
- secretName: alerts-proxy
- - name: alerts-tls
+ secretName: alertmanager-proxy
+ - name: alertmanager-tls-secret
+ secret:
+ secretName: alertmanager-tls
+ - name: alerts-tls-secret
secret:
- secretName: prometheus-alerts-tls
+ secretName: alerts-tls
+ - name: alerts-proxy-secret
+ secret:
+ secretName: alerts-proxy
- name: alertmanager-data
{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
persistentVolumeClaim:
@@ -239,7 +289,7 @@ spec:
{% else %}
emptydir: {}
{% endif %}
- - name: alert-buffer-data
+ - name: alerts-data
{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
persistentVolumeClaim:
claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2
index 63430f834..005c2c564 100644
--- a/roles/openshift_prometheus/templates/prometheus.yml.j2
+++ b/roles/openshift_prometheus/templates/prometheus.yml.j2
@@ -1,10 +1,5 @@
rule_files:
- - 'prometheus.rules'
-{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
- - 'prometheus.additional.rules'
-{% endif %}
-
-
+ - '*.rules'
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
@@ -39,31 +34,11 @@ scrape_configs:
action: keep
regex: default;kubernetes;https
-# Scrape config for nodes.
-#
-# Each node exposes a /metrics endpoint that contains operational metrics for
-# the Kubelet and other components.
-- job_name: 'kubernetes-nodes'
-
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
- kubernetes_sd_configs:
- - role: node
-
- relabel_configs:
- - action: labelmap
- regex: __meta_kubernetes_node_label_(.+)
-
# Scrape config for controllers.
#
# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
# the controllers.
#
-# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
-# endpoints.
- job_name: 'kubernetes-controllers'
scheme: https
@@ -87,6 +62,27 @@ scrape_configs:
regex: (.+)(?::\d+)
replacement: $1:8444
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ kubernetes_sd_configs:
+ - role: node
+ # Drop a very high cardinality metric that is incorrect in 3.7. It will be
+ # fixed in 3.9.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+
# Scrape config for cAdvisor.
#
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -107,6 +103,14 @@ scrape_configs:
kubernetes_sd_configs:
- role: node
+ # Exclude a set of high cardinality metrics that can contribute to significant
+ # memory use in large clusters. These can be selectively enabled as necessary
+ # for medium or small clusters.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
+
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
@@ -133,38 +137,101 @@ scrape_configs:
- role: endpoints
relabel_configs:
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
- action: keep
- regex: true
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
- action: replace
- target_label: __scheme__
- regex: (https?)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ # only scrape infrastructure components
+ - source_labels: [__meta_kubernetes_namespace]
+ action: keep
+ regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
+ # drop infrastructure components managed by other scrape targets
+ - source_labels: [__meta_kubernetes_service_name]
+ action: drop
+ regex: 'prometheus-node-exporter'
+ # only those that have requested scraping
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ action: replace
+ target_label: __address__
+ regex: (.+)(?::\d+);(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_service_name]
+ action: replace
+ target_label: kubernetes_name
+
+# Scrape config for node-exporter, which is expected to be running on port 9100.
+- job_name: 'kubernetes-nodes-exporter'
+
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+ kubernetes_sd_configs:
+ - role: node
+
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_cpu|node_(disk|scrape_collector)_.+'
+ # preserve a subset of the network, netstat, vmstat, and filesystem series
+ - source_labels: [__name__]
action: replace
- target_label: __metrics_path__
- regex: (.+)
- - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
+ target_label: __name__
+ replacement: renamed_$1
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_(netstat|vmstat|filesystem|network)_.+'
+ - source_labels: [__name__]
action: replace
+ regex: 'renamed_(.+)'
+ target_label: __name__
+ replacement: $1
+ # drop any partial expensive series
+ - source_labels: [__name__, device]
+ action: drop
+ regex: 'node_network_.+;veth.+'
+ - source_labels: [__name__, mountpoint]
+ action: drop
+ regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
+
+ relabel_configs:
+ - source_labels: [__address__]
+ regex: '(.*):10250'
+ replacement: '${1}:9100'
target_label: __address__
- regex: (.+)(?::\d+);(\d+)
- replacement: $1:$2
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
- action: replace
- target_label: __basic_auth_username__
- regex: (.+)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
- action: replace
- target_label: __basic_auth_password__
- regex: (.+)
+ - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
+ target_label: __instance__
- action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - source_labels: [__meta_kubernetes_namespace]
- action: replace
- target_label: kubernetes_namespace
- - source_labels: [__meta_kubernetes_service_name]
- action: replace
- target_label: kubernetes_name
+ regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for the template service broker
+- job_name: 'openshift-template-service-broker'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+ server_name: apiserver.openshift-template-service-broker.svc
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: openshift-template-service-broker;apiserver;https
+
alerting:
alertmanagers: