diff options
author | OpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com> | 2018-01-29 03:25:44 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-29 03:25:44 -0800 |
commit | 2ec70a36f50f670887f2f257d348db6332de5d26 (patch) | |
tree | 4e9eebca6c1979b57275d0a60ac30cb6dfcb1457 /roles/openshift_prometheus/templates | |
parent | 312692222f11c19df680867b4e497947e343d017 (diff) | |
parent | 3de29f6d5a3017b57c553c5e2fb63a50994df840 (diff) | |
download | openshift-2ec70a36f50f670887f2f257d348db6332de5d26.tar.gz openshift-2ec70a36f50f670887f2f257d348db6332de5d26.tar.bz2 openshift-2ec70a36f50f670887f2f257d348db6332de5d26.tar.xz openshift-2ec70a36f50f670887f2f257d348db6332de5d26.zip |
Merge pull request #6811 from mjudeikis/prometheus-new-template
Automatic merge from submit-queue.
Prometheus new template rebase
Updating Prometheus for new templates/example.
1. New scraping rules, fixes
2. exposed alert manager
3. clean ansible
4. add a custom annotation for routes (in the example when AVI router in use we need to be able to add custom annotations)
5. Externalise some of the configs
Still work in progress...
FIY: @zgalor
Diffstat (limited to 'roles/openshift_prometheus/templates')
-rw-r--r-- | roles/openshift_prometheus/templates/prometheus.j2 | 92 | ||||
-rw-r--r-- | roles/openshift_prometheus/templates/prometheus.yml.j2 | 175 |
2 files changed, 192 insertions, 75 deletions
diff --git a/roles/openshift_prometheus/templates/prometheus.j2 b/roles/openshift_prometheus/templates/prometheus.j2 index d780550b8..c0abd483b 100644 --- a/roles/openshift_prometheus/templates/prometheus.j2 +++ b/roles/openshift_prometheus/templates/prometheus.j2 @@ -19,7 +19,7 @@ spec: labels: app: prometheus spec: - serviceAccountName: prometheus + serviceAccountName: "{{ openshift_prometheus_service_name }}" {% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %} nodeSelector: {% for key, value in openshift_prometheus_node_selector.items() %} @@ -47,15 +47,15 @@ spec: cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}" {% endif %} ports: - - containerPort: 8443 + - containerPort: {{ openshift_prometheus_service_targetport }} name: web args: - -provider=openshift - - -https-address=:8443 + - -https-address=:{{ openshift_prometheus_service_targetport }} - -http-address= - -email-domain=* - -upstream=http://localhost:9090 - - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }} - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' - -tls-cert=/etc/tls/private/tls.crt @@ -67,9 +67,9 @@ spec: - -skip-auth-regex=^/metrics volumeMounts: - mountPath: /etc/tls/private - name: prometheus-tls + name: prometheus-tls-secret - mountPath: /etc/proxy/secrets - name: prometheus-secrets + name: prometheus-proxy-secret - mountPath: /prometheus name: prometheus-data @@ -104,7 +104,7 @@ spec: - mountPath: /prometheus name: prometheus-data - # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy + # Deploy alert-buffer behind oauth alerts-proxy - name: alerts-proxy image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}" imagePullPolicy: IfNotPresent @@ -124,15 +124,15 @@ spec: cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}" {% endif %} ports: - - containerPort: 9443 + - containerPort: {{ openshift_prometheus_alerts_service_targetport }} name: web args: - -provider=openshift - - -https-address=:9443 + - -https-address=:{{ openshift_prometheus_alerts_service_targetport }} - -http-address= - -email-domain=* - -upstream=http://localhost:9099 - - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }} - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' - -tls-cert=/etc/tls/private/tls.crt @@ -143,9 +143,9 @@ spec: - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt volumeMounts: - mountPath: /etc/tls/private - name: alerts-tls + name: alerts-tls-secret - mountPath: /etc/proxy/secrets - name: alerts-secrets + name: alerts-proxy-secret - name: alert-buffer args: @@ -169,11 +169,54 @@ spec: {% endif %} volumeMounts: - mountPath: /alert-buffer - name: alert-buffer-data + name: alerts-data ports: - containerPort: 9099 name: alert-buf + # Deploy alertmanager behind oauth alertmanager-proxy + - name: alertmanager-proxy + image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}" + imagePullPolicy: IfNotPresent + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{ openshift_prometheus_oauth_proxy_memory_requests }}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{ openshift_prometheus_oauth_proxy_cpu_requests }}" +{% endif %} + limits: +{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{ openshift_prometheus_oauth_proxy_memory_limit }}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}" +{% endif %} + ports: + - containerPort: {{ openshift_prometheus_alertmanager_service_targetport }} + name: web + args: + - -provider=openshift + - -https-address=:{{ openshift_prometheus_alertmanager_service_targetport }} + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9093 + - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }} + - -openshift-ca=/etc/pki/tls/cert.pem + - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + - -skip-auth-regex=^/metrics + volumeMounts: + - mountPath: /etc/tls/private + name: alertmanager-tls-secret + - mountPath: /etc/proxy/secrets + name: alertmanager-proxy-secret + - name: alertmanager args: - -config.file=/etc/alertmanager/alertmanager.yml @@ -205,14 +248,15 @@ spec: restartPolicy: Always volumes: + - name: prometheus-config configMap: defaultMode: 420 name: prometheus - - name: prometheus-secrets + - name: prometheus-proxy-secret secret: secretName: prometheus-proxy - - name: prometheus-tls + - name: prometheus-tls-secret secret: secretName: prometheus-tls - name: prometheus-data @@ -225,13 +269,19 @@ spec: - name: alertmanager-config configMap: defaultMode: 420 - name: prometheus-alerts - - name: alerts-secrets + name: alertmanager + - name: alertmanager-proxy-secret secret: - secretName: alerts-proxy - - name: alerts-tls + secretName: alertmanager-proxy + - name: alertmanager-tls-secret + secret: + secretName: alertmanager-tls + - name: alerts-tls-secret secret: - secretName: prometheus-alerts-tls + secretName: alerts-tls + - name: alerts-proxy-secret + secret: + secretName: alerts-proxy - name: alertmanager-data {% if openshift_prometheus_alertmanager_storage_type == 'pvc' %} persistentVolumeClaim: @@ -239,7 +289,7 @@ spec: {% else %} emptydir: {} {% endif %} - - name: alert-buffer-data + - name: alerts-data {% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %} persistentVolumeClaim: claimName: {{ openshift_prometheus_alertbuffer_pvc_name }} diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2 index 63430f834..005c2c564 100644 --- a/roles/openshift_prometheus/templates/prometheus.yml.j2 +++ b/roles/openshift_prometheus/templates/prometheus.yml.j2 @@ -1,10 +1,5 @@ rule_files: - - 'prometheus.rules' -{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %} - - 'prometheus.additional.rules' -{% endif %} - - + - '*.rules' # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) @@ -39,31 +34,11 @@ scrape_configs: action: keep regex: default;kubernetes;https -# Scrape config for nodes. -# -# Each node exposes a /metrics endpoint that contains operational metrics for -# the Kubelet and other components. -- job_name: 'kubernetes-nodes' - - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - # Scrape config for controllers. # # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for # the controllers. # -# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via -# endpoints. - job_name: 'kubernetes-controllers' scheme: https @@ -87,6 +62,27 @@ scrape_configs: regex: (.+)(?::\d+) replacement: $1:8444 +# Scrape config for nodes. +# +# Each node exposes a /metrics endpoint that contains operational metrics for +# the Kubelet and other components. +- job_name: 'kubernetes-nodes' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + # Drop a very high cardinality metric that is incorrect in 3.7. It will be + # fixed in 3.9. + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)' + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + # Scrape config for cAdvisor. # # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that @@ -107,6 +103,14 @@ scrape_configs: kubernetes_sd_configs: - role: node + # Exclude a set of high cardinality metrics that can contribute to significant + # memory use in large clusters. These can be selectively enabled as necessary + # for medium or small clusters. + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))' + relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) @@ -133,38 +137,101 @@ scrape_configs: - role: endpoints relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + # only scrape infrastructure components + - source_labels: [__meta_kubernetes_namespace] + action: keep + regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+' + # drop infrastructure components managed by other scrape targets + - source_labels: [__meta_kubernetes_service_name] + action: drop + regex: 'prometheus-node-exporter' + # only those that have requested scraping + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +# Scrape config for node-exporter, which is expected to be running on port 9100. +- job_name: 'kubernetes-nodes-exporter' + + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + + kubernetes_sd_configs: + - role: node + + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: 'node_cpu|node_(disk|scrape_collector)_.+' + # preserve a subset of the network, netstat, vmstat, and filesystem series + - source_labels: [__name__] action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))' + target_label: __name__ + replacement: renamed_$1 + - source_labels: [__name__] + action: drop + regex: 'node_(netstat|vmstat|filesystem|network)_.+' + - source_labels: [__name__] action: replace + regex: 'renamed_(.+)' + target_label: __name__ + replacement: $1 + # drop any partial expensive series + - source_labels: [__name__, device] + action: drop + regex: 'node_network_.+;veth.+' + - source_labels: [__name__, mountpoint] + action: drop + regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)' + + relabel_configs: + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:9100' target_label: __address__ - regex: (.+)(?::\d+);(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] - action: replace - target_label: __basic_auth_username__ - regex: (.+) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] - action: replace - target_label: __basic_auth_password__ - regex: (.+) + - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname] + target_label: __instance__ - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for the template service broker +- job_name: 'openshift-template-service-broker' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt + server_name: apiserver.openshift-template-service-broker.svc + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: openshift-template-service-broker;apiserver;https + alerting: alertmanagers: |