From ec75d0ac888f3fab87f8d335224596df045e260a Mon Sep 17 00:00:00 2001 From: Zohar Galor Date: Tue, 20 Jun 2017 17:28:07 +0300 Subject: Create ansible role for deploying prometheus on openshift A new role for installing prometheus on openshift. Depends on `openshift_hosted_prometheus_deploy` flag role creates: - prometheus namespace - prometheus clusterrolebinding and service account - pvs for prometheus, alertmanager and alertbuffer for internal nfs - prometheus pod with prometheus behind oauth-proxy, alertmanager and alert-buffer behind oauth-proxy - prometheus and alertmanager configmaps - prometheus and alerts services and direct routes - prometheus, alertmanager and alert-buffer pvcs --- filter_plugins/oo_filters.py | 15 +- .../byo/openshift-cluster/openshift-prometheus.yml | 4 + .../common/openshift-cluster/openshift_hosted.yml | 3 + .../openshift-cluster/openshift_prometheus.yml | 9 + roles/openshift_prometheus/README.md | 95 ++++++++ roles/openshift_prometheus/defaults/main.yaml | 74 +++++++ .../files/openshift_prometheus.exports | 3 + roles/openshift_prometheus/meta/main.yaml | 19 ++ roles/openshift_prometheus/tasks/create_pvs.yaml | 36 +++ .../tasks/install_prometheus.yaml | 241 +++++++++++++++++++++ roles/openshift_prometheus/tasks/main.yaml | 26 +++ roles/openshift_prometheus/tasks/nfs.yaml | 44 ++++ .../templates/alertmanager.yml.j2 | 20 ++ .../templates/prom-pv-alertbuffer.yml.j2 | 15 ++ .../templates/prom-pv-alertmanager.yml.j2 | 15 ++ .../templates/prom-pv-server.yml.j2 | 15 ++ .../templates/prometheus.rules.j2 | 4 + .../templates/prometheus.yml.j2 | 174 +++++++++++++++ .../templates/prometheus_deployment.j2 | 240 ++++++++++++++++++++ roles/openshift_prometheus/tests/inventory | 2 + roles/openshift_prometheus/tests/test.yaml | 5 + 21 files changed, 1058 insertions(+), 1 deletion(-) create mode 100644 playbooks/byo/openshift-cluster/openshift-prometheus.yml create mode 100644 playbooks/common/openshift-cluster/openshift_prometheus.yml create mode 100644 roles/openshift_prometheus/README.md create mode 100644 roles/openshift_prometheus/defaults/main.yaml create mode 100644 roles/openshift_prometheus/files/openshift_prometheus.exports create mode 100644 roles/openshift_prometheus/meta/main.yaml create mode 100644 roles/openshift_prometheus/tasks/create_pvs.yaml create mode 100644 roles/openshift_prometheus/tasks/install_prometheus.yaml create mode 100644 roles/openshift_prometheus/tasks/main.yaml create mode 100644 roles/openshift_prometheus/tasks/nfs.yaml create mode 100644 roles/openshift_prometheus/templates/alertmanager.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prom-pv-server.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prometheus.rules.j2 create mode 100644 roles/openshift_prometheus/templates/prometheus.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prometheus_deployment.j2 create mode 100644 roles/openshift_prometheus/tests/inventory create mode 100644 roles/openshift_prometheus/tests/test.yaml diff --git a/filter_plugins/oo_filters.py b/filter_plugins/oo_filters.py index 36a90a870..277695f78 100644 --- a/filter_plugins/oo_filters.py +++ b/filter_plugins/oo_filters.py @@ -1024,6 +1024,18 @@ def oo_contains_rule(source, apiGroups, resources, verbs): return False +def oo_selector_to_string_list(user_dict): + """Convert a dict of selectors to a key=value list of strings + +Given input of {'region': 'infra', 'zone': 'primary'} returns a list +of items as ['region=infra', 'zone=primary'] + """ + selectors = [] + for key in user_dict: + selectors.append("{}={}".format(key, user_dict[key])) + return selectors + + class FilterModule(object): """ Custom ansible filter mapping """ @@ -1065,5 +1077,6 @@ class FilterModule(object): "oo_openshift_loadbalancer_backends": oo_openshift_loadbalancer_backends, "to_padded_yaml": to_padded_yaml, "oo_random_word": oo_random_word, - "oo_contains_rule": oo_contains_rule + "oo_contains_rule": oo_contains_rule, + "oo_selector_to_string_list": oo_selector_to_string_list } diff --git a/playbooks/byo/openshift-cluster/openshift-prometheus.yml b/playbooks/byo/openshift-cluster/openshift-prometheus.yml new file mode 100644 index 000000000..15917078d --- /dev/null +++ b/playbooks/byo/openshift-cluster/openshift-prometheus.yml @@ -0,0 +1,4 @@ +--- +- include: initialize_groups.yml + +- include: ../../common/openshift-cluster/openshift_prometheus.yml diff --git a/playbooks/common/openshift-cluster/openshift_hosted.yml b/playbooks/common/openshift-cluster/openshift_hosted.yml index 99a634970..a391b963a 100644 --- a/playbooks/common/openshift-cluster/openshift_hosted.yml +++ b/playbooks/common/openshift-cluster/openshift_hosted.yml @@ -49,6 +49,9 @@ - role: cockpit-ui when: ( openshift.common.version_gte_3_3_or_1_3 | bool ) and ( openshift_hosted_manage_registry | default(true) | bool ) and not (openshift.docker.hosted_registry_insecure | default(false) | bool) + - role: openshift_prometheus + when: openshift_hosted_prometheus_deploy | default(false) | bool + - name: Update master-config for publicLoggingURL hosts: oo_masters_to_config:!oo_first_master tags: diff --git a/playbooks/common/openshift-cluster/openshift_prometheus.yml b/playbooks/common/openshift-cluster/openshift_prometheus.yml new file mode 100644 index 000000000..a979c0c00 --- /dev/null +++ b/playbooks/common/openshift-cluster/openshift_prometheus.yml @@ -0,0 +1,9 @@ +--- +- include: std_include.yml + +- name: OpenShift Prometheus + hosts: oo_first_master + roles: + - openshift_prometheus + vars: + openshift_prometheus_state: present diff --git a/roles/openshift_prometheus/README.md b/roles/openshift_prometheus/README.md new file mode 100644 index 000000000..c5a44bffb --- /dev/null +++ b/roles/openshift_prometheus/README.md @@ -0,0 +1,95 @@ +OpenShift Prometheus +==================== + +OpenShift Prometheus Installation + +Requirements +------------ + + +Role Variables +-------------- + +For default values, see [`defaults/main.yaml`](defaults/main.yaml). + +- `openshift_prometheus_state`: present - install/update. absent - uninstall. + +- `openshift_prometheus_namespace`: project (i.e. namespace) where the components will be + deployed. + +- `openshift_prometheus_replicas`: The number of replicas for prometheus deployment. + +- `openshift_prometheus_node_selector`: Selector for the nodes prometheus will be deployed on. + +- `openshift_prometheus_image_`: specify image for the component + +## Storage related variables +Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can set pv claim by setting corresponding role variable: +``` +openshift_prometheus__storage_type: +openshift_prometheus__pvc_(name|size|access_modes|pv_selector): +``` +e.g +``` +openshift_prometheus_storage_type: pvc +openshift_prometheus_alertmanager_pvc_name: alertmanager +openshift_prometheus_alertbuffer_pvc_size: 10G +openshift_prometheus_pvc_access_modes: [ReadWriteOnce] +``` + +## Additional Alert Rules file variable +An external file with alert rules can be added by setting path to additional rules variable: +``` +openshift_prometheus_additional_rules_file: +``` + +File content should be in prometheus alert rules format. +Following example sets rule to fire an alert when one of the cluster nodes is down: + +``` +groups: +- name: example-rules + interval: 30s # defaults to global interval + rules: + - alert: Node Down + expr: up{job="kubernetes-nodes"} == 0 + annotations: + miqTarget: "ContainerNode" + severity: "HIGH" + message: "{{ '{{' }}{{ '$labels.instance' }}{{ '}}' }} is down" +``` + + +## Additional variables to control resource limits +Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can specify a cpu and memory limits and requests by setting +the corresponding role variable: +``` +openshift_prometheus__(limits|requests)_(memory|cpu): +``` +e.g +``` +openshift_prometheus_alertmanager_limits_memory: 1Gi +openshift_prometheus_oath_proxy_requests_cpu: 100 +``` + +Dependencies +------------ + +openshift_facts + + +Example Playbook +---------------- + +``` +- name: Configure openshift-prometheus + hosts: oo_first_master + roles: + - role: openshift_prometheus +``` + +License +------- + +Apache License, Version 2.0 + diff --git a/roles/openshift_prometheus/defaults/main.yaml b/roles/openshift_prometheus/defaults/main.yaml new file mode 100644 index 000000000..18d6a1645 --- /dev/null +++ b/roles/openshift_prometheus/defaults/main.yaml @@ -0,0 +1,74 @@ +--- +# defaults file for openshift_prometheus +openshift_prometheus_state: present + +openshift_prometheus_namespace: prometheus + +openshift_prometheus_replicas: 1 +openshift_prometheus_node_selector: {"region":"infra"} + +# images +openshift_prometheus_image_proxy: "openshift/oauth-proxy:v1.0.0" +openshift_prometheus_image_prometheus: "openshift/prometheus:v2.0.0-dev" +openshift_prometheus_image_alertmanager: "openshift/prometheus-alertmanager:dev" +openshift_prometheus_image_alertbuffer: "ilackarms/message-buffer" + +# additional prometheus rules file +openshift_prometheus_additional_rules_file: null + +# All the required exports +openshift_prometheus_pv_exports: + - prometheus + - prometheus-alertmanager + - prometheus-alertbuffer +# PV template files and their created object names +openshift_prometheus_pv_data: + - pv_name: prometheus + pv_template: prom-pv-server.yml + pv_label: Prometheus Server PV + - pv_name: prometheus-alertmanager + pv_template: prom-pv-alertmanager.yml + pv_label: Prometheus Alertmanager PV + - pv_name: prometheus-alertbuffer + pv_template: prom-pv-alertbuffer.yml + pv_label: Prometheus Alert Buffer PV + +# Hostname/IP of the NFS server. Currently defaults to first master +openshift_prometheus_nfs_server: "{{ groups.nfs.0 }}" + +# storage +openshift_prometheus_storage_type: pvc +openshift_prometheus_pvc_name: prometheus +openshift_prometheus_pvc_size: 10G +openshift_prometheus_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_pvc_pv_selector: {} + +openshift_prometheus_alertmanager_storage_type: pvc +openshift_prometheus_alertmanager_pvc_name: prometheus-alertmanager +openshift_prometheus_alertmanager_pvc_size: 10G +openshift_prometheus_alertmanager_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_alertmanager_pvc_pv_selector: {} + +openshift_prometheus_alertbuffer_storage_type: pvc +openshift_prometheus_alertbuffer_pvc_name: prometheus-alertbuffer +openshift_prometheus_alertbuffer_pvc_size: 10G +openshift_prometheus_alertbuffer_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_alertbuffer_pvc_pv_selector: {} + +# container resources +openshift_prometheus_cpu_limit: null +openshift_prometheus_memory_limit: null +openshift_prometheus_cpu_requests: null +openshift_prometheus_memory_requests: null +openshift_prometheus_alertmanager_cpu_limit: null +openshift_prometheus_alertmanager_memory_limit: null +openshift_prometheus_alertmanager_cpu_requests: null +openshift_prometheus_alertmanager_memory_requests: null +openshift_prometheus_alertbuffer_cpu_limit: null +openshift_prometheus_alertbuffer_memory_limit: null +openshift_prometheus_alertbuffer_cpu_requests: null +openshift_prometheus_alertbuffer_memory_requests: null +openshift_prometheus_oauth_proxy_cpu_limit: null +openshift_prometheus_oauth_proxy_memory_limit: null +openshift_prometheus_oauth_proxy_cpu_requests: null +openshift_prometheus_oauth_proxy_memory_requests: null diff --git a/roles/openshift_prometheus/files/openshift_prometheus.exports b/roles/openshift_prometheus/files/openshift_prometheus.exports new file mode 100644 index 000000000..3ccedb1fd --- /dev/null +++ b/roles/openshift_prometheus/files/openshift_prometheus.exports @@ -0,0 +1,3 @@ +/exports/prometheus *(rw,no_root_squash,no_wdelay) +/exports/prometheus-alertmanager *(rw,no_root_squash,no_wdelay) +/exports/prometheus-alertbuffer *(rw,no_root_squash,no_wdelay) diff --git a/roles/openshift_prometheus/meta/main.yaml b/roles/openshift_prometheus/meta/main.yaml new file mode 100644 index 000000000..33188bb7e --- /dev/null +++ b/roles/openshift_prometheus/meta/main.yaml @@ -0,0 +1,19 @@ +--- +galaxy_info: + author: OpenShift Development + description: Deploy OpenShift prometheus integration for the cluster + company: Red Hat, Inc. + license: license (Apache) + min_ansible_version: 2.2 + platforms: + - name: EL + versions: + - 7 + - name: Fedora + versions: + - all + categories: + - openshift +dependencies: +- { role: lib_openshift } +- { role: openshift_facts } diff --git a/roles/openshift_prometheus/tasks/create_pvs.yaml b/roles/openshift_prometheus/tasks/create_pvs.yaml new file mode 100644 index 000000000..4e79da05f --- /dev/null +++ b/roles/openshift_prometheus/tasks/create_pvs.yaml @@ -0,0 +1,36 @@ +--- +# Check for existance and then conditionally: +# - evaluate templates +# - PVs +# +# These tasks idempotently create required Prometheus PV objects. Do not +# call this file directly. This file is intended to be ran as an +# include that has a 'with_items' attached to it. Hence the use below +# of variables like "{{ item.pv_label }}" + +- name: "Check if the {{ item.pv_label }} template has been created already" + oc_obj: + namespace: "{{ openshift_prometheus_namespace }}" + state: list + kind: pv + name: "{{ item.pv_name }}" + register: prom_pv_check + +# Skip all of this if the PV already exists +- block: + - name: "Ensure the {{ item.pv_label }} template is evaluated" + template: + src: "{{ item.pv_template }}.j2" + dest: "{{ tempdir }}/templates/{{ item.pv_template }}" + + - name: "Ensure {{ item.pv_label }} is created" + oc_obj: + namespace: "{{ openshift_prometheus_namespace }}" + kind: pv + name: "{{ item.pv_name }}" + state: present + delete_after: True + files: + - "{{ tempdir }}/templates/{{ item.pv_template }}" + when: + - not prom_pv_check.results.results.0 diff --git a/roles/openshift_prometheus/tasks/install_prometheus.yaml b/roles/openshift_prometheus/tasks/install_prometheus.yaml new file mode 100644 index 000000000..93bdda3e8 --- /dev/null +++ b/roles/openshift_prometheus/tasks/install_prometheus.yaml @@ -0,0 +1,241 @@ +--- + +# namespace +- name: Add prometheus project + oc_project: + state: "{{ state }}" + name: "{{ openshift_prometheus_namespace }}" + node_selector: "{{ openshift_prometheus_node_selector | oo_selector_to_string_list() }}" + description: Prometheus + +# secrets +- name: Set alert and prometheus secrets + oc_secret: + state: "{{ state }}" + name: "{{ item }}-proxy" + namespace: "{{ openshift_prometheus_namespace }}" + contents: + - path: session_secret + data: "{{ 43 | oo_random_word }}=" + with_items: + - prometheus + - alerts + +# serviceaccount +- name: create prometheus serviceaccount + oc_serviceaccount: + state: "{{ state }}" + name: prometheus + namespace: "{{ openshift_prometheus_namespace }}" + # TODO add annotations when supproted + # annotations: + # serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' + # serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' + + secrets: + - prometheus-secrets + changed_when: no + +# TODO remove this when annotations are supported by oc_serviceaccount +- name: annotate serviceaccount + command: > + {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} + serviceaccount prometheus + serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' + serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' + + +# create clusterrolebinding for prometheus serviceaccount +- name: Set cluster-reader permissions for prometheus + oc_adm_policy_user: + state: "{{ state }}" + namespace: "{{ openshift_prometheus_namespace }}" + resource_kind: cluster-role + resource_name: cluster-reader + user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus" + + +###################################################################### +# NFS +# In the case that we are not running on a cloud provider, volumes must be statically provisioned + +- include: nfs.yaml + when: not (openshift_cloudprovider_kind is defined and (openshift_cloudprovider_kind == 'aws' or openshift_cloudprovider_kind == 'gce')) + + +# create prometheus and alerts services +# TODO join into 1 task with loop +- name: Create prometheus service + oc_service: + state: "{{ state }}" + name: "{{ item.name }}" + namespace: "{{ openshift_prometheus_namespace }}" + selector: + app: prometheus + labels: + name: "{{ item.name }}" + # TODO add annotations when supported + # annotations: + # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls" + ports: + - port: 443 + targetPort: 8443 + with_items: + - name: prometheus + +- name: Create alerts service + oc_service: + state: "{{ state }}" + name: "{{ item.name }}" + namespace: "{{ openshift_prometheus_namespace }}" + selector: + app: prometheus + labels: + name: "{{ item.name }}" + # TODO add annotations when supported + # annotations: + # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls" + ports: + - port: 443 + targetPort: 9443 + with_items: + - name: alerts + + +# Annotate services with secret name +# TODO remove this when annotations are supported by oc_service +- name: annotate prometheus service + command: > + {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} + service prometheus 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls' + +- name: annotate alerts service + command: > + {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} + service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls' + +# create prometheus and alerts routes +- name: create prometheus and alerts routes + oc_route: + state: "{{ state }}" + name: "{{ item.name }}" + namespace: "{{ openshift_prometheus_namespace }}" + service_name: "{{ item.name }}" + tls_termination: reencrypt + with_items: + - name: prometheus + - name: alerts + +# Storage +- name: create prometheus pvc + oc_pvc: + namespace: "{{ openshift_prometheus_namespace }}" + name: "{{ openshift_prometheus_pvc_name }}" + access_modes: "{{ openshift_prometheus_pvc_access_modes }}" + volume_capacity: "{{ openshift_prometheus_pvc_size }}" + selector: "{{ openshift_prometheus_pvc_pv_selector }}" + +- name: create alertmanager pvc + oc_pvc: + namespace: "{{ openshift_prometheus_namespace }}" + name: "{{ openshift_prometheus_alertmanager_pvc_name }}" + access_modes: "{{ openshift_prometheus_alertmanager_pvc_access_modes }}" + volume_capacity: "{{ openshift_prometheus_alertmanager_pvc_size }}" + selector: "{{ openshift_prometheus_alertmanager_pvc_pv_selector }}" + +- name: create alertbuffer pvc + oc_pvc: + namespace: "{{ openshift_prometheus_namespace }}" + name: "{{ openshift_prometheus_alertbuffer_pvc_name }}" + access_modes: "{{ openshift_prometheus_alertbuffer_pvc_access_modes }}" + volume_capacity: "{{ openshift_prometheus_alertbuffer_pvc_size }}" + selector: "{{ openshift_prometheus_alertbuffer_pvc_pv_selector }}" + +# create prometheus deployment +- name: Set prometheus deployment template + template: + src: prometheus_deployment.j2 + dest: "{{ tempdir }}/templates/prometheus.yaml" + vars: + namespace: "{{ openshift_prometheus_namespace }}" + prom_replicas: "{{ openshift_prometheus_replicas }}" + +- name: Set prometheus deployment + oc_obj: + state: "{{ state }}" + name: "prometheus" + namespace: "{{ openshift_prometheus_namespace }}" + kind: deployment + files: + - "{{ tempdir }}/templates/prometheus.yaml" + delete_after: true + +# prometheus configmap +# Copy the additional rules file if it is defined +- name: Copy additional rules file to host + copy: + src: "{{ openshift_prometheus_additional_rules_file }}" + dest: "{{ tempdir }}/prometheus.additional.rules" + when: + - openshift_prometheus_additional_rules_file is defined + - openshift_prometheus_additional_rules_file is not none + - openshift_prometheus_additional_rules_file | trim | length > 0 + +- stat: + path: "{{ tempdir }}/prometheus.additional.rules" + register: additional_rules_stat + +# The kubernetes version impacts the prometheus scraping endpoint +# so gathering it before constructing the configmap +- name: get oc version + oc_version: + register: oc_version + +- set_fact: + kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}" + +- template: + src: prometheus.yml.j2 + dest: "{{ tempdir }}/prometheus.yml" + changed_when: no + +- template: + src: prometheus.rules.j2 + dest: "{{ tempdir }}/prometheus.rules" + changed_when: no + +# In prometheus configmap create "additional.rules" section if file exists +- name: Set prometheus configmap + oc_configmap: + state: "{{ state }}" + name: "prometheus" + namespace: "{{ openshift_prometheus_namespace }}" + from_file: + prometheus.rules: "{{ tempdir }}/prometheus.rules" + prometheus.additional.rules: "{{ tempdir }}/prometheus.additional.rules" + prometheus.yml: "{{ tempdir }}/prometheus.yml" + when: additional_rules_stat.stat.exists == True + +- name: Set prometheus configmap + oc_configmap: + state: "{{ state }}" + name: "prometheus" + namespace: "{{ openshift_prometheus_namespace }}" + from_file: + prometheus.rules: "{{ tempdir }}/prometheus.rules" + prometheus.yml: "{{ tempdir }}/prometheus.yml" + when: additional_rules_stat.stat.exists == False + +# alertmanager configmap +- template: + src: alertmanager.yml.j2 + dest: "{{ tempdir }}/alertmanager.yml" + changed_when: no + +- name: Set alertmanager configmap + oc_configmap: + state: "{{ state }}" + name: "prometheus-alerts" + namespace: "{{ openshift_prometheus_namespace }}" + from_file: + alertmanager.yml: "{{ tempdir }}/alertmanager.yml" diff --git a/roles/openshift_prometheus/tasks/main.yaml b/roles/openshift_prometheus/tasks/main.yaml new file mode 100644 index 000000000..523a64334 --- /dev/null +++ b/roles/openshift_prometheus/tasks/main.yaml @@ -0,0 +1,26 @@ +--- + +- name: Create temp directory for doing work in on target + command: mktemp -td openshift-prometheus-ansible-XXXXXX + register: mktemp + changed_when: False + +- set_fact: + tempdir: "{{ mktemp.stdout }}" + +- name: Create templates subdirectory + file: + state: directory + path: "{{ tempdir }}/templates" + mode: 0755 + changed_when: False + +- include: install_prometheus.yaml + vars: + state: "{{ openshift_prometheus_state }}" + +- name: Delete temp directory + file: + name: "{{ tempdir }}" + state: absent + changed_when: False diff --git a/roles/openshift_prometheus/tasks/nfs.yaml b/roles/openshift_prometheus/tasks/nfs.yaml new file mode 100644 index 000000000..0b45f2cee --- /dev/null +++ b/roles/openshift_prometheus/tasks/nfs.yaml @@ -0,0 +1,44 @@ +--- +# Tasks to statically provision NFS volumes +# Include if not using dynamic volume provisioning +- name: Ensure the /exports/ directory exists + file: + path: /exports/ + state: directory + mode: 0755 + owner: root + group: root + +- name: Ensure the prom-pv0X export directories exist + file: + path: "/exports/{{ item }}" + state: directory + mode: 0777 + owner: nfsnobody + group: nfsnobody + with_items: "{{ openshift_prometheus_pv_exports }}" + +- name: Ensure the NFS exports for Prometheus PVs exist + copy: + src: openshift_prometheus.exports + dest: /etc/exports.d/openshift_prometheus.exports + register: nfs_exports_updated + +- name: Ensure the NFS export table is refreshed if exports were added + command: exportfs -ar + when: + - nfs_exports_updated.changed + + +###################################################################### +# Create the required Prometheus PVs. Check out these online docs if you +# need a refresher on includes looping with items: +# * http://docs.ansible.com/ansible/playbooks_loops.html#loops-and-includes-in-2-0 +# * http://stackoverflow.com/a/35128533 +# +# TODO: Handle the case where a PV template is updated in +# openshift-ansible and the change needs to be landed on the managed +# cluster. + +- include: create_pvs.yaml + with_items: "{{ openshift_prometheus_pv_data }}" diff --git a/roles/openshift_prometheus/templates/alertmanager.yml.j2 b/roles/openshift_prometheus/templates/alertmanager.yml.j2 new file mode 100644 index 000000000..6c432a3d0 --- /dev/null +++ b/roles/openshift_prometheus/templates/alertmanager.yml.j2 @@ -0,0 +1,20 @@ +global: + +# The root route on which each incoming alert enters. +route: + # default route if none match + receiver: alert-buffer-wh + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # TODO: + group_by: [] + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + +receivers: +- name: alert-buffer-wh + webhook_configs: + - url: http://localhost:9099/topics/alerts diff --git a/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 new file mode 100644 index 000000000..55a5e19c3 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-alertbuffer + labels: + storage: prometheus-alertbuffer +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus-alertbuffer + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 new file mode 100644 index 000000000..4ee518735 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-alertmanager + labels: + storage: prometheus-alertmanager +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus-alertmanager + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 new file mode 100644 index 000000000..933bf0f60 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus + labels: + storage: prometheus +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prometheus.rules.j2 b/roles/openshift_prometheus/templates/prometheus.rules.j2 new file mode 100644 index 000000000..e861dc127 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.rules.j2 @@ -0,0 +1,4 @@ +groups: +- name: example-rules + interval: 30s # defaults to global interval + rules: diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2 new file mode 100644 index 000000000..63430f834 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,174 @@ +rule_files: + - 'prometheus.rules' +{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %} + - 'prometheus.additional.rules' +{% endif %} + + + +# A scrape configuration for running Prometheus on a Kubernetes cluster. +# This uses separate scrape configs for cluster components (i.e. API server, node) +# and services to allow each to use different authentication configs. +# +# Kubernetes labels will be added as Prometheus labels on metrics via the +# `labelmap` relabeling action. + +# Scrape config for API servers. +# +# Kubernetes exposes API servers as endpoints to the default/kubernetes +# service so this uses `endpoints` role and uses relabelling to only keep +# the endpoints associated with the default/kubernetes service using the +# default named port `https`. This works for single API server deployments as +# well as HA API server deployments. +scrape_configs: +- job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + +# Scrape config for nodes. +# +# Each node exposes a /metrics endpoint that contains operational metrics for +# the Kubelet and other components. +- job_name: 'kubernetes-nodes' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for controllers. +# +# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for +# the controllers. +# +# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via +# endpoints. +- job_name: 'kubernetes-controllers' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + # Keep only the default/kubernetes service endpoints for the https port, and then + # set the port to 8444. This is the default configuration for the controllers on OpenShift + # masters. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - source_labels: [__address__] + action: replace + target_label: __address__ + regex: (.+)(?::\d+) + replacement: $1:8444 + +# Scrape config for cAdvisor. +# +# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that +# reports container metrics for each running pod. Scrape those by default. +- job_name: 'kubernetes-cadvisor' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +{% if kubernetes_version | float() >= 1.7 | float() %} + metrics_path: /metrics/cadvisor +{% else %} + metrics_path: /metrics +{% endif %} + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for service endpoints. +# +# The relabeling allows the actual service scrape endpoint to be configured +# via the following annotations: +# +# * `prometheus.io/scrape`: Only scrape services that have a value of `true` +# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# to set this to `https` & most likely set the `tls_config` of the scrape config. +# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# * `prometheus.io/port`: If the metrics are exposed on a different port to the +# service then set this appropriately. +- job_name: 'kubernetes-service-endpoints' + + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # TODO: this should be per target + insecure_skip_verify: true + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] + action: replace + target_label: __basic_auth_username__ + regex: (.+) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] + action: replace + target_label: __basic_auth_password__ + regex: (.+) + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "localhost:9093" diff --git a/roles/openshift_prometheus/templates/prometheus_deployment.j2 b/roles/openshift_prometheus/templates/prometheus_deployment.j2 new file mode 100644 index 000000000..98c117f19 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus_deployment.j2 @@ -0,0 +1,240 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: prometheus + namespace: {{ namespace }} + labels: + app: prometheus +spec: + replicas: {{ prom_replicas|default(1) }} + selector: + provider: openshift + matchLabels: + app: prometheus + template: + metadata: + name: prometheus + labels: + app: prometheus + spec: + serviceAccountName: prometheus +{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %} + nodeSelector: +{% for key, value in openshift_prometheus_node_selector.iteritems() %} + {{key}}: "{{value}}" +{% endfor %} +{% endif %} + containers: + # Deploy Prometheus behind an oauth proxy + - name: prom-proxy + image: "{{ openshift_prometheus_image_proxy }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_memory_requests_limit_proxy is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} + ports: + - containerPort: 8443 + name: web + args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9090 + - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + - -skip-auth-regex=^/metrics + volumeMounts: + - mountPath: /etc/tls/private + name: prometheus-tls + - mountPath: /etc/proxy/secrets + name: prometheus-secrets + - mountPath: /prometheus + name: prometheus-data + + - name: prometheus + args: + - --storage.tsdb.retention=6h + - --config.file=/etc/prometheus/prometheus.yml + - --web.listen-address=localhost:9090 + image: "{{ openshift_prometheus_image_prometheus }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_memory_requests is defined and openshift_prometheus_memory_requests is not none %} + memory: "{{openshift_prometheus_memory_requests}}" +{% endif %} +{% if openshift_prometheus_cpu_requests is defined and openshift_prometheus_cpu_requests is not none %} + cpu: "{{openshift_prometheus_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_memory_limit is defined and openshift_prometheus_memory_limit is not none %} + memory: "{{ openshift_prometheus_memory_limit }}" +{% endif %} +{% if openshift_prometheus_cpu_limit is defined and openshift_prometheus_cpu_limit is not none %} + cpu: "{{openshift_prometheus_cpu_limit}}" +{% endif %} + + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + - mountPath: /prometheus + name: prometheus-data + + # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy + - name: alerts-proxy + image: "{{ openshift_prometheus_image_proxy }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} + ports: + - containerPort: 9443 + name: web + args: + - -provider=openshift + - -https-address=:9443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9099 + - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + volumeMounts: + - mountPath: /etc/tls/private + name: alerts-tls + - mountPath: /etc/proxy/secrets + name: alerts-secrets + + - name: alert-buffer + args: + - --storage-path=/alert-buffer/messages.db + image: "{{ openshift_prometheus_image_alertbuffer }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_alertbuffer_memory_requests is defined and openshift_prometheus_alertbuffer_memory_requests is not none %} + memory: "{{openshift_prometheus_alertbuffer_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_requests is defined and openshift_prometheus_alertbuffer_cpu_requests is not none %} + cpu: "{{openshift_prometheus_alertbuffer_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_alertbuffer_memory_limit is defined and openshift_prometheus_alertbuffer_memory_limit is not none %} + memory: "{{openshift_prometheus_alertbuffer_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_limit is defined and openshift_prometheus_alertbuffer_cpu_limit is not none %} + cpu: "{{openshift_prometheus_alertbuffer_cpu_limit}}" +{% endif %} + volumeMounts: + - mountPath: /alert-buffer + name: alert-buffer-data + ports: + - containerPort: 9099 + name: alert-buf + + - name: alertmanager + args: + - -config.file=/etc/alertmanager/alertmanager.yml + image: "{{ openshift_prometheus_image_alertmanager }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_alertmanager_memory_requests is defined and openshift_prometheus_alertmanager_memory_requests is not none %} + memory: "{{openshift_prometheus_alertmanager_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_requests is defined and openshift_prometheus_alertmanager_cpu_requests is not none %} + cpu: "{{openshift_prometheus_alertmanager_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_alertmanager_memory_limit is defined and openshift_prometheus_alertmanager_memory_limit is not none %} + memory: "{{openshift_prometheus_alertmanager_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_limit is defined and openshift_prometheus_alertmanager_cpu_limit is not none %} + cpu: "{{openshift_prometheus_alertmanager_cpu_limit}}" +{% endif %} + ports: + - containerPort: 9093 + name: web + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-config + - mountPath: /alertmanager + name: alertmanager-data + + restartPolicy: Always + volumes: + - name: prometheus-config + configMap: + defaultMode: 420 + name: prometheus + - name: prometheus-secrets + secret: + secretName: prometheus-proxy + - name: prometheus-tls + secret: + secretName: prometheus-tls + - name: prometheus-data +{% if openshift_prometheus_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_pvc_name }} +{% else %} + emptydir: {} +{% endif %} + - name: alertmanager-config + configMap: + defaultMode: 420 + name: prometheus-alerts + - name: alerts-secrets + secret: + secretName: alerts-proxy + - name: alerts-tls + secret: + secretName: prometheus-alerts-tls + - name: alertmanager-data +{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_alertmanager_pvc_name }} +{% else %} + emptydir: {} +{% endif %} + - name: alert-buffer-data +{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_alertbuffer_pvc_name }} +{% else %} + emptydir: {} +{% endif %} diff --git a/roles/openshift_prometheus/tests/inventory b/roles/openshift_prometheus/tests/inventory new file mode 100644 index 000000000..878877b07 --- /dev/null +++ b/roles/openshift_prometheus/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/roles/openshift_prometheus/tests/test.yaml b/roles/openshift_prometheus/tests/test.yaml new file mode 100644 index 000000000..37baf573c --- /dev/null +++ b/roles/openshift_prometheus/tests/test.yaml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - openshift_prometheus -- cgit v1.2.3