diff options
| author | Scott Dodson <sdodson@redhat.com> | 2017-09-06 10:32:39 -0400 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2017-09-06 10:32:39 -0400 | 
| commit | bb59f5617bce05d61f594efe6e602d307bc6f200 (patch) | |
| tree | 9b31d02dc5a8d8986e08901d6ea4ffd5ef07ca98 | |
| parent | 5aaa24b25653cca479955eb39d353caee0fcf373 (diff) | |
| parent | ec75d0ac888f3fab87f8d335224596df045e260a (diff) | |
| download | openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.tar.gz openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.tar.bz2 openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.tar.xz openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.zip | |
Merge pull request #4509 from zgalor/prometheus-role
Create ansible role for deploying prometheus on openshift
21 files changed, 1058 insertions, 1 deletions
| diff --git a/filter_plugins/oo_filters.py b/filter_plugins/oo_filters.py index 36a90a870..277695f78 100644 --- a/filter_plugins/oo_filters.py +++ b/filter_plugins/oo_filters.py @@ -1024,6 +1024,18 @@ def oo_contains_rule(source, apiGroups, resources, verbs):      return False +def oo_selector_to_string_list(user_dict): +    """Convert a dict of selectors to a key=value list of strings + +Given input of {'region': 'infra', 'zone': 'primary'} returns a list +of items as ['region=infra', 'zone=primary'] +    """ +    selectors = [] +    for key in user_dict: +        selectors.append("{}={}".format(key, user_dict[key])) +    return selectors + +  class FilterModule(object):      """ Custom ansible filter mapping """ @@ -1065,5 +1077,6 @@ class FilterModule(object):              "oo_openshift_loadbalancer_backends": oo_openshift_loadbalancer_backends,              "to_padded_yaml": to_padded_yaml,              "oo_random_word": oo_random_word, -            "oo_contains_rule": oo_contains_rule +            "oo_contains_rule": oo_contains_rule, +            "oo_selector_to_string_list": oo_selector_to_string_list          } diff --git a/playbooks/byo/openshift-cluster/openshift-prometheus.yml b/playbooks/byo/openshift-cluster/openshift-prometheus.yml new file mode 100644 index 000000000..15917078d --- /dev/null +++ b/playbooks/byo/openshift-cluster/openshift-prometheus.yml @@ -0,0 +1,4 @@ +--- +- include: initialize_groups.yml + +- include: ../../common/openshift-cluster/openshift_prometheus.yml diff --git a/playbooks/common/openshift-cluster/openshift_hosted.yml b/playbooks/common/openshift-cluster/openshift_hosted.yml index 99a634970..a391b963a 100644 --- a/playbooks/common/openshift-cluster/openshift_hosted.yml +++ b/playbooks/common/openshift-cluster/openshift_hosted.yml @@ -49,6 +49,9 @@    - role: cockpit-ui      when: ( openshift.common.version_gte_3_3_or_1_3  | bool ) and ( openshift_hosted_manage_registry | default(true) | bool ) and not 
(openshift.docker.hosted_registry_insecure | default(false) | bool) +  - role: openshift_prometheus +    when: openshift_hosted_prometheus_deploy | default(false) | bool +  - name: Update master-config for publicLoggingURL    hosts: oo_masters_to_config:!oo_first_master    tags: diff --git a/playbooks/common/openshift-cluster/openshift_prometheus.yml b/playbooks/common/openshift-cluster/openshift_prometheus.yml new file mode 100644 index 000000000..a979c0c00 --- /dev/null +++ b/playbooks/common/openshift-cluster/openshift_prometheus.yml @@ -0,0 +1,9 @@ +--- +- include: std_include.yml + +- name: OpenShift Prometheus +  hosts: oo_first_master +  roles: +  - openshift_prometheus +  vars: +    openshift_prometheus_state: present diff --git a/roles/openshift_prometheus/README.md b/roles/openshift_prometheus/README.md new file mode 100644 index 000000000..c5a44bffb --- /dev/null +++ b/roles/openshift_prometheus/README.md @@ -0,0 +1,95 @@ +OpenShift Prometheus +==================== + +OpenShift Prometheus Installation + +Requirements +------------ + + +Role Variables +-------------- + +For default values, see [`defaults/main.yaml`](defaults/main.yaml). + +- `openshift_prometheus_state`: present - install/update. absent - uninstall. + +- `openshift_prometheus_namespace`: project (i.e. namespace) where the components will be +  deployed. + +- `openshift_prometheus_replicas`: The number of replicas for prometheus deployment. + +- `openshift_prometheus_node_selector`: Selector for the nodes prometheus will be deployed on. 
+ +- `openshift_prometheus_image_<COMPONENT>`: specify image for the component  + +## Storage related variables +Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can set pv claim by setting corresponding role variable: +``` +openshift_prometheus_<COMPONENT>_storage_type: <VALUE> +openshift_prometheus_<COMPONENT>_pvc_(name|size|access_modes|pv_selector): <VALUE> +``` +e.g. +``` +openshift_prometheus_storage_type: pvc +openshift_prometheus_alertmanager_pvc_name: alertmanager +openshift_prometheus_alertbuffer_pvc_size: 10G +openshift_prometheus_pvc_access_modes: [ReadWriteOnce] +``` + +## Additional Alert Rules file variable +An external file with alert rules can be added by setting path to additional rules variable:  +``` +openshift_prometheus_additional_rules_file: <PATH>  +``` + +File content should be in prometheus alert rules format. +The following example sets a rule to fire an alert when one of the cluster nodes is down: + +``` +groups: +- name: example-rules +  interval: 30s # defaults to global interval +  rules: +  - alert: Node Down +    expr: up{job="kubernetes-nodes"} == 0 +    annotations: +      miqTarget: "ContainerNode" +      severity: "HIGH" +      message: "{{ '{{' }}{{ '$labels.instance' }}{{ '}}' }} is down" +``` + + +## Additional variables to control resource limits +Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can specify cpu and memory limits and requests by setting +the corresponding role variable: +``` +openshift_prometheus_<COMPONENT>_(memory|cpu)_(limit|requests): <VALUE> +``` +e.g. +``` +openshift_prometheus_alertmanager_memory_limit: 1Gi +openshift_prometheus_oauth_proxy_cpu_requests: 100 +``` + +Dependencies +------------ + +openshift_facts + + +Example Playbook +---------------- + +``` +- name: Configure openshift-prometheus +  hosts: oo_first_master +  roles: +  - role: openshift_prometheus +``` + +License +------- + +Apache License, Version 2.0 + diff --git 
a/roles/openshift_prometheus/defaults/main.yaml b/roles/openshift_prometheus/defaults/main.yaml new file mode 100644 index 000000000..18d6a1645 --- /dev/null +++ b/roles/openshift_prometheus/defaults/main.yaml @@ -0,0 +1,74 @@ +--- +# defaults file for openshift_prometheus +openshift_prometheus_state: present + +openshift_prometheus_namespace: prometheus + +openshift_prometheus_replicas: 1 +openshift_prometheus_node_selector: {"region":"infra"} + +# images +openshift_prometheus_image_proxy: "openshift/oauth-proxy:v1.0.0" +openshift_prometheus_image_prometheus: "openshift/prometheus:v2.0.0-dev" +openshift_prometheus_image_alertmanager: "openshift/prometheus-alertmanager:dev" +openshift_prometheus_image_alertbuffer: "ilackarms/message-buffer" + +# additional prometheus rules file +openshift_prometheus_additional_rules_file: null + +# All the required exports +openshift_prometheus_pv_exports: +  - prometheus +  - prometheus-alertmanager +  - prometheus-alertbuffer +# PV template files and their created object names +openshift_prometheus_pv_data: +  - pv_name: prometheus +    pv_template: prom-pv-server.yml +    pv_label: Prometheus Server PV +  - pv_name: prometheus-alertmanager +    pv_template: prom-pv-alertmanager.yml +    pv_label: Prometheus Alertmanager PV +  - pv_name: prometheus-alertbuffer +    pv_template: prom-pv-alertbuffer.yml +    pv_label: Prometheus Alert Buffer PV + +# Hostname/IP of the NFS server. 
Currently defaults to first master +openshift_prometheus_nfs_server: "{{ groups.nfs.0 }}" + +# storage +openshift_prometheus_storage_type: pvc +openshift_prometheus_pvc_name: prometheus +openshift_prometheus_pvc_size: 10G +openshift_prometheus_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_pvc_pv_selector: {} + +openshift_prometheus_alertmanager_storage_type: pvc +openshift_prometheus_alertmanager_pvc_name: prometheus-alertmanager +openshift_prometheus_alertmanager_pvc_size: 10G +openshift_prometheus_alertmanager_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_alertmanager_pvc_pv_selector: {} + +openshift_prometheus_alertbuffer_storage_type: pvc +openshift_prometheus_alertbuffer_pvc_name: prometheus-alertbuffer +openshift_prometheus_alertbuffer_pvc_size: 10G +openshift_prometheus_alertbuffer_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_alertbuffer_pvc_pv_selector: {} + +# container resources +openshift_prometheus_cpu_limit: null +openshift_prometheus_memory_limit: null +openshift_prometheus_cpu_requests: null +openshift_prometheus_memory_requests: null +openshift_prometheus_alertmanager_cpu_limit: null +openshift_prometheus_alertmanager_memory_limit: null +openshift_prometheus_alertmanager_cpu_requests: null +openshift_prometheus_alertmanager_memory_requests: null +openshift_prometheus_alertbuffer_cpu_limit: null +openshift_prometheus_alertbuffer_memory_limit: null +openshift_prometheus_alertbuffer_cpu_requests: null +openshift_prometheus_alertbuffer_memory_requests: null +openshift_prometheus_oauth_proxy_cpu_limit: null +openshift_prometheus_oauth_proxy_memory_limit: null +openshift_prometheus_oauth_proxy_cpu_requests: null +openshift_prometheus_oauth_proxy_memory_requests: null diff --git a/roles/openshift_prometheus/files/openshift_prometheus.exports b/roles/openshift_prometheus/files/openshift_prometheus.exports new file mode 100644 index 000000000..3ccedb1fd --- /dev/null +++ 
b/roles/openshift_prometheus/files/openshift_prometheus.exports @@ -0,0 +1,3 @@ +/exports/prometheus *(rw,no_root_squash,no_wdelay) +/exports/prometheus-alertmanager *(rw,no_root_squash,no_wdelay) +/exports/prometheus-alertbuffer *(rw,no_root_squash,no_wdelay) diff --git a/roles/openshift_prometheus/meta/main.yaml b/roles/openshift_prometheus/meta/main.yaml new file mode 100644 index 000000000..33188bb7e --- /dev/null +++ b/roles/openshift_prometheus/meta/main.yaml @@ -0,0 +1,19 @@ +--- +galaxy_info: +  author: OpenShift Development <dev@lists.openshift.redhat.com> +  description: Deploy OpenShift prometheus integration for the cluster +  company: Red Hat, Inc. +  license: license (Apache) +  min_ansible_version: 2.2 +  platforms: +  - name: EL +    versions: +    - 7 +  - name: Fedora +    versions: +    - all +  categories: +  - openshift +dependencies: +- { role: lib_openshift } +- { role: openshift_facts } diff --git a/roles/openshift_prometheus/tasks/create_pvs.yaml b/roles/openshift_prometheus/tasks/create_pvs.yaml new file mode 100644 index 000000000..4e79da05f --- /dev/null +++ b/roles/openshift_prometheus/tasks/create_pvs.yaml @@ -0,0 +1,36 @@ +--- +# Check for existence and then conditionally: +# - evaluate templates +# - PVs +# +# These tasks idempotently create required Prometheus PV objects. Do not +# call this file directly. This file is intended to be run as an +# include that has a 'with_items' attached to it. 
Hence the use below +# of variables like "{{ item.pv_label }}" + +- name: "Check if the {{ item.pv_label }} template has been created already" +  oc_obj: +    namespace: "{{ openshift_prometheus_namespace }}" +    state: list +    kind: pv +    name: "{{ item.pv_name }}" +  register: prom_pv_check + +# Skip all of this if the PV already exists +- block: +    - name: "Ensure the {{ item.pv_label }} template is evaluated" +      template: +        src: "{{ item.pv_template }}.j2" +        dest: "{{ tempdir }}/templates/{{ item.pv_template }}" + +    - name: "Ensure {{ item.pv_label }} is created" +      oc_obj: +        namespace: "{{ openshift_prometheus_namespace }}" +        kind: pv +        name: "{{ item.pv_name }}" +        state: present +        delete_after: True +        files: +          - "{{ tempdir }}/templates/{{ item.pv_template }}" +  when: +    - not prom_pv_check.results.results.0 diff --git a/roles/openshift_prometheus/tasks/install_prometheus.yaml b/roles/openshift_prometheus/tasks/install_prometheus.yaml new file mode 100644 index 000000000..93bdda3e8 --- /dev/null +++ b/roles/openshift_prometheus/tasks/install_prometheus.yaml @@ -0,0 +1,241 @@ +--- + +# namespace +- name: Add prometheus project +  oc_project: +    state: "{{ state }}" +    name: "{{ openshift_prometheus_namespace }}" +    node_selector: "{{ openshift_prometheus_node_selector | oo_selector_to_string_list() }}" +    description: Prometheus + +# secrets +- name: Set alert and prometheus secrets +  oc_secret: +    state: "{{ state }}" +    name: "{{ item }}-proxy" +    namespace: "{{ openshift_prometheus_namespace }}" +    contents: +      - path: session_secret +        data: "{{ 43 | oo_random_word }}=" +  with_items: +    - prometheus +    - alerts + +# serviceaccount +- name: create prometheus serviceaccount +  oc_serviceaccount: +    state: "{{ state }}" +    name: prometheus +    namespace: "{{ openshift_prometheus_namespace }}" +    #    TODO add annotations when supported 
+    #    annotations: +    #      serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' +    #      serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' + +    secrets: +      - prometheus-secrets +  changed_when: no + +# TODO remove this when annotations are supported by oc_serviceaccount +- name: annotate serviceaccount +  command: > +    {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} +    serviceaccount prometheus +    serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' +    serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' + + +# create clusterrolebinding for prometheus serviceaccount +- name: Set cluster-reader permissions for prometheus +  oc_adm_policy_user: +    state: "{{ state }}" +    namespace: "{{ openshift_prometheus_namespace }}" +    resource_kind: cluster-role +    resource_name: cluster-reader +    user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus" + + +###################################################################### +# NFS +# In the case that we are not running on a cloud provider, volumes must be statically provisioned + +- include: nfs.yaml +  when: not (openshift_cloudprovider_kind is defined and (openshift_cloudprovider_kind == 'aws' or openshift_cloudprovider_kind == 'gce')) + + +# create prometheus and alerts services +# TODO join into 1 task with loop +- name: Create prometheus service +  oc_service: +    state: "{{ state }}" +    name: "{{ item.name }}" +    namespace: "{{ openshift_prometheus_namespace }}" +    selector: +      
app: prometheus +    labels: +      name: "{{ item.name }}" +      #    TODO add annotations when supported +      #    annotations: +      #      service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls" +    ports: +      - port: 443 +        targetPort: 8443 +  with_items: +    - name: prometheus + +- name: Create alerts service +  oc_service: +    state: "{{ state }}" +    name: "{{ item.name }}" +    namespace: "{{ openshift_prometheus_namespace }}" +    selector: +      app: prometheus +    labels: +      name: "{{ item.name }}" +      #    TODO add annotations when supported +      #    annotations: +      #      service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls" +    ports: +      - port: 443 +        targetPort: 9443 +  with_items: +    - name: alerts + + +# Annotate services with secret name +# TODO remove this when annotations are supported by oc_service +- name: annotate prometheus service +  command: > +    {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} +    service prometheus 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls' + +- name: annotate alerts service +  command: > +    {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} +    service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls' + +# create prometheus and alerts routes +- name: create prometheus and alerts routes +  oc_route: +    state: "{{ state }}" +    name: "{{ item.name }}" +    namespace: "{{ openshift_prometheus_namespace }}" +    service_name: "{{ item.name }}" +    tls_termination: reencrypt +  with_items: +    - name: prometheus +    - name: alerts + +# Storage +- name: create prometheus pvc +  oc_pvc: +    namespace: "{{ openshift_prometheus_namespace }}" +    name: "{{ openshift_prometheus_pvc_name }}" +    access_modes: "{{ openshift_prometheus_pvc_access_modes }}" +    volume_capacity: "{{ 
openshift_prometheus_pvc_size }}" +    selector: "{{ openshift_prometheus_pvc_pv_selector }}" + +- name: create alertmanager pvc +  oc_pvc: +    namespace: "{{ openshift_prometheus_namespace }}" +    name: "{{ openshift_prometheus_alertmanager_pvc_name }}" +    access_modes: "{{ openshift_prometheus_alertmanager_pvc_access_modes }}" +    volume_capacity: "{{ openshift_prometheus_alertmanager_pvc_size }}" +    selector: "{{ openshift_prometheus_alertmanager_pvc_pv_selector }}" + +- name: create alertbuffer pvc +  oc_pvc: +    namespace: "{{ openshift_prometheus_namespace }}" +    name: "{{ openshift_prometheus_alertbuffer_pvc_name }}" +    access_modes: "{{ openshift_prometheus_alertbuffer_pvc_access_modes }}" +    volume_capacity: "{{ openshift_prometheus_alertbuffer_pvc_size }}" +    selector: "{{ openshift_prometheus_alertbuffer_pvc_pv_selector }}" + +# create prometheus deployment +- name: Set prometheus deployment template +  template: +    src: prometheus_deployment.j2 +    dest: "{{ tempdir }}/templates/prometheus.yaml" +  vars: +    namespace: "{{ openshift_prometheus_namespace }}" +    prom_replicas: "{{ openshift_prometheus_replicas }}" + +- name: Set prometheus deployment +  oc_obj: +    state: "{{ state }}" +    name: "prometheus" +    namespace: "{{ openshift_prometheus_namespace }}" +    kind: deployment +    files: +      - "{{ tempdir }}/templates/prometheus.yaml" +    delete_after: true + +# prometheus configmap +# Copy the additional rules file if it is defined +- name: Copy additional rules file to host +  copy: +    src: "{{ openshift_prometheus_additional_rules_file }}" +    dest: "{{ tempdir }}/prometheus.additional.rules" +  when: +    - openshift_prometheus_additional_rules_file is defined +    - openshift_prometheus_additional_rules_file is not none +    - openshift_prometheus_additional_rules_file | trim | length > 0 + +- stat: +    path: "{{ tempdir }}/prometheus.additional.rules" +  register: additional_rules_stat + +# The kubernetes 
version impacts the prometheus scraping endpoint +# so gathering it before constructing the configmap +- name: get oc version +  oc_version: +  register: oc_version + +- set_fact: +    kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}" + +- template: +    src: prometheus.yml.j2 +    dest: "{{ tempdir }}/prometheus.yml" +  changed_when: no + +- template: +    src: prometheus.rules.j2 +    dest: "{{ tempdir }}/prometheus.rules" +  changed_when: no + +# In prometheus configmap create "additional.rules" section if file exists +- name: Set prometheus configmap +  oc_configmap: +    state: "{{ state }}" +    name: "prometheus" +    namespace: "{{ openshift_prometheus_namespace }}" +    from_file: +      prometheus.rules: "{{ tempdir }}/prometheus.rules" +      prometheus.additional.rules: "{{ tempdir }}/prometheus.additional.rules" +      prometheus.yml: "{{ tempdir }}/prometheus.yml" +  when: additional_rules_stat.stat.exists == True + +- name: Set prometheus configmap +  oc_configmap: +    state: "{{ state }}" +    name: "prometheus" +    namespace: "{{ openshift_prometheus_namespace }}" +    from_file: +      prometheus.rules: "{{ tempdir }}/prometheus.rules" +      prometheus.yml: "{{ tempdir }}/prometheus.yml" +  when: additional_rules_stat.stat.exists == False + +# alertmanager configmap +- template: +    src: alertmanager.yml.j2 +    dest: "{{ tempdir }}/alertmanager.yml" +  changed_when: no + +- name: Set alertmanager configmap +  oc_configmap: +    state: "{{ state }}" +    name: "prometheus-alerts" +    namespace: "{{ openshift_prometheus_namespace }}" +    from_file: +      alertmanager.yml: "{{ tempdir }}/alertmanager.yml" diff --git a/roles/openshift_prometheus/tasks/main.yaml b/roles/openshift_prometheus/tasks/main.yaml new file mode 100644 index 000000000..523a64334 --- /dev/null +++ b/roles/openshift_prometheus/tasks/main.yaml @@ -0,0 +1,26 @@ +--- + +- name: Create temp directory for doing work in on target +  command: mktemp -td 
openshift-prometheus-ansible-XXXXXX +  register: mktemp +  changed_when: False + +- set_fact: +    tempdir: "{{ mktemp.stdout }}" + +- name: Create templates subdirectory +  file: +    state: directory +    path: "{{ tempdir }}/templates" +    mode: 0755 +  changed_when: False + +- include: install_prometheus.yaml +  vars: +    state: "{{ openshift_prometheus_state }}" + +- name: Delete temp directory +  file: +    name: "{{ tempdir }}" +    state: absent +  changed_when: False diff --git a/roles/openshift_prometheus/tasks/nfs.yaml b/roles/openshift_prometheus/tasks/nfs.yaml new file mode 100644 index 000000000..0b45f2cee --- /dev/null +++ b/roles/openshift_prometheus/tasks/nfs.yaml @@ -0,0 +1,44 @@ +--- +# Tasks to statically provision NFS volumes +# Include if not using dynamic volume provisioning +- name: Ensure the /exports/ directory exists +  file: +    path: /exports/ +    state: directory +    mode: 0755 +    owner: root +    group: root + +- name: Ensure the prom-pv0X export directories exist +  file: +    path: "/exports/{{ item }}" +    state: directory +    mode: 0777 +    owner: nfsnobody +    group: nfsnobody +  with_items: "{{ openshift_prometheus_pv_exports }}" + +- name: Ensure the NFS exports for Prometheus PVs exist +  copy: +    src: openshift_prometheus.exports +    dest: /etc/exports.d/openshift_prometheus.exports +  register: nfs_exports_updated + +- name: Ensure the NFS export table is refreshed if exports were added +  command: exportfs -ar +  when: +    - nfs_exports_updated.changed + + +###################################################################### +# Create the required Prometheus PVs. 
Check out these online docs if you +# need a refresher on includes looping with items: +# * http://docs.ansible.com/ansible/playbooks_loops.html#loops-and-includes-in-2-0 +# * http://stackoverflow.com/a/35128533 +# +# TODO: Handle the case where a PV template is updated in +# openshift-ansible and the change needs to be landed on the managed +# cluster. + +- include: create_pvs.yaml +  with_items: "{{ openshift_prometheus_pv_data }}" diff --git a/roles/openshift_prometheus/templates/alertmanager.yml.j2 b/roles/openshift_prometheus/templates/alertmanager.yml.j2 new file mode 100644 index 000000000..6c432a3d0 --- /dev/null +++ b/roles/openshift_prometheus/templates/alertmanager.yml.j2 @@ -0,0 +1,20 @@ +global: + +# The root route on which each incoming alert enters. +route: +  # default route if none match +  receiver: alert-buffer-wh + +  # The labels by which incoming alerts are grouped together. For example, +  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would +  # be batched into a single group. +  # TODO: +  group_by: [] + +  # All the above attributes are inherited by all child routes and can +  # overwritten on each. 
+ +receivers: +- name: alert-buffer-wh +  webhook_configs: +  - url: http://localhost:9099/topics/alerts diff --git a/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 new file mode 100644 index 000000000..55a5e19c3 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: +  name: prometheus-alertbuffer +  labels: +    storage: prometheus-alertbuffer +spec: +  capacity: +    storage: 15Gi +  accessModes: +    - ReadWriteOnce +  nfs: +    path: /exports/prometheus-alertbuffer +    server: {{ openshift_prometheus_nfs_server }} +  persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 new file mode 100644 index 000000000..4ee518735 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: +  name: prometheus-alertmanager +  labels: +    storage: prometheus-alertmanager +spec: +  capacity: +    storage: 15Gi +  accessModes: +    - ReadWriteOnce +  nfs: +    path: /exports/prometheus-alertmanager +    server: {{ openshift_prometheus_nfs_server }} +  persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 new file mode 100644 index 000000000..933bf0f60 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: +  name: prometheus +  labels: +    storage: prometheus +spec: +  capacity: +    storage: 15Gi +  accessModes: +    - ReadWriteOnce +  nfs: +    path: /exports/prometheus +    server: {{ openshift_prometheus_nfs_server }} +  persistentVolumeReclaimPolicy: Retain diff --git 
a/roles/openshift_prometheus/templates/prometheus.rules.j2 b/roles/openshift_prometheus/templates/prometheus.rules.j2 new file mode 100644 index 000000000..e861dc127 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.rules.j2 @@ -0,0 +1,4 @@ +groups: +- name: example-rules +  interval: 30s # defaults to global interval +  rules: diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2 new file mode 100644 index 000000000..63430f834 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,174 @@ +rule_files: +  - 'prometheus.rules' +{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %} +  - 'prometheus.additional.rules' +{% endif %} + + + +# A scrape configuration for running Prometheus on a Kubernetes cluster. +# This uses separate scrape configs for cluster components (i.e. API server, node) +# and services to allow each to use different authentication configs. +# +# Kubernetes labels will be added as Prometheus labels on metrics via the +# `labelmap` relabeling action. + +# Scrape config for API servers. +# +# Kubernetes exposes API servers as endpoints to the default/kubernetes +# service so this uses `endpoints` role and uses relabelling to only keep +# the endpoints associated with the default/kubernetes service using the +# default named port `https`. This works for single API server deployments as +# well as HA API server deployments. +scrape_configs: +- job_name: 'kubernetes-apiservers' + +  kubernetes_sd_configs: +  - role: endpoints + +  scheme: https +  tls_config: +    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +  # Keep only the default/kubernetes service endpoints for the https port. 
This +  # will add targets for each API server which Kubernetes adds an endpoint to +  # the default/kubernetes service. +  relabel_configs: +  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] +    action: keep +    regex: default;kubernetes;https + +# Scrape config for nodes. +# +# Each node exposes a /metrics endpoint that contains operational metrics for +# the Kubelet and other components. +- job_name: 'kubernetes-nodes' + +  scheme: https +  tls_config: +    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +  kubernetes_sd_configs: +  - role: node + +  relabel_configs: +  - action: labelmap +    regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for controllers. +# +# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for +# the controllers. +# +# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via +#       endpoints. +- job_name: 'kubernetes-controllers' + +  scheme: https +  tls_config: +    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +  kubernetes_sd_configs: +  - role: endpoints + +  # Keep only the default/kubernetes service endpoints for the https port, and then +  # set the port to 8444. This is the default configuration for the controllers on OpenShift +  # masters. +  relabel_configs: +  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] +    action: keep +    regex: default;kubernetes;https +  - source_labels: [__address__] +    action: replace +    target_label: __address__ +    regex: (.+)(?::\d+) +    replacement: $1:8444 + +# Scrape config for cAdvisor. 
+# +# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that +# reports container metrics for each running pod. Scrape those by default. +- job_name: 'kubernetes-cadvisor' + +  scheme: https +  tls_config: +    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +{% if kubernetes_version | float() >= 1.7 | float() %} +  metrics_path: /metrics/cadvisor +{% else %} +  metrics_path: /metrics +{% endif %} + +  kubernetes_sd_configs: +  - role: node + +  relabel_configs: +  - action: labelmap +    regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for service endpoints. +# +# The relabeling allows the actual service scrape endpoint to be configured +# via the following annotations: +# +# * `prometheus.io/scrape`: Only scrape services that have a value of `true` +# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# to set this to `https` & most likely set the `tls_config` of the scrape config. +# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# * `prometheus.io/port`: If the metrics are exposed on a different port to the +# service then set this appropriately. +- job_name: 'kubernetes-service-endpoints' + +  tls_config: +    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +    # TODO: this should be per target +    insecure_skip_verify: true + +  kubernetes_sd_configs: +  - role: endpoints + +  relabel_configs: +  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] +    action: keep +    regex: true +  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] +    action: replace +    target_label: __scheme__ +    regex: (https?) 
+  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] +    action: replace +    target_label: __metrics_path__ +    regex: (.+) +  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] +    action: replace +    target_label: __address__ +    regex: (.+)(?::\d+);(\d+) +    replacement: $1:$2 +  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] +    action: replace +    target_label: __basic_auth_username__ +    regex: (.+) +  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] +    action: replace +    target_label: __basic_auth_password__ +    regex: (.+) +  - action: labelmap +    regex: __meta_kubernetes_service_label_(.+) +  - source_labels: [__meta_kubernetes_namespace] +    action: replace +    target_label: kubernetes_namespace +  - source_labels: [__meta_kubernetes_service_name] +    action: replace +    target_label: kubernetes_name + +alerting: +  alertmanagers: +  - scheme: http +    static_configs: +    - targets: +      - "localhost:9093" diff --git a/roles/openshift_prometheus/templates/prometheus_deployment.j2 b/roles/openshift_prometheus/templates/prometheus_deployment.j2 new file mode 100644 index 000000000..98c117f19 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus_deployment.j2 @@ -0,0 +1,240 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: +  name: prometheus +  namespace: {{ namespace }} +  labels: +    app: prometheus +spec: +  replicas: {{ prom_replicas|default(1) }} +  selector: +    provider: openshift +    matchLabels: +      app: prometheus +  template: +    metadata: +      name: prometheus +      labels: +        app: prometheus +    spec: +      serviceAccountName: prometheus +{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %} +      nodeSelector: +{% for key, value in openshift_prometheus_node_selector.iteritems() %} +        {{key}}: 
"{{value}}" +{% endfor %} +{% endif %} +      containers: +      # Deploy Prometheus behind an oauth proxy +      - name: prom-proxy +        image: "{{ openshift_prometheus_image_proxy }}" +        imagePullPolicy: IfNotPresent +        resources: +          requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} +            memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} +            cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} +          limits: +{% if openshift_prometheus_memory_requests_limit_proxy is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} +            memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} +            cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} +        ports: +        - containerPort: 8443 +          name: web +        args: +        - -provider=openshift +        - -https-address=:8443 +        - -http-address= +        - -email-domain=* +        - -upstream=http://localhost:9090 +        - -client-id=system:serviceaccount:{{ namespace }}:prometheus +        - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' +        - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' +        - -tls-cert=/etc/tls/private/tls.crt +        - -tls-key=/etc/tls/private/tls.key +        - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token +        - -cookie-secret-file=/etc/proxy/secrets/session_secret +        - 
-skip-auth-regex=^/metrics +        volumeMounts: +        - mountPath: /etc/tls/private +          name: prometheus-tls +        - mountPath: /etc/proxy/secrets +          name: prometheus-secrets +        - mountPath: /prometheus +          name: prometheus-data + +      - name: prometheus +        args: +        - --storage.tsdb.retention=6h +        - --config.file=/etc/prometheus/prometheus.yml +        - --web.listen-address=localhost:9090 +        image: "{{ openshift_prometheus_image_prometheus }}" +        imagePullPolicy: IfNotPresent +        resources: +          requests: +{% if openshift_prometheus_memory_requests is defined and openshift_prometheus_memory_requests is not none %} +            memory: "{{openshift_prometheus_memory_requests}}" +{% endif %} +{% if openshift_prometheus_cpu_requests is defined and openshift_prometheus_cpu_requests is not none %} +            cpu: "{{openshift_prometheus_cpu_requests}}" +{% endif %} +          limits: +{% if openshift_prometheus_memory_limit is defined and openshift_prometheus_memory_limit is not none %} +            memory: "{{ openshift_prometheus_memory_limit }}" +{% endif %} +{% if openshift_prometheus_cpu_limit is defined and openshift_prometheus_cpu_limit is not none %} +            cpu: "{{openshift_prometheus_cpu_limit}}" +{% endif %} + +        volumeMounts: +        - mountPath: /etc/prometheus +          name: prometheus-config +        - mountPath: /prometheus +          name: prometheus-data + +      # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy +      - name: alerts-proxy +        image: "{{ openshift_prometheus_image_proxy }}" +        imagePullPolicy: IfNotPresent +        resources: +          requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} +            memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if 
openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} +            cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} +          limits: +{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} +            memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} +            cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} +        ports: +        - containerPort: 9443 +          name: web +        args: +        - -provider=openshift +        - -https-address=:9443 +        - -http-address= +        - -email-domain=* +        - -upstream=http://localhost:9099 +        - -client-id=system:serviceaccount:{{ namespace }}:prometheus +        - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' +        - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' +        - -tls-cert=/etc/tls/private/tls.crt +        - -tls-key=/etc/tls/private/tls.key +        - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token +        - -cookie-secret-file=/etc/proxy/secrets/session_secret +        volumeMounts: +        - mountPath: /etc/tls/private +          name: alerts-tls +        - mountPath: /etc/proxy/secrets +          name: alerts-secrets + +      - name: alert-buffer +        args: +        - --storage-path=/alert-buffer/messages.db +        image: "{{ openshift_prometheus_image_alertbuffer }}" +        imagePullPolicy: IfNotPresent +        resources: +          requests: +{% if openshift_prometheus_alertbuffer_memory_requests is defined and 
openshift_prometheus_alertbuffer_memory_requests is not none %} +            memory: "{{openshift_prometheus_alertbuffer_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_requests is defined and openshift_prometheus_alertbuffer_cpu_requests is not none %} +            cpu: "{{openshift_prometheus_alertbuffer_cpu_requests}}" +{% endif %} +          limits: +{% if openshift_prometheus_alertbuffer_memory_limit is defined and openshift_prometheus_alertbuffer_memory_limit is not none %} +            memory: "{{openshift_prometheus_alertbuffer_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_limit is defined and openshift_prometheus_alertbuffer_cpu_limit is not none %} +            cpu: "{{openshift_prometheus_alertbuffer_cpu_limit}}" +{% endif %} +        volumeMounts: +        - mountPath: /alert-buffer +          name: alert-buffer-data +        ports: +        - containerPort: 9099 +          name: alert-buf + +      - name: alertmanager +        args: +        - -config.file=/etc/alertmanager/alertmanager.yml +        image: "{{ openshift_prometheus_image_alertmanager }}" +        imagePullPolicy: IfNotPresent +        resources: +          requests: +{% if openshift_prometheus_alertmanager_memory_requests is defined and openshift_prometheus_alertmanager_memory_requests is not none %} +            memory: "{{openshift_prometheus_alertmanager_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_requests is defined and openshift_prometheus_alertmanager_cpu_requests is not none %} +            cpu: "{{openshift_prometheus_alertmanager_cpu_requests}}" +{% endif %} +          limits: +{% if openshift_prometheus_alertmanager_memory_limit is defined and openshift_prometheus_alertmanager_memory_limit is not none %} +            memory: "{{openshift_prometheus_alertmanager_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_limit is defined and 
openshift_prometheus_alertmanager_cpu_limit is not none %} +            cpu: "{{openshift_prometheus_alertmanager_cpu_limit}}" +{% endif %} +        ports: +        - containerPort: 9093 +          name: web +        volumeMounts: +        - mountPath: /etc/alertmanager +          name: alertmanager-config +        - mountPath: /alertmanager +          name: alertmanager-data + +      restartPolicy: Always +      volumes: +      - name: prometheus-config +        configMap: +          defaultMode: 420 +          name: prometheus +      - name: prometheus-secrets +        secret: +          secretName: prometheus-proxy +      - name: prometheus-tls +        secret: +          secretName: prometheus-tls +      - name: prometheus-data +{% if openshift_prometheus_storage_type == 'pvc' %} +        persistentVolumeClaim: +          claimName: {{ openshift_prometheus_pvc_name }} +{% else %} +        emptydir: {} +{% endif %} +      - name: alertmanager-config +        configMap: +          defaultMode: 420 +          name: prometheus-alerts +      - name: alerts-secrets +        secret: +          secretName: alerts-proxy +      - name: alerts-tls +        secret: +          secretName: prometheus-alerts-tls +      - name: alertmanager-data +{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %} +        persistentVolumeClaim: +          claimName: {{ openshift_prometheus_alertmanager_pvc_name }} +{% else %} +        emptydir: {} +{% endif %} +      - name: alert-buffer-data +{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %} +        persistentVolumeClaim: +          claimName: {{ openshift_prometheus_alertbuffer_pvc_name }} +{% else %} +        emptydir: {} +{% endif %} diff --git a/roles/openshift_prometheus/tests/inventory b/roles/openshift_prometheus/tests/inventory new file mode 100644 index 000000000..878877b07 --- /dev/null +++ b/roles/openshift_prometheus/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git 
a/roles/openshift_prometheus/tests/test.yaml b/roles/openshift_prometheus/tests/test.yaml new file mode 100644 index 000000000..37baf573c --- /dev/null +++ b/roles/openshift_prometheus/tests/test.yaml @@ -0,0 +1,5 @@ +--- +- hosts: localhost +  remote_user: root +  roles: +    - openshift_prometheus | 
