diff options
40 files changed, 1138 insertions, 37 deletions
diff --git a/filter_plugins/oo_filters.py b/filter_plugins/oo_filters.py index 36a90a870..277695f78 100644 --- a/filter_plugins/oo_filters.py +++ b/filter_plugins/oo_filters.py @@ -1024,6 +1024,18 @@ def oo_contains_rule(source, apiGroups, resources, verbs): return False +def oo_selector_to_string_list(user_dict): + """Convert a dict of selectors to a key=value list of strings + +Given input of {'region': 'infra', 'zone': 'primary'} returns a list +of items as ['region=infra', 'zone=primary'] + """ + selectors = [] + for key in user_dict: + selectors.append("{}={}".format(key, user_dict[key])) + return selectors + + class FilterModule(object): """ Custom ansible filter mapping """ @@ -1065,5 +1077,6 @@ class FilterModule(object): "oo_openshift_loadbalancer_backends": oo_openshift_loadbalancer_backends, "to_padded_yaml": to_padded_yaml, "oo_random_word": oo_random_word, - "oo_contains_rule": oo_contains_rule + "oo_contains_rule": oo_contains_rule, + "oo_selector_to_string_list": oo_selector_to_string_list } diff --git a/playbooks/byo/openshift-cluster/openshift-prometheus.yml b/playbooks/byo/openshift-cluster/openshift-prometheus.yml new file mode 100644 index 000000000..15917078d --- /dev/null +++ b/playbooks/byo/openshift-cluster/openshift-prometheus.yml @@ -0,0 +1,4 @@ +--- +- include: initialize_groups.yml + +- include: ../../common/openshift-cluster/openshift_prometheus.yml diff --git a/playbooks/common/openshift-cluster/openshift_hosted.yml b/playbooks/common/openshift-cluster/openshift_hosted.yml index 99a634970..a391b963a 100644 --- a/playbooks/common/openshift-cluster/openshift_hosted.yml +++ b/playbooks/common/openshift-cluster/openshift_hosted.yml @@ -49,6 +49,9 @@ - role: cockpit-ui when: ( openshift.common.version_gte_3_3_or_1_3 | bool ) and ( openshift_hosted_manage_registry | default(true) | bool ) and not (openshift.docker.hosted_registry_insecure | default(false) | bool) + - role: openshift_prometheus + when: openshift_hosted_prometheus_deploy | default(false) | bool + - name: Update master-config for publicLoggingURL hosts: oo_masters_to_config:!oo_first_master tags: diff --git a/playbooks/common/openshift-cluster/openshift_prometheus.yml b/playbooks/common/openshift-cluster/openshift_prometheus.yml new file mode 100644 index 000000000..a979c0c00 --- /dev/null +++ b/playbooks/common/openshift-cluster/openshift_prometheus.yml @@ -0,0 +1,9 @@ +--- +- include: std_include.yml + +- name: OpenShift Prometheus + hosts: oo_first_master + roles: + - openshift_prometheus + vars: + openshift_prometheus_state: present diff --git a/roles/docker/tasks/systemcontainer_crio.yml b/roles/docker/tasks/systemcontainer_crio.yml index 8f81273e5..24ca0d9f8 100644 --- a/roles/docker/tasks/systemcontainer_crio.yml +++ b/roles/docker/tasks/systemcontainer_crio.yml @@ -3,6 +3,15 @@ - set_fact: l_insecure_crio_registries: "{{ '\"{}\"'.format('\", \"'.join(openshift.docker.insecure_registries)) }}" when: openshift.docker.insecure_registries +- set_fact: + l_crio_registries: "{{ openshift.docker.additional_registries + ['docker.io'] }}" + when: openshift.docker.additional_registries +- set_fact: + l_crio_registries: "{{ ['docker.io'] }}" + when: not openshift.docker.additional_registries +- set_fact: + l_additional_crio_registries: "{{ '\"{}\"'.format('\", \"'.join(l_crio_registries)) }}" + when: openshift.docker.additional_registries - name: Ensure container-selinux is installed package: diff --git a/roles/docker/templates/crio.conf.j2 b/roles/docker/templates/crio.conf.j2 index 5b31932b1..b4ee84fd0 100644 --- a/roles/docker/templates/crio.conf.j2 +++ b/roles/docker/templates/crio.conf.j2 @@ -120,6 +120,11 @@ insecure_registries = [ {{ l_insecure_crio_registries|default("") }} ] +# registries is used to specify a comma separated list of registries to be used +# when pulling an unqualified image (e.g. fedora:rawhide). +registries = [ +{{ l_additional_crio_registries|default("") }} +] # The "crio.network" table contains settings pertaining to the # management of CNI plugins. [crio.network] diff --git a/roles/flannel_register/defaults/main.yaml b/roles/flannel_register/defaults/main.yaml index ddf8230ec..71c8f38c3 100644 --- a/roles/flannel_register/defaults/main.yaml +++ b/roles/flannel_register/defaults/main.yaml @@ -1,7 +1,6 @@ --- -flannel_network: "{{ openshift.common.portal_net | default('172.30.0.0/16', true) }}" -flannel_min_network: 172.30.5.0 -flannel_subnet_len: 24 +flannel_network: "{{ openshift.master.sdn_cluster_network_cidr }}" +flannel_subnet_len: "{{ 32 - openshift.master.sdn_host_subnet_length }}" flannel_etcd_key: /openshift.com/network etcd_hosts: "{{ etcd_urls }}" etcd_conf_dir: "{{ openshift.common.config_base }}/master" diff --git a/roles/flannel_register/templates/flannel-config.json b/roles/flannel_register/templates/flannel-config.json index 89ce4c30b..bba3729fa 100644 --- a/roles/flannel_register/templates/flannel-config.json +++ b/roles/flannel_register/templates/flannel-config.json @@ -1,7 +1,6 @@ { "Network": "{{ flannel_network }}", "SubnetLen": {{ flannel_subnet_len }}, - "SubnetMin": "{{ flannel_min_network }}", "Backend": { "Type": "host-gw" } diff --git a/roles/openshift_cfme/defaults/main.yml b/roles/openshift_cfme/defaults/main.yml index 8aa57e75a..b82c2e602 100644 --- a/roles/openshift_cfme/defaults/main.yml +++ b/roles/openshift_cfme/defaults/main.yml @@ -27,9 +27,6 @@ openshift_cfme_pv_data: # Tuning parameter to use more than 5 images at once from an ImageStream openshift_cfme_maxImagesBulkImportedPerRepository: 100 -# Hostname/IP of the NFS server. Currently defaults to first master -openshift_cfme_nfs_server: "{{ groups.nfs.0 }}" -openshift_cfme_nfs_directory: "/exports" # TODO: Refactor '_install_app' variable. This is just for testing but # maybe in the future it should control the entire yes/no for CFME. # diff --git a/roles/openshift_cfme/tasks/nfs.yml b/roles/openshift_cfme/tasks/nfs.yml index 8db45492e..ca04628a8 100644 --- a/roles/openshift_cfme/tasks/nfs.yml +++ b/roles/openshift_cfme/tasks/nfs.yml @@ -1,6 +1,13 @@ --- # Tasks to statically provision NFS volumes # Include if not using dynamic volume provisioning + +- name: Set openshift_cfme_nfs_server fact + when: openshift_cfme_nfs_server is not defined + set_fact: + # Hostname/IP of the NFS server. Currently defaults to first master + openshift_cfme_nfs_server: "{{ oo_nfs_to_config.0 }}" + - name: Ensure the /exports/ directory exists file: path: /exports/ diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py index c8769b511..db3c0b654 100644 --- a/roles/openshift_health_checker/library/aos_version.py +++ b/roles/openshift_health_checker/library/aos_version.py @@ -26,15 +26,13 @@ from ansible.module_utils.six import string_types YUM_IMPORT_EXCEPTION = None DNF_IMPORT_EXCEPTION = None -PKG_MGR = None try: import yum # pylint: disable=import-error - PKG_MGR = "yum" except ImportError as err: YUM_IMPORT_EXCEPTION = err + try: import dnf # pylint: disable=import-error - PKG_MGR = "dnf" except ImportError as err: DNF_IMPORT_EXCEPTION = err @@ -51,14 +49,19 @@ def main(): module = AnsibleModule( argument_spec=dict( package_list=dict(type="list", required=True), + package_mgr=dict(type="str", required=True), ), supports_check_mode=True ) - if YUM_IMPORT_EXCEPTION and DNF_IMPORT_EXCEPTION: + # determine the package manager to use + package_mgr = module.params['package_mgr'] + if package_mgr not in ('yum', 'dnf'): + module.fail_json(msg="package_mgr must be one of: yum, dnf") + pkg_mgr_exception = dict(yum=YUM_IMPORT_EXCEPTION, dnf=DNF_IMPORT_EXCEPTION)[package_mgr] + if pkg_mgr_exception: module.fail_json( - msg="aos_version module could not import yum or dnf: %s %s" % - (YUM_IMPORT_EXCEPTION, DNF_IMPORT_EXCEPTION) + msg="aos_version module could not import {}: {}".format(package_mgr, pkg_mgr_exception) ) # determine the packages we will look for @@ -78,7 +81,7 @@ def main(): # get the list of packages available and complain if anything is wrong try: - pkgs = _retrieve_available_packages(expected_pkg_names) + pkgs = _retrieve_available_packages(package_mgr, expected_pkg_names) if versioned_pkgs: _check_precise_version_found(pkgs, _to_dict(versioned_pkgs)) _check_higher_version_found(pkgs, _to_dict(versioned_pkgs)) @@ -93,7 +96,7 @@ def _to_dict(pkg_list): return {pkg["name"]: pkg for pkg in pkg_list} -def _retrieve_available_packages(expected_pkgs): +def _retrieve_available_packages(pkg_mgr, expected_pkgs): # The openshift excluder prevents unintended updates to openshift # packages by setting yum excludes on those packages. See: # https://wiki.centos.org/SpecialInterestGroup/PaaS/OpenShift-Origin-Control-Updates @@ -103,14 +106,15 @@ def _retrieve_available_packages(expected_pkgs): # be excluded. So, for our purposes here, disable excludes to see # what will really be available during an install or upgrade. - if PKG_MGR == "yum": + if pkg_mgr == "yum": # search for package versions available for openshift pkgs yb = yum.YumBase() # pylint: disable=invalid-name yb.conf.disable_excludes = ['all'] try: - pkgs = yb.pkgSack.returnPackages(patterns=expected_pkgs) + pkgs = yb.rpmdb.returnPackages(patterns=expected_pkgs) + pkgs += yb.pkgSack.returnPackages(patterns=expected_pkgs) except yum.Errors.PackageSackError as excinfo: # you only hit this if *none* of the packages are available raise AosVersionException('\n'.join([ @@ -118,7 +122,7 @@ def _retrieve_available_packages(expected_pkgs): 'Check your subscription and repo settings.', str(excinfo), ])) - elif PKG_MGR == "dnf": + elif pkg_mgr == "dnf": dbase = dnf.Base() # pyling: disable=invalid-name dbase.conf.disable_excludes = ['all'] @@ -127,8 +131,11 @@ def _retrieve_available_packages(expected_pkgs): dquery = dbase.sack.query() aquery = dquery.available() + iquery = dquery.installed() - pkgs = list(aquery.filter(name=expected_pkgs)) + available_pkgs = list(aquery.filter(name=expected_pkgs)) + installed_pkgs = list(iquery.filter(name=expected_pkgs)) + pkgs = available_pkgs + installed_pkgs if not pkgs: # pkgs list is empty, raise because no expected packages found diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 857a80c74..866c74d7c 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -32,6 +32,7 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): # we use python-docker-py to check local docker for images, and skopeo # to look for images available remotely without waiting to pull them. dependencies = ["python-docker-py", "skopeo"] + skopeo_img_check_command = "timeout 10 skopeo inspect --tls-verify=false" def is_active(self): """Skip hosts with unsupported deployment types.""" @@ -67,8 +68,10 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): "failed": True, "msg": ( "One or more required Docker images are not available:\n {}\n" - "Configured registries: {}" - ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)), + "Configured registries: {}\n" + "Checked by: {}" + ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries), + self.skopeo_img_check_command), } return {} @@ -169,8 +172,7 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): for registry in registries: args = { - "_raw_params": "timeout 10 skopeo inspect --tls-verify=false " - "docker://{}/{}".format(registry, image) + "_raw_params": self.skopeo_img_check_command + " docker://{}/{}".format(registry, image) } result = self.execute_module("command", args) if result.get("rc", 0) == 0 and not result.get("failed"): diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py index 8b780114f..0b795b6c4 100644 --- a/roles/openshift_health_checker/openshift_checks/package_version.py +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -46,6 +46,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): check_multi_minor_release = deployment_type in ['openshift-enterprise'] args = { + "package_mgr": self.get_var("ansible_pkg_mgr"), "package_list": [ { "name": "openvswitch", diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py index 6054d3f3e..e871f39f0 100644 --- a/roles/openshift_health_checker/test/package_version_test.py +++ b/roles/openshift_health_checker/test/package_version_test.py @@ -5,6 +5,7 @@ from openshift_checks.package_version import PackageVersion, OpenShiftCheckExcep def task_vars_for(openshift_release, deployment_type): return dict( + ansible_pkg_mgr='yum', openshift=dict(common=dict(service_type=deployment_type)), openshift_release=openshift_release, openshift_image_tag='v' + openshift_release, @@ -27,6 +28,7 @@ def test_openshift_version_not_supported(): def test_invalid_openshift_release_format(): task_vars = dict( + ansible_pkg_mgr='yum', openshift=dict(common=dict(service_type='origin')), openshift_image_tag='v0', openshift_deployment_type='origin', diff --git a/roles/openshift_hosted/tasks/registry/registry.yml b/roles/openshift_hosted/tasks/registry/registry.yml index 3e424da12..d73c290ff 100644 --- a/roles/openshift_hosted/tasks/registry/registry.yml +++ b/roles/openshift_hosted/tasks/registry/registry.yml @@ -61,6 +61,14 @@ openshift_hosted_registry_env_vars: "{{ openshift_hosted_registry_env_vars | combine({'OPENSHIFT_DEFAULT_REGISTRY':'docker-registry.default.svc:5000'}) }}" when: openshift_push_via_dns | default(false) | bool +- name: Update registry proxy settings for dc/docker-registry + set_fact: + openshift_hosted_registry_env_vars: "{{ {'HTTPS_PROXY': (openshift.common.https_proxy | default('')), + 'HTTP_PROXY': (openshift.common.http_proxy | default('')), + 'NO_PROXY': (openshift.common.no_proxy | default(''))} + | combine(openshift_hosted_registry_env_vars) }}" + when: (openshift.common.https_proxy | default(False)) or (openshift.common.http_proxy | default('')) != '' + - name: Create the registry service account oc_serviceaccount: name: "{{ openshift_hosted_registry_serviceaccount }}" diff --git a/roles/openshift_hosted/tasks/router/router.yml b/roles/openshift_hosted/tasks/router/router.yml index e57ed733e..68ec7233e 100644 --- a/roles/openshift_hosted/tasks/router/router.yml +++ b/roles/openshift_hosted/tasks/router/router.yml @@ -18,6 +18,15 @@ openshift_hosted_router_selector: "{{ openshift.hosted.router.selector | default(None) }}" openshift_hosted_router_image: "{{ openshift.hosted.router.registryurl }}" +- name: Get the certificate contents for router + copy: + backup: True + dest: "/etc/origin/master/{{ item | basename }}" + src: "{{ item }}" + with_items: "{{ openshift_hosted_routers | oo_collect(attribute='certificate') | + oo_select_keys_from_list(['keyfile', 'certfile', 'cafile']) }}" + when: ( not openshift_hosted_router_create_certificate | bool ) or openshift_hosted_router_certificate != {} + # This is for when we desire a cluster signed cert # The certificate is generated and placed in master_config_dir/ - block: @@ -43,15 +52,6 @@ # End Block when: ( openshift_hosted_router_create_certificate | bool ) and openshift_hosted_router_certificate == {} -- name: Get the certificate contents for router - copy: - backup: True - dest: "/etc/origin/master/{{ item | basename }}" - src: "{{ item }}" - with_items: "{{ openshift_hosted_routers | oo_collect(attribute='certificate') | - oo_select_keys_from_list(['keyfile', 'certfile', 'cafile']) }}" - when: not openshift_hosted_router_create_certificate | bool - - name: Create the router service account(s) oc_serviceaccount: name: "{{ item.serviceaccount }}" diff --git a/roles/openshift_logging_curator/templates/curator.j2 b/roles/openshift_logging_curator/templates/curator.j2 index 6431f86d9..e74918a40 100644 --- a/roles/openshift_logging_curator/templates/curator.j2 +++ b/roles/openshift_logging_curator/templates/curator.j2 @@ -44,6 +44,8 @@ spec: cpu: "{{curator_cpu_limit}}" {% if curator_memory_limit is defined and curator_memory_limit is not none and curator_memory_limit != "" %} memory: "{{curator_memory_limit}}" + requests: + memory: "{{curator_memory_limit}}" {% endif %} env: - diff --git a/roles/openshift_logging_elasticsearch/templates/es.j2 b/roles/openshift_logging_elasticsearch/templates/es.j2 index cbe6b89f2..5f2932541 100644 --- a/roles/openshift_logging_elasticsearch/templates/es.j2 +++ b/roles/openshift_logging_elasticsearch/templates/es.j2 @@ -48,7 +48,7 @@ spec: cpu: "{{es_cpu_limit}}" {% endif %} requests: - memory: "512Mi" + memory: "{{es_memory_limit}}" ports: - containerPort: 9200 diff --git a/roles/openshift_logging_fluentd/templates/fluentd.j2 b/roles/openshift_logging_fluentd/templates/fluentd.j2 index 88e039e3f..a4afb6618 100644 --- a/roles/openshift_logging_fluentd/templates/fluentd.j2 +++ b/roles/openshift_logging_fluentd/templates/fluentd.j2 @@ -36,6 +36,8 @@ spec: limits: cpu: {{ openshift_logging_fluentd_cpu_limit }} memory: {{ openshift_logging_fluentd_memory_limit }} + requests: + memory: {{ openshift_logging_fluentd_memory_limit }} volumeMounts: - name: runlogjournal mountPath: /run/log/journal diff --git a/roles/openshift_logging_kibana/templates/kibana.j2 b/roles/openshift_logging_kibana/templates/kibana.j2 index 512d99d06..da1386d3e 100644 --- a/roles/openshift_logging_kibana/templates/kibana.j2 +++ b/roles/openshift_logging_kibana/templates/kibana.j2 @@ -46,6 +46,8 @@ spec: {% endif %} {% if kibana_memory_limit is not none and kibana_memory_limit != "" %} memory: "{{ kibana_memory_limit }}" + requests: + memory: "{{ kibana_memory_limit }}" {% endif %} {% endif %} env: @@ -82,6 +84,8 @@ spec: {% endif %} {% if kibana_proxy_memory_limit is not none and kibana_proxy_memory_limit != "" %} memory: "{{ kibana_proxy_memory_limit }}" + requests: + memory: "{{ kibana_proxy_memory_limit }}" {% endif %} {% endif %} ports: diff --git a/roles/openshift_logging_mux/templates/mux.j2 b/roles/openshift_logging_mux/templates/mux.j2 index 70afe5cee..ff18d3270 100644 --- a/roles/openshift_logging_mux/templates/mux.j2 +++ b/roles/openshift_logging_mux/templates/mux.j2 @@ -45,6 +45,8 @@ spec: {% endif %} {% if mux_memory_limit is not none %} memory: "{{mux_memory_limit}}" + requests: + memory: "{{mux_memory_limit}}" {% endif %} {% endif %} ports: diff --git a/roles/openshift_master/tasks/systemd_units.yml b/roles/openshift_master/tasks/systemd_units.yml index 782a35abe..c480d8223 100644 --- a/roles/openshift_master/tasks/systemd_units.yml +++ b/roles/openshift_master/tasks/systemd_units.yml @@ -7,7 +7,7 @@ # openshift_master_config_dir is set. - name: Set openshift_master_config_dir if unset set_fact: - openshift_master_config_dir: '/var/lib/origin' + openshift_master_config_dir: '/etc/origin/master' when: openshift_master_config_dir is not defined - name: Remove the legacy master service if it exists diff --git a/roles/openshift_node/templates/node.yaml.v1.j2 b/roles/openshift_node/templates/node.yaml.v1.j2 index 711afcadb..f59aa6fb4 100644 --- a/roles/openshift_node/templates/node.yaml.v1.j2 +++ b/roles/openshift_node/templates/node.yaml.v1.j2 @@ -21,8 +21,6 @@ kubeletArguments: {{ openshift.node.kubelet_args | default(None) | to_padded_yam - remote container-runtime-endpoint: - /var/run/crio.sock - experimental-cri: - - 'true' image-service-endpoint: - /var/run/crio.sock node-labels: diff --git a/roles/openshift_prometheus/README.md b/roles/openshift_prometheus/README.md new file mode 100644 index 000000000..c5a44bffb --- /dev/null +++ b/roles/openshift_prometheus/README.md @@ -0,0 +1,95 @@ +OpenShift Prometheus +==================== + +OpenShift Prometheus Installation + +Requirements +------------ + + +Role Variables +-------------- + +For default values, see [`defaults/main.yaml`](defaults/main.yaml). + +- `openshift_prometheus_state`: present - install/update. absent - uninstall. + +- `openshift_prometheus_namespace`: project (i.e. namespace) where the components will be + deployed. + +- `openshift_prometheus_replicas`: The number of replicas for prometheus deployment. + +- `openshift_prometheus_node_selector`: Selector for the nodes prometheus will be deployed on. + +- `openshift_prometheus_image_<COMPONENT>`: specify image for the component + +## Storage related variables +Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can set pv claim by setting corresponding role variable: +``` +openshift_prometheus_<COMPONENT>_storage_type: <VALUE> +openshift_prometheus_<COMPONENT>_pvc_(name|size|access_modes|pv_selector): <VALUE> +``` +e.g +``` +openshift_prometheus_storage_type: pvc +openshift_prometheus_alertmanager_pvc_name: alertmanager +openshift_prometheus_alertbuffer_pvc_size: 10G +openshift_prometheus_pvc_access_modes: [ReadWriteOnce] +``` + +## Additional Alert Rules file variable +An external file with alert rules can be added by setting path to additional rules variable: +``` +openshift_prometheus_additional_rules_file: <PATH> +``` + +File content should be in prometheus alert rules format. +Following example sets rule to fire an alert when one of the cluster nodes is down: + +``` +groups: +- name: example-rules + interval: 30s # defaults to global interval + rules: + - alert: Node Down + expr: up{job="kubernetes-nodes"} == 0 + annotations: + miqTarget: "ContainerNode" + severity: "HIGH" + message: "{{ '{{' }}{{ '$labels.instance' }}{{ '}}' }} is down" +``` + + +## Additional variables to control resource limits +Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can specify a cpu and memory limits and requests by setting +the corresponding role variable: +``` +openshift_prometheus_<COMPONENT>_(limits|requests)_(memory|cpu): <VALUE> +``` +e.g +``` +openshift_prometheus_alertmanager_limits_memory: 1Gi +openshift_prometheus_oath_proxy_requests_cpu: 100 +``` + +Dependencies +------------ + +openshift_facts + + +Example Playbook +---------------- + +``` +- name: Configure openshift-prometheus + hosts: oo_first_master + roles: + - role: openshift_prometheus +``` + +License +------- + +Apache License, Version 2.0 + diff --git a/roles/openshift_prometheus/defaults/main.yaml b/roles/openshift_prometheus/defaults/main.yaml new file mode 100644 index 000000000..18d6a1645 --- /dev/null +++ b/roles/openshift_prometheus/defaults/main.yaml @@ -0,0 +1,74 @@ +--- +# defaults file for openshift_prometheus +openshift_prometheus_state: present + +openshift_prometheus_namespace: prometheus + +openshift_prometheus_replicas: 1 +openshift_prometheus_node_selector: {"region":"infra"} + +# images +openshift_prometheus_image_proxy: "openshift/oauth-proxy:v1.0.0" +openshift_prometheus_image_prometheus: "openshift/prometheus:v2.0.0-dev" +openshift_prometheus_image_alertmanager: "openshift/prometheus-alertmanager:dev" +openshift_prometheus_image_alertbuffer: "ilackarms/message-buffer" + +# additional prometheus rules file +openshift_prometheus_additional_rules_file: null + +# All the required exports +openshift_prometheus_pv_exports: + - prometheus + - prometheus-alertmanager + - prometheus-alertbuffer +# PV template files and their created object names +openshift_prometheus_pv_data: + - pv_name: prometheus + pv_template: prom-pv-server.yml + pv_label: Prometheus Server PV + - pv_name: prometheus-alertmanager + pv_template: prom-pv-alertmanager.yml + pv_label: Prometheus Alertmanager PV + - pv_name: prometheus-alertbuffer + pv_template: prom-pv-alertbuffer.yml + pv_label: Prometheus Alert Buffer PV + +# Hostname/IP of the NFS server. Currently defaults to first master +openshift_prometheus_nfs_server: "{{ groups.nfs.0 }}" + +# storage +openshift_prometheus_storage_type: pvc +openshift_prometheus_pvc_name: prometheus +openshift_prometheus_pvc_size: 10G +openshift_prometheus_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_pvc_pv_selector: {} + +openshift_prometheus_alertmanager_storage_type: pvc +openshift_prometheus_alertmanager_pvc_name: prometheus-alertmanager +openshift_prometheus_alertmanager_pvc_size: 10G +openshift_prometheus_alertmanager_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_alertmanager_pvc_pv_selector: {} + +openshift_prometheus_alertbuffer_storage_type: pvc +openshift_prometheus_alertbuffer_pvc_name: prometheus-alertbuffer +openshift_prometheus_alertbuffer_pvc_size: 10G +openshift_prometheus_alertbuffer_pvc_access_modes: [ReadWriteOnce] +openshift_prometheus_alertbuffer_pvc_pv_selector: {} + +# container resources +openshift_prometheus_cpu_limit: null +openshift_prometheus_memory_limit: null +openshift_prometheus_cpu_requests: null +openshift_prometheus_memory_requests: null +openshift_prometheus_alertmanager_cpu_limit: null +openshift_prometheus_alertmanager_memory_limit: null +openshift_prometheus_alertmanager_cpu_requests: null +openshift_prometheus_alertmanager_memory_requests: null +openshift_prometheus_alertbuffer_cpu_limit: null +openshift_prometheus_alertbuffer_memory_limit: null +openshift_prometheus_alertbuffer_cpu_requests: null +openshift_prometheus_alertbuffer_memory_requests: null +openshift_prometheus_oauth_proxy_cpu_limit: null +openshift_prometheus_oauth_proxy_memory_limit: null +openshift_prometheus_oauth_proxy_cpu_requests: null +openshift_prometheus_oauth_proxy_memory_requests: null diff --git a/roles/openshift_prometheus/files/openshift_prometheus.exports b/roles/openshift_prometheus/files/openshift_prometheus.exports new file mode 100644 index 000000000..3ccedb1fd --- /dev/null +++ b/roles/openshift_prometheus/files/openshift_prometheus.exports @@ -0,0 +1,3 @@ +/exports/prometheus *(rw,no_root_squash,no_wdelay) +/exports/prometheus-alertmanager *(rw,no_root_squash,no_wdelay) +/exports/prometheus-alertbuffer *(rw,no_root_squash,no_wdelay) diff --git a/roles/openshift_prometheus/meta/main.yaml b/roles/openshift_prometheus/meta/main.yaml new file mode 100644 index 000000000..33188bb7e --- /dev/null +++ b/roles/openshift_prometheus/meta/main.yaml @@ -0,0 +1,19 @@ +--- +galaxy_info: + author: OpenShift Development <dev@lists.openshift.redhat.com> + description: Deploy OpenShift prometheus integration for the cluster + company: Red Hat, Inc. + license: license (Apache) + min_ansible_version: 2.2 + platforms: + - name: EL + versions: + - 7 + - name: Fedora + versions: + - all + categories: + - openshift +dependencies: +- { role: lib_openshift } +- { role: openshift_facts } diff --git a/roles/openshift_prometheus/tasks/create_pvs.yaml b/roles/openshift_prometheus/tasks/create_pvs.yaml new file mode 100644 index 000000000..4e79da05f --- /dev/null +++ b/roles/openshift_prometheus/tasks/create_pvs.yaml @@ -0,0 +1,36 @@ +--- +# Check for existance and then conditionally: +# - evaluate templates +# - PVs +# +# These tasks idempotently create required Prometheus PV objects. Do not +# call this file directly. This file is intended to be ran as an +# include that has a 'with_items' attached to it. Hence the use below +# of variables like "{{ item.pv_label }}" + +- name: "Check if the {{ item.pv_label }} template has been created already" + oc_obj: + namespace: "{{ openshift_prometheus_namespace }}" + state: list + kind: pv + name: "{{ item.pv_name }}" + register: prom_pv_check + +# Skip all of this if the PV already exists +- block: + - name: "Ensure the {{ item.pv_label }} template is evaluated" + template: + src: "{{ item.pv_template }}.j2" + dest: "{{ tempdir }}/templates/{{ item.pv_template }}" + + - name: "Ensure {{ item.pv_label }} is created" + oc_obj: + namespace: "{{ openshift_prometheus_namespace }}" + kind: pv + name: "{{ item.pv_name }}" + state: present + delete_after: True + files: + - "{{ tempdir }}/templates/{{ item.pv_template }}" + when: + - not prom_pv_check.results.results.0 diff --git a/roles/openshift_prometheus/tasks/install_prometheus.yaml b/roles/openshift_prometheus/tasks/install_prometheus.yaml new file mode 100644 index 000000000..93bdda3e8 --- /dev/null +++ b/roles/openshift_prometheus/tasks/install_prometheus.yaml @@ -0,0 +1,241 @@ +--- + +# namespace +- name: Add prometheus project + oc_project: + state: "{{ state }}" + name: "{{ openshift_prometheus_namespace }}" + node_selector: "{{ openshift_prometheus_node_selector | oo_selector_to_string_list() }}" + description: Prometheus + +# secrets +- name: Set alert and prometheus secrets + oc_secret: + state: "{{ state }}" + name: "{{ item }}-proxy" + namespace: "{{ openshift_prometheus_namespace }}" + contents: + - path: session_secret + data: "{{ 43 | oo_random_word }}=" + with_items: + - prometheus + - alerts + +# serviceaccount +- name: create prometheus serviceaccount + oc_serviceaccount: + state: "{{ state }}" + name: prometheus + namespace: "{{ openshift_prometheus_namespace }}" + # TODO add annotations when supproted + # annotations: + # serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' + # serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' + + secrets: + - prometheus-secrets + changed_when: no + +# TODO remove this when annotations are supported by oc_serviceaccount +- name: annotate serviceaccount + command: > + {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} + serviceaccount prometheus + serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}' + serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}' + + +# create clusterrolebinding for prometheus serviceaccount +- name: Set cluster-reader permissions for prometheus + oc_adm_policy_user: + state: "{{ state }}" + namespace: "{{ openshift_prometheus_namespace }}" + resource_kind: cluster-role + resource_name: cluster-reader + user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus" + + +###################################################################### +# NFS +# In the case that we are not running on a cloud provider, volumes must be statically provisioned + +- include: nfs.yaml + when: not (openshift_cloudprovider_kind is defined and (openshift_cloudprovider_kind == 'aws' or openshift_cloudprovider_kind == 'gce')) + + +# create prometheus and alerts services +# TODO join into 1 task with loop +- name: Create prometheus service + oc_service: + state: "{{ state }}" + name: "{{ item.name }}" + namespace: "{{ openshift_prometheus_namespace }}" + selector: + app: prometheus + labels: + name: "{{ item.name }}" + # TODO add annotations when supported + # annotations: + # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls" + ports: + - port: 443 + targetPort: 8443 + with_items: + - name: prometheus + +- name: Create alerts service + oc_service: + state: "{{ state }}" + name: "{{ item.name }}" + namespace: "{{ openshift_prometheus_namespace }}" + selector: + app: prometheus + labels: + name: "{{ item.name }}" + # TODO add annotations when supported + # annotations: + # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls" + ports: + - port: 443 + targetPort: 9443 + with_items: + - name: alerts + + +# Annotate services with secret name +# TODO remove this when annotations are supported by oc_service +- name: annotate prometheus service + command: > + {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} + service prometheus 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls' + +- name: annotate alerts service + command: > + {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }} + service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls' + +# create prometheus and alerts routes +- name: create prometheus and alerts routes + oc_route: + state: "{{ state }}" + name: "{{ item.name }}" + namespace: "{{ openshift_prometheus_namespace }}" + service_name: "{{ item.name }}" + tls_termination: reencrypt + with_items: + - name: prometheus + - name: alerts + +# Storage +- name: create prometheus pvc + oc_pvc: + namespace: "{{ openshift_prometheus_namespace }}" + name: "{{ openshift_prometheus_pvc_name }}" + access_modes: "{{ openshift_prometheus_pvc_access_modes }}" + volume_capacity: "{{ openshift_prometheus_pvc_size }}" + selector: "{{ openshift_prometheus_pvc_pv_selector }}" + +- name: create alertmanager pvc + oc_pvc: + namespace: "{{ openshift_prometheus_namespace }}" + name: "{{ openshift_prometheus_alertmanager_pvc_name }}" + access_modes: "{{ openshift_prometheus_alertmanager_pvc_access_modes }}" + volume_capacity: "{{ openshift_prometheus_alertmanager_pvc_size }}" + selector: "{{ openshift_prometheus_alertmanager_pvc_pv_selector }}" + +- name: create alertbuffer pvc + oc_pvc: + namespace: "{{ openshift_prometheus_namespace }}" + name: "{{ openshift_prometheus_alertbuffer_pvc_name }}" + access_modes: "{{ openshift_prometheus_alertbuffer_pvc_access_modes }}" + volume_capacity: "{{ openshift_prometheus_alertbuffer_pvc_size }}" + selector: "{{ openshift_prometheus_alertbuffer_pvc_pv_selector }}" + +# create prometheus deployment +- name: Set prometheus deployment template + template: + src: prometheus_deployment.j2 + dest: "{{ tempdir }}/templates/prometheus.yaml" + vars: + namespace: "{{ openshift_prometheus_namespace }}" + prom_replicas: "{{ openshift_prometheus_replicas }}" + +- name: Set prometheus deployment + oc_obj: + state: "{{ state }}" + name: "prometheus" + namespace: "{{ openshift_prometheus_namespace }}" + kind: deployment + files: + - "{{ tempdir }}/templates/prometheus.yaml" + delete_after: true + +# prometheus configmap +# Copy the additional rules file if it is defined +- name: Copy additional rules file to host + copy: + src: "{{ openshift_prometheus_additional_rules_file }}" + dest: "{{ tempdir }}/prometheus.additional.rules" + when: + - openshift_prometheus_additional_rules_file is defined + - openshift_prometheus_additional_rules_file is not none + - openshift_prometheus_additional_rules_file | trim | length > 0 + +- stat: + path: "{{ tempdir }}/prometheus.additional.rules" + register: additional_rules_stat + +# The kubernetes version impacts the prometheus scraping endpoint +# so gathering it before constructing the configmap +- name: get oc version + oc_version: + register: oc_version + +- set_fact: + kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}" + +- template: + src: prometheus.yml.j2 + dest: "{{ tempdir }}/prometheus.yml" + changed_when: no + +- template: + src: prometheus.rules.j2 + dest: "{{ tempdir }}/prometheus.rules" + changed_when: no + +# In prometheus configmap create "additional.rules" section if file exists +- name: Set prometheus configmap + oc_configmap: + state: "{{ state }}" + name: "prometheus" + namespace: "{{ openshift_prometheus_namespace }}" + from_file: + prometheus.rules: "{{ tempdir }}/prometheus.rules" + prometheus.additional.rules: "{{ tempdir }}/prometheus.additional.rules" + prometheus.yml: "{{ tempdir }}/prometheus.yml" + when: additional_rules_stat.stat.exists == True + +- name: Set prometheus configmap + oc_configmap: + state: "{{ state }}" + name: "prometheus" + namespace: "{{ openshift_prometheus_namespace }}" + from_file: + prometheus.rules: "{{ tempdir }}/prometheus.rules" + prometheus.yml: "{{ tempdir }}/prometheus.yml" + when: additional_rules_stat.stat.exists == False + +# alertmanager configmap +- template: + src: alertmanager.yml.j2 + dest: "{{ tempdir }}/alertmanager.yml" + changed_when: no + +- name: Set alertmanager configmap + oc_configmap: + state: "{{ state }}" + name: "prometheus-alerts" + namespace: "{{ openshift_prometheus_namespace }}" + from_file: + alertmanager.yml: "{{ tempdir }}/alertmanager.yml" diff --git a/roles/openshift_prometheus/tasks/main.yaml b/roles/openshift_prometheus/tasks/main.yaml new file mode 100644 index 000000000..523a64334 --- /dev/null +++ b/roles/openshift_prometheus/tasks/main.yaml @@ -0,0 +1,26 @@ +--- + +- name: Create temp directory for doing work in on target + command: mktemp -td openshift-prometheus-ansible-XXXXXX + register: mktemp + changed_when: False + +- set_fact: + tempdir: "{{ mktemp.stdout }}" + +- name: Create templates subdirectory + file: + state: directory + path: "{{ tempdir }}/templates" + mode: 0755 + changed_when: False + +- include: install_prometheus.yaml + vars: + state: "{{ openshift_prometheus_state }}" + +- name: Delete temp directory + file: + name: "{{ tempdir }}" + state: absent + changed_when: False diff --git a/roles/openshift_prometheus/tasks/nfs.yaml b/roles/openshift_prometheus/tasks/nfs.yaml new file mode 100644 index 000000000..0b45f2cee --- /dev/null +++ b/roles/openshift_prometheus/tasks/nfs.yaml @@ -0,0 +1,44 @@ +--- +# Tasks to statically provision NFS volumes +# Include if not using dynamic volume provisioning +- name: Ensure the /exports/ directory exists + file: + path: /exports/ + state: directory + mode: 0755 + owner: root + group: root + +- name: Ensure the prom-pv0X export directories exist + file: + path: "/exports/{{ item }}" + state: directory + mode: 0777 + owner: nfsnobody + group: nfsnobody + with_items: "{{ openshift_prometheus_pv_exports }}" + +- name: Ensure the NFS exports for Prometheus PVs exist + copy: + src: openshift_prometheus.exports + dest: /etc/exports.d/openshift_prometheus.exports + register: nfs_exports_updated + +- name: Ensure the NFS export table is refreshed if exports were added + command: exportfs -ar + when: + - nfs_exports_updated.changed + + +###################################################################### +# Create the required Prometheus PVs. Check out these online docs if you +# need a refresher on includes looping with items: +# * http://docs.ansible.com/ansible/playbooks_loops.html#loops-and-includes-in-2-0 +# * http://stackoverflow.com/a/35128533 +# +# TODO: Handle the case where a PV template is updated in +# openshift-ansible and the change needs to be landed on the managed +# cluster. + +- include: create_pvs.yaml + with_items: "{{ openshift_prometheus_pv_data }}" diff --git a/roles/openshift_prometheus/templates/alertmanager.yml.j2 b/roles/openshift_prometheus/templates/alertmanager.yml.j2 new file mode 100644 index 000000000..6c432a3d0 --- /dev/null +++ b/roles/openshift_prometheus/templates/alertmanager.yml.j2 @@ -0,0 +1,20 @@ +global: + +# The root route on which each incoming alert enters. +route: + # default route if none match + receiver: alert-buffer-wh + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # TODO: + group_by: [] + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + +receivers: +- name: alert-buffer-wh + webhook_configs: + - url: http://localhost:9099/topics/alerts diff --git a/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 new file mode 100644 index 000000000..55a5e19c3 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-alertbuffer + labels: + storage: prometheus-alertbuffer +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus-alertbuffer + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 new file mode 100644 index 000000000..4ee518735 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-alertmanager + labels: + storage: prometheus-alertmanager +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus-alertmanager + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 new file mode 100644 index 000000000..933bf0f60 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus + labels: + storage: prometheus +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prometheus.rules.j2 b/roles/openshift_prometheus/templates/prometheus.rules.j2 new file mode 100644 index 000000000..e861dc127 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.rules.j2 @@ -0,0 +1,4 @@ +groups: +- name: example-rules + interval: 30s # defaults to global interval + rules: diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2 new file mode 100644 index 000000000..63430f834 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,174 @@ +rule_files: + - 'prometheus.rules' +{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %} + - 'prometheus.additional.rules' +{% endif %} + + + +# A scrape configuration for running Prometheus on a Kubernetes cluster. +# This uses separate scrape configs for cluster components (i.e. API server, node) +# and services to allow each to use different authentication configs. +# +# Kubernetes labels will be added as Prometheus labels on metrics via the +# `labelmap` relabeling action. + +# Scrape config for API servers. +# +# Kubernetes exposes API servers as endpoints to the default/kubernetes +# service so this uses `endpoints` role and uses relabelling to only keep +# the endpoints associated with the default/kubernetes service using the +# default named port `https`. This works for single API server deployments as +# well as HA API server deployments. +scrape_configs: +- job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + +# Scrape config for nodes. +# +# Each node exposes a /metrics endpoint that contains operational metrics for +# the Kubelet and other components. +- job_name: 'kubernetes-nodes' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for controllers. +# +# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for +# the controllers. +# +# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via +# endpoints. +- job_name: 'kubernetes-controllers' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + # Keep only the default/kubernetes service endpoints for the https port, and then + # set the port to 8444. This is the default configuration for the controllers on OpenShift + # masters. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - source_labels: [__address__] + action: replace + target_label: __address__ + regex: (.+)(?::\d+) + replacement: $1:8444 + +# Scrape config for cAdvisor. +# +# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that +# reports container metrics for each running pod. Scrape those by default. +- job_name: 'kubernetes-cadvisor' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +{% if kubernetes_version | float() >= 1.7 | float() %} + metrics_path: /metrics/cadvisor +{% else %} + metrics_path: /metrics +{% endif %} + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for service endpoints. +# +# The relabeling allows the actual service scrape endpoint to be configured +# via the following annotations: +# +# * `prometheus.io/scrape`: Only scrape services that have a value of `true` +# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# to set this to `https` & most likely set the `tls_config` of the scrape config. +# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# * `prometheus.io/port`: If the metrics are exposed on a different port to the +# service then set this appropriately. +- job_name: 'kubernetes-service-endpoints' + + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # TODO: this should be per target + insecure_skip_verify: true + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] + action: replace + target_label: __basic_auth_username__ + regex: (.+) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] + action: replace + target_label: __basic_auth_password__ + regex: (.+) + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "localhost:9093" diff --git a/roles/openshift_prometheus/templates/prometheus_deployment.j2 b/roles/openshift_prometheus/templates/prometheus_deployment.j2 new file mode 100644 index 000000000..98c117f19 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus_deployment.j2 @@ -0,0 +1,240 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: prometheus + namespace: {{ namespace }} + labels: + app: prometheus +spec: + replicas: {{ prom_replicas|default(1) }} + selector: + provider: openshift + matchLabels: + app: prometheus + template: + metadata: + name: prometheus + labels: + app: prometheus + spec: + serviceAccountName: prometheus +{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %} + nodeSelector: +{% for key, value in openshift_prometheus_node_selector.iteritems() %} + {{key}}: "{{value}}" +{% endfor %} +{% endif %} + containers: + # Deploy Prometheus behind an oauth proxy + - name: prom-proxy + image: "{{ openshift_prometheus_image_proxy }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_memory_requests_limit_proxy is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} + ports: + - containerPort: 8443 + name: web + args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9090 + - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + - -skip-auth-regex=^/metrics + volumeMounts: + - mountPath: /etc/tls/private + name: prometheus-tls + - mountPath: /etc/proxy/secrets + name: prometheus-secrets + - mountPath: /prometheus + name: prometheus-data + + - name: prometheus + args: + - --storage.tsdb.retention=6h + - --config.file=/etc/prometheus/prometheus.yml + - --web.listen-address=localhost:9090 + image: "{{ openshift_prometheus_image_prometheus }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_memory_requests is defined and openshift_prometheus_memory_requests is not none %} + memory: "{{openshift_prometheus_memory_requests}}" +{% endif %} +{% if openshift_prometheus_cpu_requests is defined and openshift_prometheus_cpu_requests is not none %} + cpu: "{{openshift_prometheus_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_memory_limit is defined and openshift_prometheus_memory_limit is not none %} + memory: "{{ openshift_prometheus_memory_limit }}" +{% endif %} +{% if openshift_prometheus_cpu_limit is defined and openshift_prometheus_cpu_limit is not none %} + cpu: "{{openshift_prometheus_cpu_limit}}" +{% endif %} + + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + - mountPath: /prometheus + name: prometheus-data + + # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy + - name: alerts-proxy + image: "{{ openshift_prometheus_image_proxy }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} + ports: + - containerPort: 9443 + name: web + args: + - -provider=openshift + - -https-address=:9443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9099 + - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + volumeMounts: + - mountPath: /etc/tls/private + name: alerts-tls + - mountPath: /etc/proxy/secrets + name: alerts-secrets + + - name: alert-buffer + args: + - --storage-path=/alert-buffer/messages.db + image: "{{ openshift_prometheus_image_alertbuffer }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_alertbuffer_memory_requests is defined and openshift_prometheus_alertbuffer_memory_requests is not none %} + memory: "{{openshift_prometheus_alertbuffer_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_requests is defined and openshift_prometheus_alertbuffer_cpu_requests is not none %} + cpu: "{{openshift_prometheus_alertbuffer_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_alertbuffer_memory_limit is defined and openshift_prometheus_alertbuffer_memory_limit is not none %} + memory: "{{openshift_prometheus_alertbuffer_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_limit is defined and openshift_prometheus_alertbuffer_cpu_limit is not none %} + cpu: "{{openshift_prometheus_alertbuffer_cpu_limit}}" +{% endif %} + volumeMounts: + - mountPath: /alert-buffer + name: alert-buffer-data + ports: + - containerPort: 9099 + name: alert-buf + + - name: alertmanager + args: + - -config.file=/etc/alertmanager/alertmanager.yml + image: "{{ openshift_prometheus_image_alertmanager }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_alertmanager_memory_requests is defined and openshift_prometheus_alertmanager_memory_requests is not none %} + memory: "{{openshift_prometheus_alertmanager_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_requests is defined and openshift_prometheus_alertmanager_cpu_requests is not none %} + cpu: "{{openshift_prometheus_alertmanager_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_alertmanager_memory_limit is defined and openshift_prometheus_alertmanager_memory_limit is not none %} + memory: "{{openshift_prometheus_alertmanager_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_limit is defined and openshift_prometheus_alertmanager_cpu_limit is not none %} + cpu: "{{openshift_prometheus_alertmanager_cpu_limit}}" +{% endif %} + ports: + - containerPort: 9093 + name: web + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-config + - mountPath: /alertmanager + name: alertmanager-data + + restartPolicy: Always + volumes: + - name: prometheus-config + configMap: + defaultMode: 420 + name: prometheus + - name: prometheus-secrets + secret: + secretName: prometheus-proxy + - name: prometheus-tls + secret: + secretName: prometheus-tls + - name: prometheus-data +{% if openshift_prometheus_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_pvc_name }} +{% else %} + emptydir: {} +{% endif %} + - name: alertmanager-config + configMap: + defaultMode: 420 + name: prometheus-alerts + - name: alerts-secrets + secret: + secretName: alerts-proxy + - name: alerts-tls + secret: + secretName: prometheus-alerts-tls + - name: alertmanager-data +{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_alertmanager_pvc_name }} +{% else %} + emptydir: {} +{% endif %} + - name: alert-buffer-data +{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_alertbuffer_pvc_name }} +{% else %} + emptydir: {} +{% endif %} diff --git a/roles/openshift_prometheus/tests/inventory b/roles/openshift_prometheus/tests/inventory new file mode 100644 index 000000000..878877b07 --- /dev/null +++ b/roles/openshift_prometheus/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/roles/openshift_prometheus/tests/test.yaml b/roles/openshift_prometheus/tests/test.yaml new file mode 100644 index 000000000..37baf573c --- /dev/null +++ b/roles/openshift_prometheus/tests/test.yaml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - openshift_prometheus |