summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r--roles/os_zabbix/tasks/main.yml40
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml198
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml22
3 files changed, 238 insertions, 22 deletions
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index 59c89bb02..d0b307a3d 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -8,15 +8,35 @@
register: templates
- include_vars: template_heartbeat.yml
+ tags:
+ - heartbeat
- include_vars: template_os_linux.yml
+ tags:
+ - linux
- include_vars: template_docker.yml
+ tags:
+ - docker
- include_vars: template_openshift_master.yml
+ tags:
+ - openshift_master
- include_vars: template_openshift_node.yml
+ tags:
+ - openshift_node
- include_vars: template_ops_tools.yml
+ tags:
+ - ops_tools
- include_vars: template_app_zabbix_server.yml
+ tags:
+ - zabbix_server
- include_vars: template_app_zabbix_agent.yml
+ tags:
+ - zabbix_agent
- include_vars: template_performance_copilot.yml
+ tags:
+ - pcp
- include_vars: template_aws.yml
+ tags:
+ - aws
- name: Include Template Heartbeat
include: ../../lib_zabbix/tasks/create_template.yml
@@ -25,6 +45,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - heartbeat
- name: Include Template os_linux
include: ../../lib_zabbix/tasks/create_template.yml
@@ -33,6 +55,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - linux
- name: Include Template docker
include: ../../lib_zabbix/tasks/create_template.yml
@@ -41,6 +65,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - docker
- name: Include Template Openshift Master
include: ../../lib_zabbix/tasks/create_template.yml
@@ -49,6 +75,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - openshift_master
- name: Include Template Openshift Node
include: ../../lib_zabbix/tasks/create_template.yml
@@ -57,6 +85,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - openshift_node
- name: Include Template Ops Tools
include: ../../lib_zabbix/tasks/create_template.yml
@@ -65,6 +95,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - ops_tools
- name: Include Template App Zabbix Server
include: ../../lib_zabbix/tasks/create_template.yml
@@ -73,6 +105,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - zabbix_server
- name: Include Template App Zabbix Agent
include: ../../lib_zabbix/tasks/create_template.yml
@@ -81,6 +115,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - zabbix_agent
- name: Include Template Performance Copilot
include: ../../lib_zabbix/tasks/create_template.yml
@@ -89,6 +125,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - pcp
- name: Include Template AWS
include: ../../lib_zabbix/tasks/create_template.yml
@@ -97,3 +135,5 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - aws
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 174486e15..514d6fd24 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -7,12 +7,24 @@ g_template_openshift_master:
- Openshift Master
key: create_app
+ - key: openshift.master.registry.healthz
+ description: "Shows the health status of the cluster's docker registry"
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.master.process.count
description: Shows number of master processes running
type: int
applications:
- Openshift Master
+ - key: openshift.master.api.ping
+ description: "Verify that the Openshift API is up"
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.master.api.healthz
description: "Checks the healthz check of the master's api: https://master_host/healthz"
type: int
@@ -44,12 +56,48 @@ g_template_openshift_master:
applications:
- Openshift Master
+ - key: openshift.master.node.count
+ description: Shows the total number of nodes found in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.project.count
description: Shows number of projects on a cluster
type: int
applications:
- Openshift Master
+ - key: openshift.master.pv.total.count
+ description: Total number of Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.available.count
+ description: Total number of Available Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.released.count
+ description: Total number of Released Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.bound.count
+ description: Total number of Bound Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.failed.count
+ description: Total number of Failed Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.master.etcd.create.success
description: Show number of successful create actions
type: int
@@ -122,17 +170,67 @@ g_template_openshift_master:
applications:
- Openshift Etcd
- ztriggers:
- - name: 'Application creation has failed on {HOST.NAME}'
- expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- priority: avg
+ - key: openshift.master.metric.ping
+ description: "This check verifies that the https://master/metrics check is alive and communicating properly."
+ type: int
+ applications:
+ - Openshift Master Metrics
- - name: 'Openshift Master API health check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+ ztriggers:
- name: 'Openshift Master process not running on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
@@ -143,6 +241,16 @@ g_template_openshift_master:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: high
+ - name: 'Low number of etcd watchers on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: avg
+
+ - name: 'Etcd ping failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: high
+
- name: 'Number of users for Openshift Master on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
@@ -153,12 +261,72 @@ g_template_openshift_master:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: info
- - name: 'Low number of etcd watchers on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ # Put triggers that depend on other triggers here (deps must be created first)
+ - name: 'Application creation has failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
priority: avg
- - name: 'Etcd ping failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ - name: 'Openshift Master API health check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
priority: high
+
+ - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: avg
+
+ - name: 'Docker Registry check failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ zgraphs:
+ - name: Openshift Master API Server Latency Pods LIST Quantiles
+ width: 900
+ height: 200
+ graph_items:
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5
+ color: red
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9
+ color: blue
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99
+ color: orange
+
+ - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles
+ width: 900
+ height: 200
+ graph_items:
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
+ color: red
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
+ color: blue
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
+ color: orange
+
+ - name: Openshift Master Scheduler End to End Latency Quantiles
+ width: 900
+ height: 200
+ graph_items:
+ - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
+ color: red
+ - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
+ color: blue
+ - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
+ color: orange
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 04665be62..c6e557f12 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -258,26 +258,34 @@ g_template_os_linux:
- Network
ztriggerprototypes:
- - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
- expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
- priority: warn
-
- name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: high
- - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
- expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
+ # This has a dependency on the previous trigger
+ # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: warn
+ dependencies:
+ - 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
- name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: high
+ # This has a dependency on the previous trigger
+ # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+ dependencies:
+ - 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
+
ztriggers:
- name: 'Too many TOTAL processes on {HOST.NAME}'
expression: '{Template OS Linux:proc.nprocs.last()}>5000'