summary refs log tree commit diff stats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
author Joel Diaz <jdiaz@redhat.com> 2016-04-08 13:48:13 -0400
committer Joel Diaz <jdiaz@redhat.com> 2016-04-15 09:35:54 -0400
commit 3681ab5fb2a39ccb06024e6ad514ad50df21f9d2 (patch)
tree 5d1dd7da5404f431a60241a419155204a7784dcc /roles/os_zabbix
parent 51e390459d7eda90c44d9ffd7f1199a3590beac1 (diff)
downloadopenshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.tar.gz
openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.tar.bz2
openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.tar.xz
openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.zip
cleanup roles after roles move to openshift-tools
Also removing inventory/multi_inventory*. Things left behind unchanged even though they were copied: playbooks/adhoc/*, roles/dns, roles/kube_nfs_volumes, roles/os_update_latest.
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r--roles/os_zabbix/README.md40
-rw-r--r--roles/os_zabbix/defaults/main.yml1
-rw-r--r--roles/os_zabbix/handlers/main.yml1
-rw-r--r--roles/os_zabbix/meta/main.yml9
-rw-r--r--roles/os_zabbix/tasks/main.yml166
-rw-r--r--roles/os_zabbix/vars/main.yml1
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_agent.yml23
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_server.yml412
-rw-r--r--roles/os_zabbix/vars/template_aws.yml25
-rw-r--r--roles/os_zabbix/vars/template_config_loop.yml14
-rw-r--r--roles/os_zabbix/vars/template_docker.yml116
-rw-r--r--roles/os_zabbix/vars/template_heartbeat.yml18
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml458
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml70
-rw-r--r--roles/os_zabbix/vars/template_ops_tools.yml54
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml314
-rw-r--r--roles/os_zabbix/vars/template_performance_copilot.yml14
-rw-r--r--roles/os_zabbix/vars/template_zagg_server.yml46
18 files changed, 0 insertions, 1782 deletions
diff --git a/roles/os_zabbix/README.md b/roles/os_zabbix/README.md
deleted file mode 100644
index ac3dc2833..000000000
--- a/roles/os_zabbix/README.md
+++ /dev/null
@@ -1,40 +0,0 @@
-os_zabbix
-=========
-
-Automate zabbix tasks.
-
-Requirements
-------------
-
-This requires the openshift_tools rpm to be installed for the zbxapi.py library. It can be found here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now.
-
-Role Variables
---------------
-
-zab_server
-zab_username
-zab_password
-
-Dependencies
-------------
-
-This depends on the zbxapi.py library located here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now.
-
-Example Playbook
-----------------
-
- - zbx_host:
- server: zab_server
- user: zab_user
- password: zab_password
- name: 'myhost'
-
-License
--------
-
-ASL 2.0
-
-Author Information
-------------------
-
-OpenShift operations, Red Hat, Inc
diff --git a/roles/os_zabbix/defaults/main.yml b/roles/os_zabbix/defaults/main.yml
deleted file mode 100644
index ed97d539c..000000000
--- a/roles/os_zabbix/defaults/main.yml
+++ /dev/null
@@ -1 +0,0 @@
----
diff --git a/roles/os_zabbix/handlers/main.yml b/roles/os_zabbix/handlers/main.yml
deleted file mode 100644
index ed97d539c..000000000
--- a/roles/os_zabbix/handlers/main.yml
+++ /dev/null
@@ -1 +0,0 @@
----
diff --git a/roles/os_zabbix/meta/main.yml b/roles/os_zabbix/meta/main.yml
deleted file mode 100644
index 360f5aad2..000000000
--- a/roles/os_zabbix/meta/main.yml
+++ /dev/null
@@ -1,9 +0,0 @@
----
-galaxy_info:
- author: OpenShift
- description: ZabbixAPI
- company: Red Hat, Inc
- license: ASL 2.0
- min_ansible_version: 1.2
-dependencies:
-- lib_zabbix
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
deleted file mode 100644
index 1c8d88854..000000000
--- a/roles/os_zabbix/tasks/main.yml
+++ /dev/null
@@ -1,166 +0,0 @@
----
-- name: Main List all templates
- zbx_template:
- zbx_server: "{{ ozb_server }}"
- zbx_user: "{{ ozb_user }}"
- zbx_password: "{{ ozb_password }}"
- state: list
- register: templates
-
-- include_vars: template_heartbeat.yml
- tags:
- - heartbeat
-- include_vars: template_os_linux.yml
- tags:
- - linux
-- include_vars: template_docker.yml
- tags:
- - docker
-- include_vars: template_openshift_master.yml
- tags:
- - openshift_master
-- include_vars: template_openshift_node.yml
- tags:
- - openshift_node
-- include_vars: template_ops_tools.yml
- tags:
- - ops_tools
-- include_vars: template_app_zabbix_server.yml
- tags:
- - zabbix_server
-- include_vars: template_app_zabbix_agent.yml
- tags:
- - zabbix_agent
-- include_vars: template_performance_copilot.yml
- tags:
- - pcp
-- include_vars: template_aws.yml
- tags:
- - aws
-- include_vars: template_zagg_server.yml
- tags:
- - zagg_server
-
-- include_vars: template_config_loop.yml
- tags:
- - config_loop
-
-- name: Include Template Heartbeat
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_heartbeat }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - heartbeat
-
-- name: Include Template os_linux
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_os_linux }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - linux
-
-- name: Include Template docker
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_docker }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - docker
-
-- name: Include Template Openshift Master
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_openshift_master }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - openshift_master
-
-- name: Include Template Openshift Node
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_openshift_node }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - openshift_node
-
-- name: Include Template Ops Tools
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_ops_tools }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - ops_tools
-
-- name: Include Template App Zabbix Server
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_app_zabbix_server }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - zabbix_server
-
-- name: Include Template App Zabbix Agent
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_app_zabbix_agent }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - zabbix_agent
-
-- name: Include Template Performance Copilot
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_performance_copilot }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - pcp
-
-- name: Include Template AWS
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_aws }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - aws
-
-- name: Include Template Zagg Server
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_zagg_server }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - zagg_server
-
-- name: Include Template Config Loop
- include: ../../lib_zabbix/tasks/create_template.yml
- vars:
- template: "{{ g_template_config_loop }}"
- server: "{{ ozb_server }}"
- user: "{{ ozb_user }}"
- password: "{{ ozb_password }}"
- tags:
- - config_loop
diff --git a/roles/os_zabbix/vars/main.yml b/roles/os_zabbix/vars/main.yml
deleted file mode 100644
index ed97d539c..000000000
--- a/roles/os_zabbix/vars/main.yml
+++ /dev/null
@@ -1 +0,0 @@
----
diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
deleted file mode 100644
index d636d4822..000000000
--- a/roles/os_zabbix/vars/template_app_zabbix_agent.yml
+++ /dev/null
@@ -1,23 +0,0 @@
----
-g_template_app_zabbix_agent:
- name: Template App Zabbix Agent
- zitems:
- - key: agent.hostname
- applications:
- - Zabbix agent
- value_type: character
- zabbix_type: agent
-
- - key: agent.ping
- applications:
- - Zabbix agent
- description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check.
- value_type: int
- zabbix_type: agent
-
- ztriggers:
- - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes'
- description: Zabbix agent is unreachable for 15 minutes.
- expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1'
- priority: high
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc
diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml
deleted file mode 100644
index 43517113b..000000000
--- a/roles/os_zabbix/vars/template_app_zabbix_server.yml
+++ /dev/null
@@ -1,412 +0,0 @@
----
-g_template_app_zabbix_server:
- name: Template App Zabbix Server
- zitems:
- - key: housekeeper_creates
- applications:
- - Zabbix server
- description: A simple count of the number of partition creates output by the housekeeper script.
- units: ''
- value_type: int
- zabbix_type: internal
-
- - key: housekeeper_drops
- applications:
- - Zabbix server
- description: A simple count of the number of partition drops output by the housekeeper script.
- units: ''
- value_type: int
- zabbix_type: internal
-
- - key: housekeeper_errors
- applications:
- - Zabbix server
- description: A simple count of the number of errors output by the housekeeper script.
- units: ''
- value_type: int
- zabbix_type: internal
-
- - key: housekeeper_total
- applications:
- - Zabbix server
- description: A simple count of the total number of lines output by the housekeeper
- script.
- units: ''
- value_type: int
- zabbix_type: internal
-
- - key: zabbix[process,alerter,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,configuration syncer,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,db watchdog,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,discoverer,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,escalator,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,history syncer,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,housekeeper,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,http poller,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,icmp pinger,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,ipmi poller,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,java poller,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,node watcher,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,poller,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,proxy poller,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,self-monitoring,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,snmp trapper,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,timer,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,trapper,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[process,unreachable poller,avg,busy]
- applications:
- - Zabbix server
- description: ''
- units: '%'
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[queue,10m]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: int
- zabbix_type: internal
- interval: 600
-
- - key: zabbix[queue]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: int
- zabbix_type: internal
- interval: 600
-
- - key: zabbix[rcache,buffer,pfree]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[wcache,history,pfree]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[wcache,text,pfree]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[wcache,trend,pfree]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: float
- zabbix_type: internal
-
- - key: zabbix[wcache,values]
- applications:
- - Zabbix server
- description: ''
- units: ''
- value_type: float
- zabbix_type: internal
- delta: 1 # speed per second
-
- ztriggers:
- - description: "There has been unexpected output while running the housekeeping script\
- \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\
- \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\
- \ for more details."
- expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}'
- name: Unexpected output in Zabbix DB Housekeeping
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc
-
- - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details.
- expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0'
- name: Errors during Zabbix DB Housekeeping
- priority: high
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75'
- name: Zabbix alerter processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75'
- name: Zabbix configuration syncer processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75'
- name: Zabbix db watchdog processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75'
- name: Zabbix discoverer processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75'
- name: Zabbix escalator processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75'
- name: Zabbix history syncer processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75'
- name: Zabbix housekeeper processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75'
- name: Zabbix http poller processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75'
- name: Zabbix icmp pinger processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75'
- name: Zabbix ipmi poller processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75'
- name: Zabbix java poller processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75'
- name: Zabbix node watcher processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75'
- name: Zabbix poller processes more than 75% busy
- priority: high
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75'
- name: Zabbix proxy poller processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75'
- name: Zabbix self-monitoring processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75'
- name: Zabbix snmp trapper processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: Timer processes usually are busy because they have to process time
- based trigger functions
- expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75'
- name: Zabbix timer processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75'
- name: Zabbix trapper processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75'
- name: Zabbix unreachable poller processes more than 75% busy
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
-
- - description: "This alert generally indicates a performance problem or a problem\
- \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\
- \ is Administration > Queue. Be sure to check the general view and the per-proxy\
- \ view."
- expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000'
- name: More than 1000 items having missing data for more than 10 minutes
- priority: high
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc
-
- - description: Consider increasing CacheSize in the zabbix_server.conf configuration
- file
- expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5'
- name: Less than 5% free in the configuration cache
- priority: info
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25'
- name: Less than 25% free in the history cache
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25'
- name: Less than 25% free in the text history cache
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
-
- - description: ''
- expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25'
- name: Less than 25% free in the trends cache
- priority: avg
- url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
diff --git a/roles/os_zabbix/vars/template_aws.yml b/roles/os_zabbix/vars/template_aws.yml
deleted file mode 100644
index 57832a3fe..000000000
--- a/roles/os_zabbix/vars/template_aws.yml
+++ /dev/null
@@ -1,25 +0,0 @@
----
-g_template_aws:
- name: Template AWS
- zdiscoveryrules:
- - name: disc.aws
- key: disc.aws
- lifetime: 14
- description: "Dynamically register AWS bucket info"
-
- zitemprototypes:
- - discoveryrule_key: disc.aws
- name: "S3 bucket size (GB) [{#S3_BUCKET}]"
- key: "disc.aws.size[{#S3_BUCKET}]"
- value_type: int
- description: "Size of S3 bucket"
- applications:
- - AWS
-
- - discoveryrule_key: disc.aws
- name: "S3 bucket object count [{#S3_BUCKET}]"
- key: "disc.aws.objects[{#S3_BUCKET}]"
- value_type: int
- description: "Objects in S3 bucket"
- applications:
- - AWS
diff --git a/roles/os_zabbix/vars/template_config_loop.yml b/roles/os_zabbix/vars/template_config_loop.yml
deleted file mode 100644
index 823da1868..000000000
--- a/roles/os_zabbix/vars/template_config_loop.yml
+++ /dev/null
@@ -1,14 +0,0 @@
----
-g_template_config_loop:
- name: Template Config Loop
- zitems:
- - key: config_loop.run.exit_code
- applications:
- - Config Loop
- value_type: int
-
- ztriggers:
- - name: 'config_loop.run.exit_code not zero on {HOST.NAME}'
- expression: '{Template Config Loop:config_loop.run.exit_code.min(#2)}>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_config_loop.asciidoc'
- priority: average
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
deleted file mode 100644
index dd13e76f7..000000000
--- a/roles/os_zabbix/vars/template_docker.yml
+++ /dev/null
@@ -1,116 +0,0 @@
----
-g_template_docker:
- name: Template Docker
- zitems:
- - key: docker.ping
- applications:
- - Docker Daemon
- value_type: int
-
- - key: docker.info_elapsed_ms
- applications:
- - Docker Daemon
- value_type: int
-
- - key: docker.container.dns.resolution
- applications:
- - Docker Daemon
- value_type: int
-
- - key: docker.container.existing.dns.resolution.failed
- applications:
- - Docker Daemon
- value_type: int
-
- - key: docker.storage.is_loopback
- applications:
- - Docker Storage
- value_type: int
-
- - key: docker.storage.data.space.total
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.data.space.used
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.data.space.available
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.data.space.percent_available
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.metadata.space.total
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.metadata.space.used
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.metadata.space.available
- applications:
- - Docker Storage
- value_type: float
-
- - key: docker.storage.metadata.space.percent_available
- applications:
- - Docker Storage
- value_type: float
- ztriggers:
- - name: 'docker.ping failed on {HOST.NAME}'
- expression: '{Template Docker:docker.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
- priority: high
-
- # Re-enable for OpenShift 3.1.1 (https://bugzilla.redhat.com/show_bug.cgi?id=1292971#c6)
- - name: 'docker.container.dns.resolution failed on {HOST.NAME}'
- expression: '{Template Docker:docker.container.dns.resolution.min(#3)}>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc'
- priority: average
- status: disabled
-
- - name: 'docker.container.existing.dns.resolution.failed on {HOST.NAME}'
- expression: '{Template Docker:docker.container.existing.dns.resolution.failed.min(#3)}>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc'
- priority: average
-
- - name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
- expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
- priority: high
-
- - name: 'Critically low docker storage data space on {HOST.NAME}'
- expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
- priority: high
-
- - name: 'Critically low docker storage metadata space on {HOST.NAME}'
- expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
- priority: high
-
- # Put triggers that depend on other triggers here (deps must be created first)
- - name: 'Low docker storage data space on {HOST.NAME}'
- expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
- dependencies:
- - 'Critically low docker storage data space on {HOST.NAME}'
- priority: average
-
- - name: 'Low docker storage metadata space on {HOST.NAME}'
- expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
- dependencies:
- - 'Critically low docker storage metadata space on {HOST.NAME}'
- priority: average
-
diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml
deleted file mode 100644
index ec953c79b..000000000
--- a/roles/os_zabbix/vars/template_heartbeat.yml
+++ /dev/null
@@ -1,18 +0,0 @@
----
-g_template_heartbeat:
- name: Template Heartbeat
- zitems:
- - name: Heartbeat Ping
- applications:
- - Heartbeat
- key: heartbeat.ping
- ztriggers:
- - name: 'Heartbeat.ping has failed on {HOST.NAME}'
- expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1'
- priority: avg
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'
-
- - name: 'Heartbeat.ping has failed (60 min) on {HOST.NAME}'
- expression: '{Template Heartbeat:heartbeat.ping.nodata(60m)}=1'
- priority: high
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
deleted file mode 100644
index a38db9f65..000000000
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ /dev/null
@@ -1,458 +0,0 @@
----
-g_template_openshift_master:
- name: Template Openshift Master
- zitems:
- - name: openshift.master.app.create
- applications:
- - Openshift Master
- key: openshift.master.app.create
-
- - key: openshift.master.app.build.create
- description: "check the app create with a build process"
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.app.create.time
- description: "check the time it takes app create with a build process"
- value_type: float
- applications:
- - Openshift Master
-
- - key: openshift.master.app.build.time
- description: "check the time it takes app build"
- value_type: float
- applications:
- - Openshift Master
-
- - key: openshift.master.process.count
- description: Shows number of master processes running
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.api.ping
- description: "Verify that the Openshift API is up (uses the cluster API URL)"
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.local.api.ping
- description: "Verify that the Openshift API is up on the host (uses https://127.0.0.1 as the API URL)"
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.api.healthz
- description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz"
- value_type: int
- data_type: bool
- applications:
- - Openshift Master
-
- - key: openshift.master.local.api.healthz
- description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz"
- value_type: int
- data_type: bool
- applications:
- - Openshift Master
-
- - key: openshift.master.user.count
- description: Shows number of users in a cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pod.running.count
- description: Shows number of pods running
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pod.user.running.count
- description: Shows number of user pods running (non infrastructure pods)
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pod.total.count
- description: Shows total number of pods (running and non running)
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.node.count
- description: Shows the total number of nodes found in the Openshift Cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.project.count
- description: Shows number of projects on a cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.space.total
- description: Shows the total space of pv
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.space.available
- description: Shows the available space of pv
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.total.count
- description: Total number of Persistent Volumes in the Openshift Cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.available.count
- description: Total number of Available Persistent Volumes in the Openshift Cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.released.count
- description: Total number of Released Persistent Volumes in the Openshift Cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.bound.count
- description: Total number of Bound Persistent Volumes in the Openshift Cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.pv.failed.count
- description: Total number of Failed Persistent Volumes in the Openshift Cluster
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.skydns.port.open
- description: State of the SkyDNS port open and listening
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.skydns.query
- description: SkyDNS can be queried or not
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.etcd.create.success
- description: Show number of successful create actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.create.fail
- description: Show number of failed create actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.delete.success
- description: Show number of successful delete actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.delete.fail
- description: Show number of failed delete actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.get.success
- description: Show number of successful get actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.get.fail
- description: Show number of failed get actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.set.success
- description: Show number of successful set actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.set.fail
- description: Show number of failed set actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.update.success
- description: Show number of successful update actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.update.fail
- description: Show number of failed update actions
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.watchers
- description: Show number of etcd watchers
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.etcd.ping
- description: etcd ping
- value_type: int
- applications:
- - Openshift Etcd
-
- - key: openshift.master.metric.ping
- description: "This check verifies that the https://master/metrics check is alive and communicating properly."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.nodesnotready.count
- description: "This check shows how many nodes in a cluster are in NotReady state."
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.nodesnotschedulable.count
- description: "This check shows how many nodes in a cluster are not schedulable."
- value_type: int
- applications:
- - Openshift Master
-
- - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 50% of the pod operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 90% of the pod operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 99% of the pod operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 50% of the pod WATCHLIST operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 90% of the pod WATCHLIST operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 99% of the pod WATCHLIST operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 50% of the end to end scheduling operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 90% of the end to end scheduling operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
- description: "Value from https://master/metrics. This is the time, in milliseconds, that 99% of the end to end scheduling operations have taken to complete."
- value_type: int
- applications:
- - Openshift Master Metrics
-
- zdiscoveryrules:
- - name: disc.pv
- key: disc.pv
- lifetime: 1
- description: "Dynamically register the Persistent Volumes"
-
- zitemprototypes:
- - discoveryrule_key: disc.pv
- name: "disc.pv.count.{#OSO_PV}"
- key: "disc.pv.count[{#OSO_PV}]"
- value_type: int
- description: "Number of PVs of this size"
- applications:
- - Openshift Master
-
- - discoveryrule_key: disc.pv
- name: "disc.pv.available.{#OSO_PV}"
- key: "disc.pv.available[{#OSO_PV}]"
- value_type: int
- description: "Number of PVs of this size that are available"
- applications:
- - Openshift Master
-
- ztriggers:
- - name: 'Openshift Master process not running on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
-
- - name: 'Too many Openshift Master processes running on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
-
- - name: 'Etcd ping failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
- priority: high
-
- - name: 'Number of users for Openshift Master on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: info
-
- - name: 'There are no projects running on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.project.count.last()}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: info
-
- # Put triggers that depend on other triggers here (deps must be created first)
- - name: 'Application creation has failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: avg
-
- - name: 'Application creation with build has failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.app.build.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.build.create.last(#2)}=1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: avg
-
- - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- description: The application create loop has failed 4 or more times in the last hour
- priority: avg
-
- - name: 'Application with build creation has failed multiple times in the last 2 hour on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.app.build.create.sum(2h)}>3'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- description: The application create with build loop has failed 4 or more times in the last 2 hours
- priority: avg
-
- - name: 'Openshift Master API health check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
-
- - name: 'Openshift Master Local API health check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
-
- - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
-
- - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
-
- - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: avg
-
- - name: 'SkyDNS port not listening on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
-
- - name: 'SkyDNS query failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master API health check is failing on {HOST.NAME}'
- priority: high
-
- - name: 'Hosts not ready according to {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
-
- zgraphs:
- - name: Openshift Master API Server Latency Pods LIST Quantiles
- width: 900
- height: 200
- graph_items:
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5
- color: red
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9
- color: blue
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99
- color: orange
-
- - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles
- width: 900
- height: 200
- graph_items:
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
- color: red
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
- color: blue
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
- color: orange
-
- - name: Openshift Master Scheduler End to End Latency Quantiles
- width: 900
- height: 200
- graph_items:
- - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
- color: red
- - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
- color: blue
- - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
- color: orange
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
deleted file mode 100644
index 9f84a2cdf..000000000
--- a/roles/os_zabbix/vars/template_openshift_node.yml
+++ /dev/null
@@ -1,70 +0,0 @@
----
-g_template_openshift_node:
- name: Template Openshift Node
- zitems:
- - key: openshift.node.process.count
- description: Shows number of OpenShift Node processes running
- value_type: int
- applications:
- - Openshift Node
-
- - key: openshift.node.ovs.pids.count
- description: Shows number of ovs process ids running
- value_type: int
- applications:
- - Openshift Node
-
- - key: openshift.node.ovs.ports.count
- description: Shows number of OVS ports defined
- value_type: int
- applications:
- - Openshift Node
-
- - key: openshift.node.ovs.stray.rules
- description: Number of OVS stray rules found/removed
- value_type: int
- applications:
- - Openshift Node
-
- - key: openshift.node.registry-pods.healthy_pct
- description: Shows the percentage of healthy registries in the cluster
- value_type: int
- applications:
- - Openshift Node
-
- - key: openshift.node.registry.service.ping
- description: Ping docker-registry service from node
- value_type: int
- applications:
- - Openshift Node
-
- ztriggers:
- - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100 and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
- priority: avg
-
- - name: 'Docker Registry service is unhealthy according to {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1 and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
- priority: avg
-
- - name: 'Openshift Node process not running on {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
- priority: high
-
- - name: 'Too many Openshift Node processes running on {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
- priority: high
-
- - name: '[Heal] OVS may not be running on {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
- priority: high
-
- - name: 'Number of OVS ports is 0 on {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
- priority: high
diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml
deleted file mode 100644
index a0a5a4d03..000000000
--- a/roles/os_zabbix/vars/template_ops_tools.yml
+++ /dev/null
@@ -1,54 +0,0 @@
----
-g_template_ops_tools:
- name: Template Operations Tools
- zdiscoveryrules:
- - name: disc.ops.runner
- key: disc.ops.runner
- lifetime: 1
- description: "Dynamically register operations runner items"
-
- zitemprototypes:
- - discoveryrule_key: disc.ops.runner
- name: "Exit code of ops-runner[{#OSO_COMMAND}]"
- key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]"
- value_type: int
- description: "The exit code of the command run from ops-runner"
- applications:
- - Ops Runner
-
- ztriggerprototypes:
- - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}'
- expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc'
- priority: average
-
- zactions:
- - name: 'Remote command for [Heal] triggers'
- status: enabled
- escalation_time: 60
- conditions_filter:
- calculation_type: "and/or"
- conditions:
- - conditiontype: maintenance status
- operator: not in
- - conditiontype: trigger name
- operator: like
- value: "[Heal]"
- - conditiontype: trigger value
- operator: "="
- value: PROBLEM
- operations:
- - esc_step_from: 1
- esc_step_to: 1
- esc_period: 0
- operationtype: remote command
- opcommand:
- command: 'ssh -i /etc/openshift_tools/scriptrunner_id_rsa {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }} remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\" --trigger-val \"{TRIGGER.VALUE}\"'
- execute_on: "zabbix server"
- type: 'custom script'
- target_hosts:
- - target_type: 'zabbix server'
- opconditions:
- - conditiontype: 'event acknowledged'
- operator: '='
- value: 'not acknowledged'
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
deleted file mode 100644
index c6e557f12..000000000
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ /dev/null
@@ -1,314 +0,0 @@
----
-g_template_os_linux:
- name: Template OS Linux
- zitems:
- - key: kernel.uname.sysname
- applications:
- - Kernel
- value_type: string
-
- - key: kernel.all.cpu.wait.total
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.all.cpu.irq.hard
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.all.cpu.idle
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.uname.distro
- applications:
- - Kernel
- value_type: string
-
- - key: kernel.uname.nodename
- applications:
- - Kernel
- value_type: string
-
- - key: kernel.all.cpu.irq.soft
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.all.load.15_minute
- applications:
- - Kernel
- value_type: float
-
- - key: kernel.all.cpu.sys
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.all.load.5_minute
- applications:
- - Kernel
- value_type: float
-
- - key: kernel.all.cpu.nice
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.all.load.1_minute
- applications:
- - Kernel
- value_type: float
-
- - key: kernel.uname.version
- applications:
- - Kernel
- value_type: string
-
- - key: kernel.all.uptime
- applications:
- - Kernel
- value_type: int
-
- - key: kernel.all.cpu.user
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.uname.machine
- applications:
- - Kernel
- value_type: string
-
- - key: hinv.ncpu
- applications:
- - Kernel
- value_type: int
-
- - key: kernel.all.cpu.steal
- applications:
- - Kernel
- value_type: float
- units: '%'
-
- - key: kernel.all.pswitch
- applications:
- - Kernel
- value_type: int
-
- - key: kernel.uname.release
- applications:
- - Kernel
- value_type: string
-
- - key: proc.nprocs
- applications:
- - Kernel
- value_type: int
-
- # Memory Items
- - key: mem.freemem
- applications:
- - Memory
- value_type: int
- description: "PCP: free system memory metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: mem.util.bufmem
- applications:
- - Memory
- value_type: int
- description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: swap.used
- applications:
- - Memory
- value_type: int
- description: "PCP: swap used metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: swap.length
- applications:
- - Memory
- value_type: int
- description: "PCP: total swap available metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: mem.physmem
- applications:
- - Memory
- value_type: int
- description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
- multiplier: 1024
- units: B
-
- - key: swap.free
- applications:
- - Memory
- value_type: int
- description: "PCP: swap free metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: mem.util.available
- applications:
- - Memory
- value_type: int
- description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: mem.util.used
- applications:
- - Memory
- value_type: int
- description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- - key: mem.util.cached
- applications:
- - Memory
- value_type: int
- description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
- multiplier: 1024
- units: B
-
- zdiscoveryrules:
- - name: disc.filesys
- key: disc.filesys
- lifetime: 1
- description: "Dynamically register the filesystems"
-
- - name: disc.disk
- key: disc.disk
- lifetime: 1
- description: "Dynamically register disks on a node"
-
- - name: disc.network
- key: disc.network
- lifetime: 1
- description: "Dynamically register network interfaces on a node"
-
- zitemprototypes:
- - discoveryrule_key: disc.filesys
- name: "disc.filesys.full.{#OSO_FILESYS}"
- key: "disc.filesys.full[{#OSO_FILESYS}]"
- value_type: float
- description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
- applications:
- - Disk
-
- - discoveryrule_key: disc.filesys
- name: "Percentage of used inodes on {#OSO_FILESYS}"
- key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
- value_type: float
- description: "PCP derived value of percentage of used inodes on a filesystem."
- applications:
- - Disk
-
- - discoveryrule_key: disc.disk
- name: "TPS (IOPS) for disk {#OSO_DISK}"
- key: "disc.disk.tps[{#OSO_DISK}]"
- value_type: int
- description: "PCP disk.dev.totals metric measured over a period of time. This shows how many disk transactions per second the disk is using"
- applications:
- - Disk
-
- - discoveryrule_key: disc.disk
- name: "Percent Utilized for disk {#OSO_DISK}"
- key: "disc.disk.putil[{#OSO_DISK}]"
- value_type: float
- description: "PCP disk.dev.avactive metric measured over a period of time. This is the '%util' in the iostat command"
- applications:
- - Disk
-
- - discoveryrule_key: disc.network
- name: "Bytes per second IN on network interface {#OSO_NET_INTERFACE}"
- key: "disc.network.in.bytes[{#OSO_NET_INTERFACE}]"
- value_type: int
- units: B
- delta: 1
- description: "PCP network.interface.in.bytes metric. This is set up as a delta in Zabbix to measure the speed per second"
- applications:
- - Network
-
- - discoveryrule_key: disc.network
- name: "Bytes per second OUT on network interface {#OSO_NET_INTERFACE}"
- key: "disc.network.out.bytes[{#OSO_NET_INTERFACE}]"
- value_type: int
- units: B
- delta: 1
- description: "PCP network.interface.out.bytes metric. This is set up as a delta in Zabbix to measure the speed per second"
- applications:
- - Network
-
- ztriggerprototypes:
- - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
- expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
- priority: high
-
- # This has a dependency on the previous trigger
- # Trigger prototype dependencies do not work in Zabbix 2.4. They will work in Zabbix 3.0
- - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
- expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
- priority: warn
- dependencies:
- - 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
-
- - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
- expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
- priority: high
-
- # This has a dependency on the previous trigger
- # Trigger prototype dependencies do not work in Zabbix 2.4. They will work in Zabbix 3.0
- - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
- expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
- priority: warn
- dependencies:
- - 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
-
- ztriggers:
- - name: 'Too many TOTAL processes on {HOST.NAME}'
- expression: '{Template OS Linux:proc.nprocs.last()}>5000'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
- priority: warn
-
- - name: 'Lack of available memory on {HOST.NAME}'
- expression: '{Template OS Linux:mem.freemem.last()}<30720000'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
- priority: warn
- description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
-
- # CPU Utilization #
- - name: 'CPU idle less than 5% on {HOST.NAME}'
- expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
- priority: average
- description: 'CPU is less than 5% idle'
-
- - name: 'CPU idle less than 10% on {HOST.NAME}'
- expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
- priority: average
- description: 'CPU is less than 10% idle'
- dependencies:
- - 'CPU idle less than 5% on {HOST.NAME}'
diff --git a/roles/os_zabbix/vars/template_performance_copilot.yml b/roles/os_zabbix/vars/template_performance_copilot.yml
deleted file mode 100644
index b62fa0228..000000000
--- a/roles/os_zabbix/vars/template_performance_copilot.yml
+++ /dev/null
@@ -1,14 +0,0 @@
----
-g_template_performance_copilot:
- name: Template Performance Copilot
- zitems:
- - key: pcp.ping
- applications:
- - Performance Copilot
- value_type: int
-
- ztriggers:
- - name: 'pcp.ping failed on {HOST.NAME}'
- expression: '{Template Performance Copilot:pcp.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_pcp_ping.asciidoc'
- priority: average
diff --git a/roles/os_zabbix/vars/template_zagg_server.yml b/roles/os_zabbix/vars/template_zagg_server.yml
deleted file mode 100644
index db5665993..000000000
--- a/roles/os_zabbix/vars/template_zagg_server.yml
+++ /dev/null
@@ -1,46 +0,0 @@
----
-g_template_zagg_server:
- name: Template Zagg Server
- zitems:
- - key: zagg.server.metrics.count
- applications:
- - Zagg Server
- value_type: int
-
- - key: zagg.server.metrics.errors
- applications:
- - Zagg Server
- value_type: int
-
- - key: zagg.server.heartbeat.errors
- applications:
- - Zagg Server
- value_type: int
-
- - key: zagg.server.heartbeat.count
- applications:
- - Zagg Server
- value_type: int
-
- ztriggers:
- - name: 'Error processing metrics on {HOST.NAME}'
- expression: '{Template Zagg Server:zagg.server.metrics.errors.min(#3)}>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
- priority: average
-
- - name: 'Error processing heartbeats on {HOST.NAME}'
- expression: '{Template Zagg Server:zagg.server.heartbeat.errors.min(#3)}>0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
- priority: average
-
- - name: 'Critically High number of metrics in Zagg queue {HOST.NAME}'
- expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>10000'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
- priority: high
-
- - name: 'High number of metrics in Zagg queue {HOST.NAME}'
- expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>5000'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
- dependencies:
- - 'Critically High number of metrics in Zagg queue {HOST.NAME}'
- priority: average