summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r--roles/os_zabbix/README.md40
-rw-r--r--roles/os_zabbix/defaults/main.yml1
-rw-r--r--roles/os_zabbix/handlers/main.yml1
-rwxr-xr-xroles/os_zabbix/library/zbxapi.py370
-rw-r--r--roles/os_zabbix/meta/main.yml9
-rw-r--r--roles/os_zabbix/tasks/main.yml81
-rw-r--r--roles/os_zabbix/vars/main.yml1
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_agent.yml23
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_server.yml412
-rw-r--r--roles/os_zabbix/vars/template_docker.yml94
-rw-r--r--roles/os_zabbix/vars/template_heartbeat.yml13
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml58
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml44
-rw-r--r--roles/os_zabbix/vars/template_ops_tools.yml23
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml260
15 files changed, 1060 insertions, 370 deletions
diff --git a/roles/os_zabbix/README.md b/roles/os_zabbix/README.md
new file mode 100644
index 000000000..ac3dc2833
--- /dev/null
+++ b/roles/os_zabbix/README.md
@@ -0,0 +1,40 @@
+os_zabbix
+=========
+
+Automate zabbix tasks.
+
+Requirements
+------------
+
+This requires the openshift_tools rpm be installed for the zbxapi.py library. It can be found here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now.
+
+Role Variables
+--------------
+
+zab_server
+zab_username
+zab_password
+
+Dependencies
+------------
+
+This depeonds on the zbxapi.py library located here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now.
+
+Example Playbook
+----------------
+
+ - zbx_host:
+ server: zab_server
+ user: zab_user
+ password: zab_password
+ name: 'myhost'
+
+License
+-------
+
+ASL 2.0
+
+Author Information
+------------------
+
+OpenShift operations, Red Hat, Inc
diff --git a/roles/os_zabbix/defaults/main.yml b/roles/os_zabbix/defaults/main.yml
new file mode 100644
index 000000000..ed97d539c
--- /dev/null
+++ b/roles/os_zabbix/defaults/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/os_zabbix/handlers/main.yml b/roles/os_zabbix/handlers/main.yml
new file mode 100644
index 000000000..ed97d539c
--- /dev/null
+++ b/roles/os_zabbix/handlers/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/os_zabbix/library/zbxapi.py b/roles/os_zabbix/library/zbxapi.py
deleted file mode 100755
index b5fa5ee2b..000000000
--- a/roles/os_zabbix/library/zbxapi.py
+++ /dev/null
@@ -1,370 +0,0 @@
-#!/usr/bin/env python
-# vim: expandtab:tabstop=4:shiftwidth=4
-'''
- ZabbixAPI ansible module
-'''
-
-# Copyright 2015 Red Hat Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Purpose: An ansible module to communicate with zabbix.
-#
-
-# pylint: disable=line-too-long
-# Disabling line length for readability
-
-import json
-import httplib2
-import sys
-import os
-import re
-import copy
-
-class ZabbixAPIError(Exception):
- '''
- ZabbixAPIError
- Exists to propagate errors up from the api
- '''
- pass
-
-class ZabbixAPI(object):
- '''
- ZabbixAPI class
- '''
- classes = {
- 'Action': ['create', 'delete', 'get', 'update'],
- 'Alert': ['get'],
- 'Application': ['create', 'delete', 'get', 'massadd', 'update'],
- 'Configuration': ['export', 'import'],
- 'Dcheck': ['get'],
- 'Dhost': ['get'],
- 'Drule': ['copy', 'create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Dservice': ['get'],
- 'Event': ['acknowledge', 'get'],
- 'Graph': ['create', 'delete', 'get', 'update'],
- 'Graphitem': ['get'],
- 'Graphprototype': ['create', 'delete', 'get', 'update'],
- 'History': ['get'],
- 'Hostgroup': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massremove', 'massupdate', 'update'],
- 'Hostinterface': ['create', 'delete', 'get', 'massadd', 'massremove', 'replacehostinterfaces', 'update'],
- 'Host': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massremove', 'massupdate', 'update'],
- 'Hostprototype': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Httptest': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Iconmap': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Image': ['create', 'delete', 'get', 'update'],
- 'Item': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Itemprototype': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Maintenance': ['create', 'delete', 'get', 'update'],
- 'Map': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Mediatype': ['create', 'delete', 'get', 'update'],
- 'Proxy': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Screen': ['create', 'delete', 'get', 'update'],
- 'Screenitem': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update', 'updatebyposition'],
- 'Script': ['create', 'delete', 'execute', 'get', 'getscriptsbyhosts', 'update'],
- 'Service': ['adddependencies', 'addtimes', 'create', 'delete', 'deletedependencies', 'deletetimes', 'get', 'getsla', 'isreadable', 'iswritable', 'update'],
- 'Template': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massremove', 'massupdate', 'update'],
- 'Templatescreen': ['copy', 'create', 'delete', 'get', 'isreadable', 'iswritable', 'update'],
- 'Templatescreenitem': ['get'],
- 'Trigger': ['adddependencies', 'create', 'delete', 'deletedependencies', 'get', 'isreadable', 'iswritable', 'update'],
- 'Triggerprototype': ['create', 'delete', 'get', 'update'],
- 'User': ['addmedia', 'create', 'delete', 'deletemedia', 'get', 'isreadable', 'iswritable', 'login', 'logout', 'update', 'updatemedia', 'updateprofile'],
- 'Usergroup': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massupdate', 'update'],
- 'Usermacro': ['create', 'createglobal', 'delete', 'deleteglobal', 'get', 'update', 'updateglobal'],
- 'Usermedia': ['get'],
- }
-
- def __init__(self, data=None):
- if not data:
- data = {}
- self.server = data.get('server', None)
- self.username = data.get('user', None)
- self.password = data.get('password', None)
- if any([value == None for value in [self.server, self.username, self.password]]):
- print 'Please specify zabbix server url, username, and password.'
- sys.exit(1)
-
- self.verbose = data.get('verbose', False)
- self.use_ssl = data.has_key('use_ssl')
- self.auth = None
-
- for cname, _ in self.classes.items():
- setattr(self, cname.lower(), getattr(self, cname)(self))
-
- # pylint: disable=no-member
- # This method does not exist until the metaprogramming executed
- # This is permanently disabled.
- results = self.user.login(user=self.username, password=self.password)
-
- if results[0]['status'] == '200':
- if results[1].has_key('result'):
- self.auth = results[1]['result']
- elif results[1].has_key('error'):
- print "Unable to authenticate with zabbix server. {0} ".format(results[1]['error'])
- sys.exit(1)
- else:
- print "Error in call to zabbix. Http status: {0}.".format(results[0]['status'])
- sys.exit(1)
-
- def perform(self, method, rpc_params):
- '''
- This method calls your zabbix server.
-
- It requires the following parameters in order for a proper request to be processed:
- jsonrpc - the version of the JSON-RPC protocol used by the API;
- the Zabbix API implements JSON-RPC version 2.0;
- method - the API method being called;
- rpc_params - parameters that will be passed to the API method;
- id - an arbitrary identifier of the request;
- auth - a user authentication token; since we don't have one yet, it's set to null.
- '''
- http_method = "POST"
- jsonrpc = "2.0"
- rid = 1
-
- http = None
- if self.use_ssl:
- http = httplib2.Http()
- else:
- http = httplib2.Http(disable_ssl_certificate_validation=True,)
-
- headers = {}
- headers["Content-type"] = "application/json"
-
- body = {
- "jsonrpc": jsonrpc,
- "method": method,
- "params": rpc_params.get('params', {}),
- "id": rid,
- 'auth': self.auth,
- }
-
- if method in ['user.login', 'api.version']:
- del body['auth']
-
- body = json.dumps(body)
-
- if self.verbose:
- print body
- print method
- print headers
- httplib2.debuglevel = 1
-
- response, content = http.request(self.server, http_method, body, headers)
-
- if response['status'] not in ['200', '201']:
- raise ZabbixAPIError('Error calling zabbix. Zabbix returned %s' % response['status'])
-
- if self.verbose:
- print response
- print content
-
- try:
- content = json.loads(content)
- except ValueError as err:
- content = {"error": err.message}
-
- return response, content
-
- @staticmethod
- def meta(cname, method_names):
- '''
- This bit of metaprogramming is where the ZabbixAPI subclasses are created.
- For each of ZabbixAPI.classes we create a class from the key and methods
- from the ZabbixAPI.classes values. We pass a reference to ZabbixAPI class
- to each subclass in order for each to be able to call the perform method.
- '''
- def meta_method(_class, method_name):
- '''
- This meta method allows a class to add methods to it.
- '''
- # This template method is a stub method for each of the subclass
- # methods.
- def template_method(self, params=None, **rpc_params):
- '''
- This template method is a stub method for each of the subclass methods.
- '''
- if params:
- rpc_params['params'] = params
- else:
- rpc_params['params'] = copy.deepcopy(rpc_params)
-
- return self.parent.perform(cname.lower()+"."+method_name, rpc_params)
-
- template_method.__doc__ = \
- "https://www.zabbix.com/documentation/2.4/manual/api/reference/%s/%s" % \
- (cname.lower(), method_name)
- template_method.__name__ = method_name
- # this is where the template method is placed inside of the subclass
- # e.g. setattr(User, "create", stub_method)
- setattr(_class, template_method.__name__, template_method)
-
- # This class call instantiates a subclass. e.g. User
- _class = type(cname,
- (object,),
- {'__doc__': \
- "https://www.zabbix.com/documentation/2.4/manual/api/reference/%s" % cname.lower()})
- def __init__(self, parent):
- '''
- This init method gets placed inside of the _class
- to allow it to be instantiated. A reference to the parent class(ZabbixAPI)
- is passed in to allow each class access to the perform method.
- '''
- self.parent = parent
-
- # This attaches the init to the subclass. e.g. Create
- setattr(_class, __init__.__name__, __init__)
- # For each of our ZabbixAPI.classes dict values
- # Create a method and attach it to our subclass.
- # e.g. 'User': ['delete', 'get', 'updatemedia', 'updateprofile',
- # 'update', 'iswritable', 'logout', 'addmedia', 'create',
- # 'login', 'deletemedia', 'isreadable'],
- # User.delete
- # User.get
- for method_name in method_names:
- meta_method(_class, method_name)
- # Return our subclass with all methods attached
- return _class
-
-# Attach all ZabbixAPI.classes to ZabbixAPI class through metaprogramming
-for _class_name, _method_names in ZabbixAPI.classes.items():
- setattr(ZabbixAPI, _class_name, ZabbixAPI.meta(_class_name, _method_names))
-
-def exists(content, key='result'):
- ''' Check if key exists in content or the size of content[key] > 0
- '''
- if not content.has_key(key):
- return False
-
- if not content[key]:
- return False
-
- return True
-
-def diff_content(from_zabbix, from_user):
- ''' Compare passed in object to results returned from zabbix
- '''
- terms = ['search', 'output', 'groups', 'select', 'expand']
- regex = '(' + '|'.join(terms) + ')'
- retval = {}
- for key, value in from_user.items():
- if re.findall(regex, key):
- continue
-
- if from_zabbix[key] != str(value):
- retval[key] = str(value)
-
- return retval
-
-def main():
- '''
- This main method runs the ZabbixAPI Ansible Module
- '''
-
- module = AnsibleModule(
- argument_spec=dict(
- server=dict(default='https://localhost/zabbix/api_jsonrpc.php', type='str'),
- user=dict(default=None, type='str'),
- password=dict(default=None, type='str'),
- zbx_class=dict(choices=ZabbixAPI.classes.keys()),
- params=dict(),
- debug=dict(default=False, type='bool'),
- state=dict(default='present', type='str'),
- ),
- #supports_check_mode=True
- )
-
- user = module.params.get('user', None)
- if not user:
- user = os.environ['ZABBIX_USER']
-
- passwd = module.params.get('password', None)
- if not passwd:
- passwd = os.environ['ZABBIX_PASSWORD']
-
-
-
- api_data = {
- 'user': user,
- 'password': passwd,
- 'server': module.params['server'],
- 'verbose': module.params['debug']
- }
-
- if not user or not passwd or not module.params['server']:
- module.fail_json(msg='Please specify the user, password, and the zabbix server.')
-
- zapi = ZabbixAPI(api_data)
-
- zbx_class = module.params.get('zbx_class')
- rpc_params = module.params.get('params', {})
- state = module.params.get('state')
-
- # Get the instance we are trying to call
- zbx_class_inst = zapi.__getattribute__(zbx_class.lower())
-
- # perform get
- # Get the instance's method we are trying to call
-
- zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['get']
- _, content = zbx_action_method(zbx_class_inst, rpc_params)
-
- if state == 'list':
- module.exit_json(changed=False, results=content['result'], state="list")
-
- if state == 'absent':
- if not exists(content):
- module.exit_json(changed=False, state="absent")
- # If we are coming from a query, we need to pass in the correct rpc_params for delete.
- # specifically the zabbix class name + 'id'
- # if rpc_params is a list then we need to pass it. (list of ids to delete)
- idname = zbx_class.lower() + "id"
- if not isinstance(rpc_params, list) and content['result'][0].has_key(idname):
- rpc_params = [content['result'][0][idname]]
-
- zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['delete']
- _, content = zbx_action_method(zbx_class_inst, rpc_params)
- module.exit_json(changed=True, results=content['result'], state="absent")
-
- if state == 'present':
- # It's not there, create it!
- if not exists(content):
- zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['create']
- _, content = zbx_action_method(zbx_class_inst, rpc_params)
- module.exit_json(changed=True, results=content['result'], state='present')
-
- # It's there and the same, do nothing!
- diff_params = diff_content(content['result'][0], rpc_params)
- if not diff_params:
- module.exit_json(changed=False, results=content['result'], state="present")
-
- # Add the id to update with
- idname = zbx_class.lower() + "id"
- diff_params[idname] = content['result'][0][idname]
-
-
- ## It's there and not the same, update it!
- zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['update']
- _, content = zbx_action_method(zbx_class_inst, diff_params)
- module.exit_json(changed=True, results=content, state="present")
-
- module.exit_json(failed=True,
- changed=False,
- results='Unknown state passed. %s' % state,
- state="unknown")
-
-# pylint: disable=redefined-builtin, unused-wildcard-import, wildcard-import, locally-disabled
-# import module snippets. This are required
-from ansible.module_utils.basic import *
-
-main()
diff --git a/roles/os_zabbix/meta/main.yml b/roles/os_zabbix/meta/main.yml
new file mode 100644
index 000000000..360f5aad2
--- /dev/null
+++ b/roles/os_zabbix/meta/main.yml
@@ -0,0 +1,9 @@
+---
+galaxy_info:
+ author: OpenShift
+ description: ZabbixAPI
+ company: Red Hat, Inc
+ license: ASL 2.0
+ min_ansible_version: 1.2
+dependencies:
+- lib_zabbix
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
new file mode 100644
index 000000000..a503b24d7
--- /dev/null
+++ b/roles/os_zabbix/tasks/main.yml
@@ -0,0 +1,81 @@
+---
+- name: Main List all templates
+ zbx_template:
+ zbx_server: "{{ ozb_server }}"
+ zbx_user: "{{ ozb_user }}"
+ zbx_password: "{{ ozb_password }}"
+ state: list
+ register: templates
+
+- include_vars: template_heartbeat.yml
+- include_vars: template_os_linux.yml
+- include_vars: template_docker.yml
+- include_vars: template_openshift_master.yml
+- include_vars: template_openshift_node.yml
+- include_vars: template_ops_tools.yml
+- include_vars: template_app_zabbix_server.yml
+- include_vars: template_app_zabbix_agent.yml
+
+- name: Include Template Heartbeat
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_heartbeat }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template os_linux
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_os_linux }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template docker
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_docker }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template Openshift Master
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_openshift_master }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template Openshift Node
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_openshift_node }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template Ops Tools
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_ops_tools }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template App Zabbix Server
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_app_zabbix_server }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template App Zabbix Agent
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_app_zabbix_agent }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
diff --git a/roles/os_zabbix/vars/main.yml b/roles/os_zabbix/vars/main.yml
new file mode 100644
index 000000000..ed97d539c
--- /dev/null
+++ b/roles/os_zabbix/vars/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
new file mode 100644
index 000000000..d636d4822
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
@@ -0,0 +1,23 @@
+---
+g_template_app_zabbix_agent:
+ name: Template App Zabbix Agent
+ zitems:
+ - key: agent.hostname
+ applications:
+ - Zabbix agent
+ value_type: character
+ zabbix_type: agent
+
+ - key: agent.ping
+ applications:
+ - Zabbix agent
+ description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check.
+ value_type: int
+ zabbix_type: agent
+
+ ztriggers:
+ - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes'
+ description: Zabbix agent is unreachable for 15 minutes.
+ expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1'
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc
diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml
new file mode 100644
index 000000000..43517113b
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml
@@ -0,0 +1,412 @@
+---
+g_template_app_zabbix_server:
+ name: Template App Zabbix Server
+ zitems:
+ - key: housekeeper_creates
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition creates output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_drops
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition drops output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_errors
+ applications:
+ - Zabbix server
+ description: A simple count of the number of errors output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_total
+ applications:
+ - Zabbix server
+ description: A simple count of the total number of lines output by the housekeeper
+ script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: zabbix[process,alerter,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,configuration syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,db watchdog,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,discoverer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,escalator,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,history syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,housekeeper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,http poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,icmp pinger,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,ipmi poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,java poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,node watcher,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,proxy poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,self-monitoring,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,snmp trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,timer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,unreachable poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[queue,10m]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: internal
+ interval: 600
+
+ - key: zabbix[queue]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: internal
+ interval: 600
+
+ - key: zabbix[rcache,buffer,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,history,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,text,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,trend,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,values]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+ delta: 1 # speed per second
+
+ ztriggers:
+ - description: "There has been unexpected output while running the housekeeping script\
+ \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\
+ \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\
+ \ for more details."
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}'
+ name: Unexpected output in Zabbix DB Housekeeping
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc
+
+ - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details.
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0'
+ name: Errors during Zabbix DB Housekeeping
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75'
+ name: Zabbix alerter processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75'
+ name: Zabbix configuration syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75'
+ name: Zabbix db watchdog processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75'
+ name: Zabbix discoverer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75'
+ name: Zabbix escalator processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75'
+ name: Zabbix history syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75'
+ name: Zabbix housekeeper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75'
+ name: Zabbix http poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75'
+ name: Zabbix icmp pinger processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75'
+ name: Zabbix ipmi poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75'
+ name: Zabbix java poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75'
+ name: Zabbix node watcher processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75'
+ name: Zabbix poller processes more than 75% busy
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75'
+ name: Zabbix proxy poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75'
+ name: Zabbix self-monitoring processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75'
+ name: Zabbix snmp trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: Timer processes usually are busy because they have to process time
+ based trigger functions
+ expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75'
+ name: Zabbix timer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75'
+ name: Zabbix trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75'
+ name: Zabbix unreachable poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: "This alert generally indicates a performance problem or a problem\
+ \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\
+ \ is Administration > Queue. Be sure to check the general view and the per-proxy\
+ \ view."
+ expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000'
+ name: More than 1000 items having missing data for more than 10 minutes
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc
+
+ - description: Consider increasing CacheSize in the zabbix_server.conf configuration
+ file
+ expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5'
+ name: Less than 5% free in the configuration cache
+ priority: info
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25'
+ name: Less than 25% free in the history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25'
+ name: Less than 25% free in the text history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25'
+ name: Less than 25% free in the trends cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
new file mode 100644
index 000000000..bfabf50c5
--- /dev/null
+++ b/roles/os_zabbix/vars/template_docker.yml
@@ -0,0 +1,94 @@
+---
+g_template_docker:
+ name: Template Docker
+ zitems:
+ - key: docker.ping
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.info_elapsed_ms
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.storage.is_loopback
+ applications:
+ - Docker Storage
+ value_type: int
+
+ - key: docker.storage.data.space.total
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.used
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.percent_available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.total
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.used
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.percent_available
+ applications:
+ - Docker Storage
+ value_type: float
+ ztriggers:
+ - name: 'docker.ping failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
+ priority: high
+
+ - name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
+ priority: high
+
+ - name: 'Critically low docker storage data space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ priority: high
+
+ - name: 'Critically low docker storage metadata space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ priority: high
+
+ # Put triggers that depend on other triggers here (deps must be created first)
+ - name: 'Low docker storage data space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ dependencies:
+ - 'Critically low docker storage data space on {HOST.NAME}'
+ priority: average
+
+ - name: 'Low docker storage metadata space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ dependencies:
+ - 'Critically low docker storage metadata space on {HOST.NAME}'
+ priority: average
+
diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml
new file mode 100644
index 000000000..8dbe0d0d6
--- /dev/null
+++ b/roles/os_zabbix/vars/template_heartbeat.yml
@@ -0,0 +1,13 @@
+---
+g_template_heartbeat:
+ name: Template Heartbeat
+ zitems:
+ - name: Heartbeat Ping
+ applications:
+ - Heartbeat
+ key: heartbeat.ping
+ ztriggers:
+ - name: 'Heartbeat.ping has failed on {HOST.NAME}'
+ expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1'
+ priority: avg
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
new file mode 100644
index 000000000..1de4fefbb
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -0,0 +1,58 @@
+---
+g_template_openshift_master:
+ name: Template Openshift Master
+ zitems:
+ - name: create_app
+ applications:
+ - Openshift Master
+ key: create_app
+
+ - key: openshift.master.process.count
+ description: Shows number of master processes running
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.user.count
+ description: Shows number of users in a cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pod.running.count
+ description: Shows number of pods running
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.project.counter
+ description: Shows number of projects on a cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ ztriggers:
+ - name: 'Application creation has failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ priority: avg
+
+ - name: 'Openshift Master process not running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Too many Openshift Master processes running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Number of users for Openshift Master on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
+
+ - name: 'There are no projects running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.project.counter.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
new file mode 100644
index 000000000..ce28b1048
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -0,0 +1,44 @@
+---
+g_template_openshift_node:
+ name: Template Openshift Node
+ zitems:
+ - key: openshift.node.process.count
+ description: Shows number of OpenShift Node processes running
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.pids.count
+ description: Shows number of ovs process ids running
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.ports.count
+ description: Shows number of OVS ports defined
+ type: int
+ applications:
+ - Openshift Node
+
+ ztriggers:
+ - name: 'Openshift Node process not running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Too many Openshift Node processes running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'OVS may not be running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Number of OVS ports is 0 on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+
diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml
new file mode 100644
index 000000000..d1b8a2514
--- /dev/null
+++ b/roles/os_zabbix/vars/template_ops_tools.yml
@@ -0,0 +1,23 @@
+---
+g_template_ops_tools:
+ name: Template Operations Tools
+ zdiscoveryrules:
+ - name: disc.ops.runner
+ key: disc.ops.runner
+ lifetime: 1
+ description: "Dynamically register operations runner items"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.ops.runner
+ name: "Exit code of ops-runner[{#OSO_COMMAND}]"
+ key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]"
+ value_type: int
+ description: "The exit code of the command run from ops-runner"
+ applications:
+ - Ops Runner
+
+ ztriggerprototypes:
+ - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}'
+ expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
new file mode 100644
index 000000000..3ae1500bc
--- /dev/null
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -0,0 +1,260 @@
+---
+g_template_os_linux:
+ name: Template OS Linux
+ zitems:
+ - key: kernel.uname.sysname
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.all.cpu.wait.total
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.cpu.irq.hard
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.cpu.idle
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.uname.distro
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.uname.nodename
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.all.cpu.irq.soft
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.load.15_minute
+ applications:
+ - Kernel
+ value_type: float
+
+ - key: kernel.all.cpu.sys
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.load.5_minute
+ applications:
+ - Kernel
+ value_type: float
+
+ - key: kernel.all.cpu.nice
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.load.1_minute
+ applications:
+ - Kernel
+ value_type: float
+
+ - key: kernel.uname.version
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.all.uptime
+ applications:
+ - Kernel
+ value_type: int
+
+ - key: kernel.all.cpu.user
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.uname.machine
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: hinv.ncpu
+ applications:
+ - Kernel
+ value_type: int
+
+ - key: kernel.all.cpu.steal
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.pswitch
+ applications:
+ - Kernel
+ value_type: int
+
+ - key: kernel.uname.release
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: proc.nprocs
+ applications:
+ - Kernel
+ value_type: int
+
+ # Memory Items
+ - key: mem.freemem
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: free system memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.bufmem
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: swap.used
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: swap used metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: swap.length
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: total swap available metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.physmem
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
+ multiplier: 1024
+ units: B
+
+ - key: swap.free
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: swap free metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.available
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.used
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.cached
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ zdiscoveryrules:
+ - name: disc.filesys
+ key: disc.filesys
+ lifetime: 1
+ description: "Dynamically register the filesystems"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.filesys
+ name: "disc.filesys.full.{#OSO_FILESYS}"
+ key: "disc.filesys.full[{#OSO_FILESYS}]"
+ value_type: float
+ description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
+ applications:
+ - Disk
+
+ - discoveryrule_key: disc.filesys
+ name: "Percentage of used inodes on {#OSO_FILESYS}"
+ key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
+ value_type: float
+ description: "PCP derived value of percentage of used inodes on a filesystem."
+ applications:
+ - Disk
+
+ ztriggerprototypes:
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
+ ztriggers:
+ - name: 'Too many TOTAL processes on {HOST.NAME}'
+ expression: '{Template OS Linux:proc.nprocs.last()}>5000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
+ priority: warn
+
+ - name: 'Lack of available memory on {HOST.NAME}'
+ expression: '{Template OS Linux:mem.freemem.last()}<30720000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
+ priority: warn
+ description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
+
+ # CPU Utilization #
+ - name: 'CPU idle less than 5% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: average
+ description: 'CPU is less than 5% idle'
+
+ - name: 'CPU idle less than 10% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: warn
+ description: 'CPU is less than 10% idle'
+ dependencies:
+ - 'CPU idle less than 5% on {HOST.NAME}'